kokorog2p 0.3.2__tar.gz → 0.3.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/PKG-INFO +1 -1
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/_version.py +3 -3
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/base.py +2 -2
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/markdown.py +4 -3
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/mixed_language_g2p.py +16 -3
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/pipeline/tokenizer.py +7 -2
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/token.py +15 -1
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p.egg-info/PKG-INFO +1 -1
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p.egg-info/SOURCES.txt +1 -0
- kokorog2p-0.3.3/tests/test_base.py +33 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_markdown.py +20 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_mixed_language_g2p.py +21 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_token.py +3 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_tokenizer.py +12 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/.coveragerc +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/.github/pytest.ini +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/.github/workflows/codecov.yml +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/.github/workflows/pre-commit.yml +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/.github/workflows/python-publish.yml +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/.github/workflows/tests.yml +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/.gitignore +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/.pre-commit-config.yaml +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/.prettierrc.yml +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/.readthedocs.yaml +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/.ruff.toml +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/LICENSE +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/README.md +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/BENCHMARK_SUMMARY.md +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/CHINESE_DATASET.md +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/FRENCH_DATASET.md +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/GERMAN_DATASET.md +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/ITALIAN_DATASET.md +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/JAPANESE_DATASET.md +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/KOREAN_DATASET.md +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/PORTUGUESE_DATASET.md +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/SPANISH_DATASET.md +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/__init__.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/add_zh_metadata.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/all_languages_benchmark_results.json +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/benchmark_comparison.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/benchmark_cs_g2p.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/benchmark_de_comparison.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/benchmark_de_g2p.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/benchmark_en_gb_comparison.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/benchmark_en_synthetic.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/benchmark_en_us_comparison.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/benchmark_es_comparison.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/benchmark_fr_comparison.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/benchmark_fr_g2p.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/benchmark_g2p.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/benchmark_it_comparison.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/benchmark_ja_comparison.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/benchmark_ja_g2p.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/benchmark_ko_comparison.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/benchmark_load_silver.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/benchmark_misaki_comparison.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/benchmark_pt_br_comparison.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/benchmark_zh_comparison.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/data/README.md +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/data/__init__.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/data/de_synthetic.json +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/data/en_gb_synthetic.json +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/data/en_us_synthetic.json +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/data/es_synthetic.json +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/data/fr_synthetic.json +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/data/it_synthetic.json +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/data/ja_synthetic.json +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/data/ko_synthetic.json +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/data/pt_br_synthetic.json +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/data/zh_synthetic.json +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/data/zh_synthetic_handcrafted.json +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/download_childes.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/extract_childes_fast.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/extract_childes_sentences.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/generate_phonemes.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/generate_zh_synthetic.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/random_sentence_generator.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/regenerate_phonemes.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/validate_synthetic_data.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/README_DOCS.md +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/abbreviation_customization.rst +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/advanced.rst +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/api/backends.rst +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/api/chinese.rst +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/api/core.rst +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/api/czech.rst +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/api/english.rst +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/api/french.rst +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/api/german.rst +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/api/hebrew.rst +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/api/italian.rst +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/api/japanese.rst +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/api/korean.rst +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/api/mixed.rst +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/api/portuguese.rst +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/api/spanish.rst +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/api/utils.rst +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/changelog.rst +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/conf.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/contributing.rst +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/index.rst +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/installation.rst +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/languages.rst +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/make.bat +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/make.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/phonemes.rst +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/quickstart.rst +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/requirements.txt +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/examples/abbreviation_customization.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/examples/debug_mode_demo.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/examples/demo_both_features.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/examples/quick_demo_dr_to_drive.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/__init__.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/backends/__init__.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/backends/espeak/__init__.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/backends/espeak/api.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/backends/espeak/backend.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/backends/espeak/voice.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/backends/espeak/wrapper.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/backends/goruut/__init__.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/backends/goruut/backend.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/cs/__init__.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/cs/abbreviations.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/cs/data/__init__.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/cs/fallback.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/cs/g2p.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/cs/normalizer.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/data/__init__.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/data/kokoro_config.json +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/data/kokoro_config_v1.1_zh.json +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/de/__init__.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/de/abbreviations.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/de/data/__init__.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/de/data/de_gold.json +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/de/fallback.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/de/g2p.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/de/lexicon.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/de/normalizer.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/de/numbers.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/en/__init__.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/en/abbreviations.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/en/data/__init__.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/en/data/gb_gold.json +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/en/data/gb_silver.json +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/en/data/us_gold.json +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/en/data/us_silver.json +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/en/fallback.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/en/g2p.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/en/lexicon.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/en/normalizer.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/en/numbers.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/es/__init__.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/es/abbreviations.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/es/data/__init__.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/es/g2p.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/es/normalizer.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/espeak_g2p.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/fr/__init__.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/fr/abbreviations.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/fr/data/__init__.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/fr/data/fr_gold.json +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/fr/fallback.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/fr/g2p.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/fr/lexicon.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/fr/normalizer.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/fr/numbers.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/goruut_g2p.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/he/__init__.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/he/g2p.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/it/__init__.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/it/abbreviations.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/it/data/__init__.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/it/g2p.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/it/normalizer.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/ja/__init__.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/ja/cutlet.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/ja/data/__init__.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/ja/data/ja_words.txt +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/ja/g2p.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/ja/num2kana.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/ko/README.md +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/ko/__init__.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/ko/data/__init__.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/ko/data/idioms.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/ko/data/rules.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/ko/data/table.csv +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/ko/english.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/ko/g2p.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/ko/g2pk.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/ko/jamo_to_ipa.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/ko/numerals.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/ko/regular.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/ko/special.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/ko/utils.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/phonemes.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/pipeline/__init__.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/pipeline/abbreviations.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/pipeline/models.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/pipeline/normalizer.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/pt/__init__.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/pt/abbreviations.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/pt/data/__init__.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/pt/g2p.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/pt/normalizer.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/punctuation.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/py.typed +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/vocab.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/words_mismatch.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/zh/__init__.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/zh/frontend.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/zh/g2p.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/zh/tone_sandhi.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/zh/transcription.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p.egg-info/dependency_links.txt +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p.egg-info/requires.txt +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p.egg-info/top_level.txt +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/pyproject.toml +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/requirements-test.txt +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/setup.cfg +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/setup.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/__init__.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/conftest.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_abbreviation_customization.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_benchmark_validation.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_ci_bug_fix.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_contraction_detection.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_cs_g2p.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_de_g2p.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_en_abbreviations.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_en_debug.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_en_g2p.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_en_lexicon.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_en_normalizer.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_es_g2p.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_espeak_backend.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_goruut_backend.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_he_g2p.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_it_g2p.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_ko_g2p.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_multilang.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_normalization.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_phoneme_spacing.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_phonemes.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_pt_g2p.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_punctuation.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_quote_handling.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_quote_phonemes.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_temperature_normalization.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_vocab.py +0 -0
- {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_words_mismatch.py +0 -0
|
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
|
|
|
28
28
|
commit_id: COMMIT_ID
|
|
29
29
|
__commit_id__: COMMIT_ID
|
|
30
30
|
|
|
31
|
-
__version__ = version = '0.3.2'
|
|
32
|
-
__version_tuple__ = version_tuple = (0, 3, 2)
|
|
31
|
+
__version__ = version = '0.3.3'
|
|
32
|
+
__version_tuple__ = version_tuple = (0, 3, 3)
|
|
33
33
|
|
|
34
|
-
__commit_id__ = commit_id = '
|
|
34
|
+
__commit_id__ = commit_id = 'g2b66f6ceb'
|
|
@@ -70,12 +70,12 @@ class G2PBase(ABC):
|
|
|
70
70
|
if token.phonemes:
|
|
71
71
|
result.append(token.phonemes)
|
|
72
72
|
if token.whitespace:
|
|
73
|
-
result.append(" ")
|
|
73
|
+
result.append(token.whitespace)
|
|
74
74
|
elif token.is_punctuation:
|
|
75
75
|
# Keep punctuation as-is
|
|
76
76
|
result.append(token.text)
|
|
77
77
|
if token.whitespace:
|
|
78
|
-
result.append(" ")
|
|
78
|
+
result.append(token.whitespace)
|
|
79
79
|
return "".join(result).strip()
|
|
80
80
|
|
|
81
81
|
def word_to_phonemes(self, word: str, tag: str | None = None) -> str | None:
|
|
@@ -61,8 +61,8 @@ def preprocess_markdown(text: str) -> tuple[str, list[str], dict[int, str]]:
|
|
|
61
61
|
# Extract phonemes (group 2)
|
|
62
62
|
phonemes = m.group(2)
|
|
63
63
|
|
|
64
|
-
# Check if it's a phoneme annotation (starts with /)
|
|
65
|
-
if phonemes and phonemes.startswith("/"):
|
|
64
|
+
# Check if it's a phoneme annotation (starts and ends with /)
|
|
65
|
+
if phonemes and phonemes.startswith("/") and phonemes.endswith("/"):
|
|
66
66
|
phonemes = phonemes.strip("/") # Remove leading and trailing slashes
|
|
67
67
|
features[len(tokens)] = phonemes
|
|
68
68
|
|
|
@@ -99,8 +99,9 @@ def apply_markdown_features(
|
|
|
99
99
|
# This assumes G2P tokenization preserves words from preprocessing
|
|
100
100
|
token_map: dict[int, int] = {}
|
|
101
101
|
for i, orig_word in enumerate(original_tokens):
|
|
102
|
+
orig_key = orig_word.casefold()
|
|
102
103
|
for j, token in enumerate(tokens):
|
|
103
|
-
if token.text == orig_word and j not in token_map.values():
|
|
104
|
+
if token.text.casefold() == orig_key and j not in token_map.values():
|
|
104
105
|
token_map[i] = j
|
|
105
106
|
break
|
|
106
107
|
|
|
@@ -165,15 +165,28 @@ class MixedLanguageG2P(G2PBase):
|
|
|
165
165
|
ImportError: If enable_detection=True but lingua-language-detector
|
|
166
166
|
is not installed (will warn and disable detection instead).
|
|
167
167
|
"""
|
|
168
|
-
super().__init__(language=primary_language)
|
|
169
|
-
self.version = version
|
|
170
|
-
|
|
171
168
|
if allowed_languages is None or len(allowed_languages) == 0:
|
|
172
169
|
raise ValueError(
|
|
173
170
|
"allowed_languages must be specified and non-empty. "
|
|
174
171
|
"Example: allowed_languages=['de', 'en-us']"
|
|
175
172
|
)
|
|
176
173
|
|
|
174
|
+
normalized_allowed = [
|
|
175
|
+
lang.lower().replace("_", "-") for lang in allowed_languages
|
|
176
|
+
]
|
|
177
|
+
primary_normalized = primary_language.lower().replace("_", "-")
|
|
178
|
+
if primary_normalized not in normalized_allowed:
|
|
179
|
+
raise ValueError(
|
|
180
|
+
"primary_language must be in allowed_languages. "
|
|
181
|
+
"Example: primary_language='de', allowed_languages=['de', 'en-us']"
|
|
182
|
+
)
|
|
183
|
+
|
|
184
|
+
primary_language = primary_normalized
|
|
185
|
+
allowed_languages = normalized_allowed
|
|
186
|
+
|
|
187
|
+
super().__init__(language=primary_language)
|
|
188
|
+
self.version = version
|
|
189
|
+
|
|
177
190
|
# Check if lingua is available
|
|
178
191
|
if not LINGUA_AVAILABLE:
|
|
179
192
|
if enable_detection:
|
|
@@ -202,10 +202,15 @@ class BaseTokenizer(ABC):
|
|
|
202
202
|
"""
|
|
203
203
|
if self.phoneme_quotes == "ascii":
|
|
204
204
|
# Convert all quote variants to ASCII
|
|
205
|
-
text = text.replace('“', '"').replace('”', '"').replace("`", '"')
|
|
205
|
+
text = text.replace("\u201c", '"').replace("\u201d", '"').replace("`", '"')
|
|
206
206
|
elif self.phoneme_quotes == "none":
|
|
207
207
|
# Strip all quotes
|
|
208
|
-
text = text.replace('"', "").replace('“', "").replace('”', "").replace("`", "")
|
|
208
|
+
text = (
|
|
209
|
+
text.replace('"', "")
|
|
210
|
+
.replace("\u201c", "")
|
|
211
|
+
.replace("\u201d", "")
|
|
212
|
+
.replace("`", "")
|
|
213
|
+
)
|
|
209
214
|
# else: "curly" - keep as-is
|
|
210
215
|
|
|
211
216
|
return text
|
|
@@ -37,7 +37,20 @@ class GToken:
|
|
|
37
37
|
@property
|
|
38
38
|
def is_punctuation(self) -> bool:
|
|
39
39
|
"""Check if this token is punctuation."""
|
|
40
|
-
return self.tag in (".", ",", ":", ";", "!", "?", "-", "'", '"', "(", ")")
|
|
40
|
+
return self.tag in (
|
|
41
|
+
".",
|
|
42
|
+
",",
|
|
43
|
+
":",
|
|
44
|
+
";",
|
|
45
|
+
"!",
|
|
46
|
+
"?",
|
|
47
|
+
"-",
|
|
48
|
+
"'",
|
|
49
|
+
'"',
|
|
50
|
+
"(",
|
|
51
|
+
")",
|
|
52
|
+
"PUNCT",
|
|
53
|
+
)
|
|
41
54
|
|
|
42
55
|
@property
|
|
43
56
|
def is_word(self) -> bool:
|
|
@@ -61,6 +74,7 @@ class GToken:
|
|
|
61
74
|
phonemes=self.phonemes,
|
|
62
75
|
start_ts=self.start_ts,
|
|
63
76
|
end_ts=self.end_ts,
|
|
77
|
+
rating=self.rating,
|
|
64
78
|
_=dict(self._),
|
|
65
79
|
)
|
|
66
80
|
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""Tests for G2PBase utilities."""
|
|
2
|
+
|
|
3
|
+
from kokorog2p.base import G2PBase
|
|
4
|
+
from kokorog2p.token import GToken
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class DummyG2P(G2PBase):
|
|
8
|
+
"""Minimal G2P implementation for base tests."""
|
|
9
|
+
|
|
10
|
+
def __init__(self, tokens: list[GToken]):
|
|
11
|
+
super().__init__(language="en-us")
|
|
12
|
+
self._tokens = tokens
|
|
13
|
+
|
|
14
|
+
def __call__(self, text: str) -> list[GToken]:
|
|
15
|
+
return list(self._tokens)
|
|
16
|
+
|
|
17
|
+
def lookup(self, word: str, tag: str | None = None) -> str | None:
|
|
18
|
+
return None
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class TestG2PBase:
|
|
22
|
+
"""Tests for G2PBase helpers."""
|
|
23
|
+
|
|
24
|
+
def test_phonemize_preserves_whitespace(self):
|
|
25
|
+
"""phonemize should preserve token whitespace exactly."""
|
|
26
|
+
tokens = [
|
|
27
|
+
GToken(text="Hello", phonemes="h", whitespace=" "),
|
|
28
|
+
GToken(text="world", phonemes="w", whitespace=""),
|
|
29
|
+
GToken(text=".", tag=".", whitespace=""),
|
|
30
|
+
]
|
|
31
|
+
g2p = DummyG2P(tokens)
|
|
32
|
+
|
|
33
|
+
assert g2p.phonemize("ignored") == "h w."
|
|
@@ -52,6 +52,13 @@ class TestPreprocessMarkdown:
|
|
|
52
52
|
assert "link" in clean
|
|
53
53
|
assert len(features) == 0 # Not a phoneme annotation
|
|
54
54
|
|
|
55
|
+
def test_relative_link_is_not_annotation(self):
|
|
56
|
+
"""Relative links should not be treated as annotations."""
|
|
57
|
+
text = "[link](/docs/page) test"
|
|
58
|
+
clean, tokens, features = preprocess_markdown(text)
|
|
59
|
+
assert "link" in clean
|
|
60
|
+
assert len(features) == 0
|
|
61
|
+
|
|
55
62
|
def test_empty_annotation(self):
|
|
56
63
|
"""Test annotation with empty phonemes."""
|
|
57
64
|
text = "[word](/) test"
|
|
@@ -151,6 +158,19 @@ class TestApplyMarkdownFeatures:
|
|
|
151
158
|
result = apply_markdown_features(tokens, features, orig_tokens)
|
|
152
159
|
assert result[0].phonemes == "hɛloʊ"
|
|
153
160
|
|
|
161
|
+
def test_apply_feature_case_insensitive(self):
|
|
162
|
+
"""Features should match regardless of casing differences."""
|
|
163
|
+
tokens = [
|
|
164
|
+
GToken(text="hello", phonemes="hɛloʊ"),
|
|
165
|
+
GToken(text="world", phonemes="wɝld"),
|
|
166
|
+
]
|
|
167
|
+
features = {0: "hˈɛloʊ"}
|
|
168
|
+
orig_tokens = ["Hello", "world"]
|
|
169
|
+
|
|
170
|
+
result = apply_markdown_features(tokens, features, orig_tokens)
|
|
171
|
+
assert result[0].phonemes == "hˈɛloʊ"
|
|
172
|
+
assert result[0].get("rating") == 5
|
|
173
|
+
|
|
154
174
|
|
|
155
175
|
class TestPhonemizeWithMarkdown:
|
|
156
176
|
"""Tests for phonemize_with_markdown function."""
|
|
@@ -46,6 +46,27 @@ class TestMixedLanguageG2PBasic:
|
|
|
46
46
|
with pytest.raises(ValueError, match="allowed_languages must be specified"):
|
|
47
47
|
MixedLanguageG2P(primary_language="de", allowed_languages=[])
|
|
48
48
|
|
|
49
|
+
def test_primary_language_must_be_allowed(self):
|
|
50
|
+
"""Primary language should be part of allowed_languages."""
|
|
51
|
+
from kokorog2p.mixed_language_g2p import MixedLanguageG2P
|
|
52
|
+
|
|
53
|
+
with pytest.raises(
|
|
54
|
+
ValueError, match="primary_language must be in allowed_languages"
|
|
55
|
+
):
|
|
56
|
+
MixedLanguageG2P(primary_language="de", allowed_languages=["en-us"])
|
|
57
|
+
|
|
58
|
+
def test_language_codes_normalized(self):
|
|
59
|
+
"""Language codes should normalize to lowercase with hyphens."""
|
|
60
|
+
from kokorog2p.mixed_language_g2p import MixedLanguageG2P
|
|
61
|
+
|
|
62
|
+
g2p = MixedLanguageG2P(
|
|
63
|
+
primary_language="DE",
|
|
64
|
+
allowed_languages=["EN_US", "DE"],
|
|
65
|
+
enable_detection=False,
|
|
66
|
+
)
|
|
67
|
+
assert g2p.primary_language == "de"
|
|
68
|
+
assert g2p.allowed_languages == ["en-us", "de"]
|
|
69
|
+
|
|
49
70
|
def test_repr(self):
|
|
50
71
|
"""Test string representation."""
|
|
51
72
|
from kokorog2p.mixed_language_g2p import MixedLanguageG2P
|
|
@@ -52,6 +52,7 @@ class TestGToken:
|
|
|
52
52
|
assert GToken(text=".", tag=".").is_punctuation is True
|
|
53
53
|
assert GToken(text=",", tag=",").is_punctuation is True
|
|
54
54
|
assert GToken(text="!", tag="!").is_punctuation is True
|
|
55
|
+
assert GToken(text="!", tag="PUNCT").is_punctuation is True
|
|
55
56
|
assert GToken(text="hello", tag="NN").is_punctuation is False
|
|
56
57
|
|
|
57
58
|
def test_is_word(self):
|
|
@@ -80,6 +81,7 @@ class TestGToken:
|
|
|
80
81
|
text="hello",
|
|
81
82
|
tag="NN",
|
|
82
83
|
phonemes="hˈɛlO",
|
|
84
|
+
rating="espeak",
|
|
83
85
|
_={"rating": 4},
|
|
84
86
|
)
|
|
85
87
|
|
|
@@ -89,6 +91,7 @@ class TestGToken:
|
|
|
89
91
|
assert copy.text == original.text
|
|
90
92
|
assert copy.tag == original.tag
|
|
91
93
|
assert copy.phonemes == original.phonemes
|
|
94
|
+
assert copy.rating == original.rating
|
|
92
95
|
assert copy._["rating"] == 4
|
|
93
96
|
|
|
94
97
|
# Check it's a different object
|
|
@@ -111,6 +111,18 @@ class TestRegexTokenizer:
|
|
|
111
111
|
assert quote_tokens[0].quote_depth == 1
|
|
112
112
|
assert quote_tokens[1].quote_depth == 0
|
|
113
113
|
|
|
114
|
+
def test_normalize_phoneme_quotes_ascii(self):
|
|
115
|
+
"""Curly quotes should normalize to ASCII when requested."""
|
|
116
|
+
tokenizer = RegexTokenizer(phoneme_quotes="ascii")
|
|
117
|
+
text = "\u201cHello\u201d"
|
|
118
|
+
assert tokenizer.normalize_phoneme_quotes(text) == '"Hello"'
|
|
119
|
+
|
|
120
|
+
def test_normalize_phoneme_quotes_none(self):
|
|
121
|
+
"""Quote characters should be stripped when requested."""
|
|
122
|
+
tokenizer = RegexTokenizer(phoneme_quotes="none")
|
|
123
|
+
text = "\u201cHello\u201d"
|
|
124
|
+
assert tokenizer.normalize_phoneme_quotes(text) == "Hello"
|
|
125
|
+
|
|
114
126
|
def test_empty_string(self, tokenizer):
|
|
115
127
|
"""Test tokenizing empty string."""
|
|
116
128
|
tokens = tokenizer.tokenize("")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|