kokorog2p 0.3.2.tar.gz → 0.3.3.tar.gz

This diff shows the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
Files changed (250)
  1. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/PKG-INFO +1 -1
  2. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/_version.py +3 -3
  3. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/base.py +2 -2
  4. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/markdown.py +4 -3
  5. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/mixed_language_g2p.py +16 -3
  6. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/pipeline/tokenizer.py +7 -2
  7. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/token.py +15 -1
  8. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p.egg-info/PKG-INFO +1 -1
  9. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p.egg-info/SOURCES.txt +1 -0
  10. kokorog2p-0.3.3/tests/test_base.py +33 -0
  11. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_markdown.py +20 -0
  12. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_mixed_language_g2p.py +21 -0
  13. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_token.py +3 -0
  14. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_tokenizer.py +12 -0
  15. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/.coveragerc +0 -0
  16. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/.github/pytest.ini +0 -0
  17. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/.github/workflows/codecov.yml +0 -0
  18. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/.github/workflows/pre-commit.yml +0 -0
  19. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/.github/workflows/python-publish.yml +0 -0
  20. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/.github/workflows/tests.yml +0 -0
  21. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/.gitignore +0 -0
  22. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/.pre-commit-config.yaml +0 -0
  23. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/.prettierrc.yml +0 -0
  24. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/.readthedocs.yaml +0 -0
  25. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/.ruff.toml +0 -0
  26. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/LICENSE +0 -0
  27. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/README.md +0 -0
  28. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/BENCHMARK_SUMMARY.md +0 -0
  29. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/CHINESE_DATASET.md +0 -0
  30. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/FRENCH_DATASET.md +0 -0
  31. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/GERMAN_DATASET.md +0 -0
  32. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/ITALIAN_DATASET.md +0 -0
  33. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/JAPANESE_DATASET.md +0 -0
  34. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/KOREAN_DATASET.md +0 -0
  35. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/PORTUGUESE_DATASET.md +0 -0
  36. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/SPANISH_DATASET.md +0 -0
  37. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/__init__.py +0 -0
  38. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/add_zh_metadata.py +0 -0
  39. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/all_languages_benchmark_results.json +0 -0
  40. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/benchmark_comparison.py +0 -0
  41. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/benchmark_cs_g2p.py +0 -0
  42. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/benchmark_de_comparison.py +0 -0
  43. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/benchmark_de_g2p.py +0 -0
  44. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/benchmark_en_gb_comparison.py +0 -0
  45. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/benchmark_en_synthetic.py +0 -0
  46. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/benchmark_en_us_comparison.py +0 -0
  47. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/benchmark_es_comparison.py +0 -0
  48. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/benchmark_fr_comparison.py +0 -0
  49. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/benchmark_fr_g2p.py +0 -0
  50. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/benchmark_g2p.py +0 -0
  51. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/benchmark_it_comparison.py +0 -0
  52. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/benchmark_ja_comparison.py +0 -0
  53. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/benchmark_ja_g2p.py +0 -0
  54. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/benchmark_ko_comparison.py +0 -0
  55. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/benchmark_load_silver.py +0 -0
  56. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/benchmark_misaki_comparison.py +0 -0
  57. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/benchmark_pt_br_comparison.py +0 -0
  58. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/benchmark_zh_comparison.py +0 -0
  59. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/data/README.md +0 -0
  60. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/data/__init__.py +0 -0
  61. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/data/de_synthetic.json +0 -0
  62. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/data/en_gb_synthetic.json +0 -0
  63. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/data/en_us_synthetic.json +0 -0
  64. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/data/es_synthetic.json +0 -0
  65. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/data/fr_synthetic.json +0 -0
  66. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/data/it_synthetic.json +0 -0
  67. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/data/ja_synthetic.json +0 -0
  68. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/data/ko_synthetic.json +0 -0
  69. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/data/pt_br_synthetic.json +0 -0
  70. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/data/zh_synthetic.json +0 -0
  71. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/data/zh_synthetic_handcrafted.json +0 -0
  72. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/download_childes.py +0 -0
  73. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/extract_childes_fast.py +0 -0
  74. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/extract_childes_sentences.py +0 -0
  75. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/generate_phonemes.py +0 -0
  76. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/generate_zh_synthetic.py +0 -0
  77. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/random_sentence_generator.py +0 -0
  78. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/regenerate_phonemes.py +0 -0
  79. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/benchmarks/validate_synthetic_data.py +0 -0
  80. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/README_DOCS.md +0 -0
  81. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/abbreviation_customization.rst +0 -0
  82. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/advanced.rst +0 -0
  83. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/api/backends.rst +0 -0
  84. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/api/chinese.rst +0 -0
  85. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/api/core.rst +0 -0
  86. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/api/czech.rst +0 -0
  87. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/api/english.rst +0 -0
  88. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/api/french.rst +0 -0
  89. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/api/german.rst +0 -0
  90. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/api/hebrew.rst +0 -0
  91. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/api/italian.rst +0 -0
  92. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/api/japanese.rst +0 -0
  93. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/api/korean.rst +0 -0
  94. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/api/mixed.rst +0 -0
  95. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/api/portuguese.rst +0 -0
  96. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/api/spanish.rst +0 -0
  97. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/api/utils.rst +0 -0
  98. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/changelog.rst +0 -0
  99. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/conf.py +0 -0
  100. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/contributing.rst +0 -0
  101. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/index.rst +0 -0
  102. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/installation.rst +0 -0
  103. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/languages.rst +0 -0
  104. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/make.bat +0 -0
  105. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/make.py +0 -0
  106. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/phonemes.rst +0 -0
  107. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/quickstart.rst +0 -0
  108. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/docs/requirements.txt +0 -0
  109. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/examples/abbreviation_customization.py +0 -0
  110. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/examples/debug_mode_demo.py +0 -0
  111. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/examples/demo_both_features.py +0 -0
  112. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/examples/quick_demo_dr_to_drive.py +0 -0
  113. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/__init__.py +0 -0
  114. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/backends/__init__.py +0 -0
  115. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/backends/espeak/__init__.py +0 -0
  116. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/backends/espeak/api.py +0 -0
  117. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/backends/espeak/backend.py +0 -0
  118. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/backends/espeak/voice.py +0 -0
  119. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/backends/espeak/wrapper.py +0 -0
  120. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/backends/goruut/__init__.py +0 -0
  121. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/backends/goruut/backend.py +0 -0
  122. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/cs/__init__.py +0 -0
  123. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/cs/abbreviations.py +0 -0
  124. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/cs/data/__init__.py +0 -0
  125. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/cs/fallback.py +0 -0
  126. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/cs/g2p.py +0 -0
  127. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/cs/normalizer.py +0 -0
  128. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/data/__init__.py +0 -0
  129. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/data/kokoro_config.json +0 -0
  130. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/data/kokoro_config_v1.1_zh.json +0 -0
  131. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/de/__init__.py +0 -0
  132. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/de/abbreviations.py +0 -0
  133. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/de/data/__init__.py +0 -0
  134. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/de/data/de_gold.json +0 -0
  135. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/de/fallback.py +0 -0
  136. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/de/g2p.py +0 -0
  137. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/de/lexicon.py +0 -0
  138. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/de/normalizer.py +0 -0
  139. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/de/numbers.py +0 -0
  140. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/en/__init__.py +0 -0
  141. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/en/abbreviations.py +0 -0
  142. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/en/data/__init__.py +0 -0
  143. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/en/data/gb_gold.json +0 -0
  144. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/en/data/gb_silver.json +0 -0
  145. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/en/data/us_gold.json +0 -0
  146. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/en/data/us_silver.json +0 -0
  147. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/en/fallback.py +0 -0
  148. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/en/g2p.py +0 -0
  149. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/en/lexicon.py +0 -0
  150. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/en/normalizer.py +0 -0
  151. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/en/numbers.py +0 -0
  152. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/es/__init__.py +0 -0
  153. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/es/abbreviations.py +0 -0
  154. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/es/data/__init__.py +0 -0
  155. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/es/g2p.py +0 -0
  156. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/es/normalizer.py +0 -0
  157. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/espeak_g2p.py +0 -0
  158. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/fr/__init__.py +0 -0
  159. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/fr/abbreviations.py +0 -0
  160. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/fr/data/__init__.py +0 -0
  161. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/fr/data/fr_gold.json +0 -0
  162. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/fr/fallback.py +0 -0
  163. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/fr/g2p.py +0 -0
  164. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/fr/lexicon.py +0 -0
  165. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/fr/normalizer.py +0 -0
  166. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/fr/numbers.py +0 -0
  167. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/goruut_g2p.py +0 -0
  168. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/he/__init__.py +0 -0
  169. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/he/g2p.py +0 -0
  170. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/it/__init__.py +0 -0
  171. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/it/abbreviations.py +0 -0
  172. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/it/data/__init__.py +0 -0
  173. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/it/g2p.py +0 -0
  174. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/it/normalizer.py +0 -0
  175. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/ja/__init__.py +0 -0
  176. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/ja/cutlet.py +0 -0
  177. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/ja/data/__init__.py +0 -0
  178. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/ja/data/ja_words.txt +0 -0
  179. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/ja/g2p.py +0 -0
  180. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/ja/num2kana.py +0 -0
  181. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/ko/README.md +0 -0
  182. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/ko/__init__.py +0 -0
  183. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/ko/data/__init__.py +0 -0
  184. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/ko/data/idioms.py +0 -0
  185. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/ko/data/rules.py +0 -0
  186. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/ko/data/table.csv +0 -0
  187. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/ko/english.py +0 -0
  188. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/ko/g2p.py +0 -0
  189. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/ko/g2pk.py +0 -0
  190. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/ko/jamo_to_ipa.py +0 -0
  191. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/ko/numerals.py +0 -0
  192. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/ko/regular.py +0 -0
  193. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/ko/special.py +0 -0
  194. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/ko/utils.py +0 -0
  195. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/phonemes.py +0 -0
  196. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/pipeline/__init__.py +0 -0
  197. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/pipeline/abbreviations.py +0 -0
  198. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/pipeline/models.py +0 -0
  199. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/pipeline/normalizer.py +0 -0
  200. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/pt/__init__.py +0 -0
  201. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/pt/abbreviations.py +0 -0
  202. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/pt/data/__init__.py +0 -0
  203. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/pt/g2p.py +0 -0
  204. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/pt/normalizer.py +0 -0
  205. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/punctuation.py +0 -0
  206. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/py.typed +0 -0
  207. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/vocab.py +0 -0
  208. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/words_mismatch.py +0 -0
  209. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/zh/__init__.py +0 -0
  210. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/zh/frontend.py +0 -0
  211. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/zh/g2p.py +0 -0
  212. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/zh/tone_sandhi.py +0 -0
  213. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p/zh/transcription.py +0 -0
  214. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p.egg-info/dependency_links.txt +0 -0
  215. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p.egg-info/requires.txt +0 -0
  216. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/kokorog2p.egg-info/top_level.txt +0 -0
  217. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/pyproject.toml +0 -0
  218. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/requirements-test.txt +0 -0
  219. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/setup.cfg +0 -0
  220. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/setup.py +0 -0
  221. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/__init__.py +0 -0
  222. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/conftest.py +0 -0
  223. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_abbreviation_customization.py +0 -0
  224. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_benchmark_validation.py +0 -0
  225. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_ci_bug_fix.py +0 -0
  226. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_contraction_detection.py +0 -0
  227. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_cs_g2p.py +0 -0
  228. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_de_g2p.py +0 -0
  229. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_en_abbreviations.py +0 -0
  230. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_en_debug.py +0 -0
  231. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_en_g2p.py +0 -0
  232. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_en_lexicon.py +0 -0
  233. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_en_normalizer.py +0 -0
  234. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_es_g2p.py +0 -0
  235. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_espeak_backend.py +0 -0
  236. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_goruut_backend.py +0 -0
  237. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_he_g2p.py +0 -0
  238. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_it_g2p.py +0 -0
  239. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_ko_g2p.py +0 -0
  240. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_multilang.py +0 -0
  241. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_normalization.py +0 -0
  242. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_phoneme_spacing.py +0 -0
  243. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_phonemes.py +0 -0
  244. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_pt_g2p.py +0 -0
  245. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_punctuation.py +0 -0
  246. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_quote_handling.py +0 -0
  247. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_quote_phonemes.py +0 -0
  248. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_temperature_normalization.py +0 -0
  249. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_vocab.py +0 -0
  250. {kokorog2p-0.3.2 → kokorog2p-0.3.3}/tests/test_words_mismatch.py +0 -0
--- kokorog2p-0.3.2/PKG-INFO
+++ kokorog2p-0.3.3/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: kokorog2p
-Version: 0.3.2
+Version: 0.3.3
 Summary: A unified G2P (Grapheme-to-Phoneme) library for Kokoro TTS
 Author-email: Holger Nahrstaedt <nahrstaedt@gmail.com>
 License: Apache License
--- kokorog2p-0.3.2/kokorog2p/_version.py
+++ kokorog2p-0.3.3/kokorog2p/_version.py
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
 
-__version__ = version = '0.3.2'
-__version_tuple__ = version_tuple = (0, 3, 2)
+__version__ = version = '0.3.3'
+__version_tuple__ = version_tuple = (0, 3, 3)
 
-__commit_id__ = commit_id = 'g47241d749'
+__commit_id__ = commit_id = 'g2b66f6ceb'
--- kokorog2p-0.3.2/kokorog2p/base.py
+++ kokorog2p-0.3.3/kokorog2p/base.py
@@ -70,12 +70,12 @@ class G2PBase(ABC):
             if token.phonemes:
                 result.append(token.phonemes)
                 if token.whitespace:
-                    result.append(" ")
+                    result.append(token.whitespace)
             elif token.is_punctuation:
                 # Keep punctuation as-is
                 result.append(token.text)
                 if token.whitespace:
-                    result.append(" ")
+                    result.append(token.whitespace)
         return "".join(result).strip()
 
     def word_to_phonemes(self, word: str, tag: str | None = None) -> str | None:
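
Note: the phonemizer now re-emits each token's own whitespace instead of forcing a single space. A minimal sketch of the effect, using the GToken fields exercised by the new tests/test_base.py further down; the token values and the before/after comment are illustrative, not taken from the diff:

from kokorog2p.token import GToken

tokens = [
    GToken(text="Hello", phonemes="h", whitespace="\n"),  # e.g. a newline between words
    GToken(text="world", phonemes="w", whitespace=""),
]
# 0.3.2 joined these as "h w" (any whitespace collapsed to a single space);
# 0.3.3 joins them as "h\nw", passing token.whitespace through verbatim.
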
--- kokorog2p-0.3.2/kokorog2p/markdown.py
+++ kokorog2p-0.3.3/kokorog2p/markdown.py
@@ -61,8 +61,8 @@ def preprocess_markdown(text: str) -> tuple[str, list[str], dict[int, str]]:
         # Extract phonemes (group 2)
         phonemes = m.group(2)
 
-        # Check if it's a phoneme annotation (starts with /)
-        if phonemes and phonemes.startswith("/"):
+        # Check if it's a phoneme annotation (starts and ends with /)
+        if phonemes and phonemes.startswith("/") and phonemes.endswith("/"):
            phonemes = phonemes.strip("/")  # Remove leading and trailing slashes
            features[len(tokens)] = phonemes
 
@@ -99,8 +99,9 @@ def apply_markdown_features(
     # This assumes G2P tokenization preserves words from preprocessing
     token_map: dict[int, int] = {}
     for i, orig_word in enumerate(original_tokens):
+        orig_key = orig_word.casefold()
         for j, token in enumerate(tokens):
-            if token.text == orig_word and j not in token_map.values():
+            if token.text.casefold() == orig_key and j not in token_map.values():
                 token_map[i] = j
                 break
 
--- kokorog2p-0.3.2/kokorog2p/mixed_language_g2p.py
+++ kokorog2p-0.3.3/kokorog2p/mixed_language_g2p.py
@@ -165,15 +165,28 @@ class MixedLanguageG2P(G2PBase):
             ImportError: If enable_detection=True but lingua-language-detector
                 is not installed (will warn and disable detection instead).
         """
-        super().__init__(language=primary_language)
-        self.version = version
-
         if allowed_languages is None or len(allowed_languages) == 0:
             raise ValueError(
                 "allowed_languages must be specified and non-empty. "
                 "Example: allowed_languages=['de', 'en-us']"
             )
 
+        normalized_allowed = [
+            lang.lower().replace("_", "-") for lang in allowed_languages
+        ]
+        primary_normalized = primary_language.lower().replace("_", "-")
+        if primary_normalized not in normalized_allowed:
+            raise ValueError(
+                "primary_language must be in allowed_languages. "
+                "Example: primary_language='de', allowed_languages=['de', 'en-us']"
+            )
+
+        primary_language = primary_normalized
+        allowed_languages = normalized_allowed
+
+        super().__init__(language=primary_language)
+        self.version = version
+
         # Check if lingua is available
         if not LINGUA_AVAILABLE:
             if enable_detection:
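
Note: the constructor now normalizes language codes (lowercase, underscores to hyphens) and rejects a primary language that is not in allowed_languages before calling the base initializer. A sketch mirroring the new tests, with detection disabled so the optional lingua dependency is not required:

from kokorog2p.mixed_language_g2p import MixedLanguageG2P

g2p = MixedLanguageG2P(
    primary_language="DE",
    allowed_languages=["EN_US", "DE"],
    enable_detection=False,
)
g2p.primary_language    # "de"
g2p.allowed_languages   # ["en-us", "de"]

MixedLanguageG2P(primary_language="de", allowed_languages=["en-us"])
# raises ValueError: "primary_language must be in allowed_languages. ..."
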
--- kokorog2p-0.3.2/kokorog2p/pipeline/tokenizer.py
+++ kokorog2p-0.3.3/kokorog2p/pipeline/tokenizer.py
@@ -202,10 +202,15 @@ class BaseTokenizer(ABC):
         """
         if self.phoneme_quotes == "ascii":
             # Convert all quote variants to ASCII
-            text = text.replace("“", '"').replace("”", '"')
+            text = text.replace("\u201c", '"').replace("\u201d", '"').replace("`", '"')
         elif self.phoneme_quotes == "none":
             # Strip all quotes
-            text = text.replace('"', "").replace("“", "").replace("”", "")
+            text = (
+                text.replace('"', "")
+                .replace("\u201c", "")
+                .replace("\u201d", "")
+                .replace("`", "")
+            )
         # else: "curly" - keep as-is
 
         return text
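
Note: quote normalization now spells out the curly-quote code points (U+201C/U+201D) explicitly and also covers backticks. A sketch mirroring the new tokenizer tests; the import path for RegexTokenizer is assumed from the test module:

from kokorog2p.pipeline.tokenizer import RegexTokenizer

RegexTokenizer(phoneme_quotes="ascii").normalize_phoneme_quotes("\u201cHello\u201d")
# -> '"Hello"'
RegexTokenizer(phoneme_quotes="none").normalize_phoneme_quotes("\u201cHello\u201d")
# -> 'Hello'
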
--- kokorog2p-0.3.2/kokorog2p/token.py
+++ kokorog2p-0.3.3/kokorog2p/token.py
@@ -37,7 +37,20 @@ class GToken:
     @property
     def is_punctuation(self) -> bool:
         """Check if this token is punctuation."""
-        return self.tag in (".", ",", ":", ";", "!", "?", "-", "'", '"', "(", ")")
+        return self.tag in (
+            ".",
+            ",",
+            ":",
+            ";",
+            "!",
+            "?",
+            "-",
+            "'",
+            '"',
+            "(",
+            ")",
+            "PUNCT",
+        )
 
     @property
     def is_word(self) -> bool:
@@ -61,6 +74,7 @@ class GToken:
             phonemes=self.phonemes,
             start_ts=self.start_ts,
             end_ts=self.end_ts,
+            rating=self.rating,
             _=dict(self._),
         )
 
--- kokorog2p-0.3.2/kokorog2p.egg-info/PKG-INFO
+++ kokorog2p-0.3.3/kokorog2p.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: kokorog2p
-Version: 0.3.2
+Version: 0.3.3
 Summary: A unified G2P (Grapheme-to-Phoneme) library for Kokoro TTS
 Author-email: Holger Nahrstaedt <nahrstaedt@gmail.com>
 License: Apache License
--- kokorog2p-0.3.2/kokorog2p.egg-info/SOURCES.txt
+++ kokorog2p-0.3.3/kokorog2p.egg-info/SOURCES.txt
@@ -214,6 +214,7 @@ kokorog2p/zh/transcription.py
 tests/__init__.py
 tests/conftest.py
 tests/test_abbreviation_customization.py
+tests/test_base.py
 tests/test_benchmark_validation.py
 tests/test_ci_bug_fix.py
 tests/test_contraction_detection.py
--- /dev/null
+++ kokorog2p-0.3.3/tests/test_base.py
@@ -0,0 +1,33 @@
+"""Tests for G2PBase utilities."""
+
+from kokorog2p.base import G2PBase
+from kokorog2p.token import GToken
+
+
+class DummyG2P(G2PBase):
+    """Minimal G2P implementation for base tests."""
+
+    def __init__(self, tokens: list[GToken]):
+        super().__init__(language="en-us")
+        self._tokens = tokens
+
+    def __call__(self, text: str) -> list[GToken]:
+        return list(self._tokens)
+
+    def lookup(self, word: str, tag: str | None = None) -> str | None:
+        return None
+
+
+class TestG2PBase:
+    """Tests for G2PBase helpers."""
+
+    def test_phonemize_preserves_whitespace(self):
+        """phonemize should preserve token whitespace exactly."""
+        tokens = [
+            GToken(text="Hello", phonemes="h", whitespace=" "),
+            GToken(text="world", phonemes="w", whitespace=""),
+            GToken(text=".", tag=".", whitespace=""),
+        ]
+        g2p = DummyG2P(tokens)
+
+        assert g2p.phonemize("ignored") == "h w."
--- kokorog2p-0.3.2/tests/test_markdown.py
+++ kokorog2p-0.3.3/tests/test_markdown.py
@@ -52,6 +52,13 @@ class TestPreprocessMarkdown:
         assert "link" in clean
         assert len(features) == 0  # Not a phoneme annotation
 
+    def test_relative_link_is_not_annotation(self):
+        """Relative links should not be treated as annotations."""
+        text = "[link](/docs/page) test"
+        clean, tokens, features = preprocess_markdown(text)
+        assert "link" in clean
+        assert len(features) == 0
+
     def test_empty_annotation(self):
         """Test annotation with empty phonemes."""
         text = "[word](/) test"
@@ -151,6 +158,19 @@ class TestApplyMarkdownFeatures:
         result = apply_markdown_features(tokens, features, orig_tokens)
         assert result[0].phonemes == "hɛloʊ"
 
+    def test_apply_feature_case_insensitive(self):
+        """Features should match regardless of casing differences."""
+        tokens = [
+            GToken(text="hello", phonemes="hɛloʊ"),
+            GToken(text="world", phonemes="wɝld"),
+        ]
+        features = {0: "hˈɛloʊ"}
+        orig_tokens = ["Hello", "world"]
+
+        result = apply_markdown_features(tokens, features, orig_tokens)
+        assert result[0].phonemes == "hˈɛloʊ"
+        assert result[0].get("rating") == 5
+
 
 class TestPhonemizeWithMarkdown:
     """Tests for phonemize_with_markdown function."""
--- kokorog2p-0.3.2/tests/test_mixed_language_g2p.py
+++ kokorog2p-0.3.3/tests/test_mixed_language_g2p.py
@@ -46,6 +46,27 @@ class TestMixedLanguageG2PBasic:
         with pytest.raises(ValueError, match="allowed_languages must be specified"):
             MixedLanguageG2P(primary_language="de", allowed_languages=[])
 
+    def test_primary_language_must_be_allowed(self):
+        """Primary language should be part of allowed_languages."""
+        from kokorog2p.mixed_language_g2p import MixedLanguageG2P
+
+        with pytest.raises(
+            ValueError, match="primary_language must be in allowed_languages"
+        ):
+            MixedLanguageG2P(primary_language="de", allowed_languages=["en-us"])
+
+    def test_language_codes_normalized(self):
+        """Language codes should normalize to lowercase with hyphens."""
+        from kokorog2p.mixed_language_g2p import MixedLanguageG2P
+
+        g2p = MixedLanguageG2P(
+            primary_language="DE",
+            allowed_languages=["EN_US", "DE"],
+            enable_detection=False,
+        )
+        assert g2p.primary_language == "de"
+        assert g2p.allowed_languages == ["en-us", "de"]
+
     def test_repr(self):
         """Test string representation."""
         from kokorog2p.mixed_language_g2p import MixedLanguageG2P
--- kokorog2p-0.3.2/tests/test_token.py
+++ kokorog2p-0.3.3/tests/test_token.py
@@ -52,6 +52,7 @@ class TestGToken:
         assert GToken(text=".", tag=".").is_punctuation is True
         assert GToken(text=",", tag=",").is_punctuation is True
         assert GToken(text="!", tag="!").is_punctuation is True
+        assert GToken(text="!", tag="PUNCT").is_punctuation is True
         assert GToken(text="hello", tag="NN").is_punctuation is False
 
     def test_is_word(self):
@@ -80,6 +81,7 @@ class TestGToken:
             text="hello",
             tag="NN",
             phonemes="hˈɛlO",
+            rating="espeak",
             _={"rating": 4},
         )
 
@@ -89,6 +91,7 @@ class TestGToken:
         assert copy.text == original.text
         assert copy.tag == original.tag
         assert copy.phonemes == original.phonemes
+        assert copy.rating == original.rating
         assert copy._["rating"] == 4
 
         # Check it's a different object
--- kokorog2p-0.3.2/tests/test_tokenizer.py
+++ kokorog2p-0.3.3/tests/test_tokenizer.py
@@ -111,6 +111,18 @@ class TestRegexTokenizer:
         assert quote_tokens[0].quote_depth == 1
         assert quote_tokens[1].quote_depth == 0
 
+    def test_normalize_phoneme_quotes_ascii(self):
+        """Curly quotes should normalize to ASCII when requested."""
+        tokenizer = RegexTokenizer(phoneme_quotes="ascii")
+        text = "\u201cHello\u201d"
+        assert tokenizer.normalize_phoneme_quotes(text) == '"Hello"'
+
+    def test_normalize_phoneme_quotes_none(self):
+        """Quote characters should be stripped when requested."""
+        tokenizer = RegexTokenizer(phoneme_quotes="none")
+        text = "\u201cHello\u201d"
+        assert tokenizer.normalize_phoneme_quotes(text) == "Hello"
+
     def test_empty_string(self, tokenizer):
         """Test tokenizing empty string."""
         tokens = tokenizer.tokenize("")