BatchalignHK 0.7.18.post6__tar.gz → 0.7.18.post8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/BatchalignHK.egg-info/PKG-INFO +1 -1
  2. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/PKG-INFO +1 -1
  3. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/cli/cli.py +5 -1
  4. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/document.py +1 -0
  5. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/formats/chat/parser.py +3 -1
  6. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/morphosyntax/ud.py +8 -3
  7. batchalignhk-0.7.18.post8/batchalign/version +3 -0
  8. batchalignhk-0.7.18.post6/batchalign/version +0 -3
  9. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/BatchalignHK.egg-info/SOURCES.txt +0 -0
  10. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/BatchalignHK.egg-info/dependency_links.txt +0 -0
  11. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/BatchalignHK.egg-info/entry_points.txt +0 -0
  12. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/BatchalignHK.egg-info/requires.txt +0 -0
  13. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/BatchalignHK.egg-info/top_level.txt +0 -0
  14. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/LICENSE +0 -0
  15. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/MANIFEST.in +0 -0
  16. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/README.md +0 -0
  17. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/__init__.py +0 -0
  18. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/__main__.py +0 -0
  19. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/cli/__init__.py +0 -0
  20. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/cli/dispatch.py +0 -0
  21. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/constants.py +0 -0
  22. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/errors.py +0 -0
  23. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/formats/__init__.py +0 -0
  24. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/formats/base.py +0 -0
  25. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/formats/chat/__init__.py +0 -0
  26. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/formats/chat/file.py +0 -0
  27. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/formats/chat/generator.py +0 -0
  28. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/formats/chat/lexer.py +0 -0
  29. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/formats/chat/utils.py +0 -0
  30. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/formats/textgrid/__init__.py +0 -0
  31. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/formats/textgrid/file.py +0 -0
  32. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/formats/textgrid/generator.py +0 -0
  33. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/formats/textgrid/parser.py +0 -0
  34. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/models/__init__.py +0 -0
  35. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/models/resolve.py +0 -0
  36. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/models/speaker/__init__.py +0 -0
  37. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/models/speaker/config.yaml +0 -0
  38. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/models/speaker/infer.py +0 -0
  39. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/models/speaker/utils.py +0 -0
  40. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/models/training/__init__.py +0 -0
  41. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/models/training/run.py +0 -0
  42. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/models/training/utils.py +0 -0
  43. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/models/utils.py +0 -0
  44. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/models/utterance/__init__.py +0 -0
  45. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/models/utterance/cantonese_infer.py +0 -0
  46. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/models/utterance/dataset.py +0 -0
  47. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/models/utterance/execute.py +0 -0
  48. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/models/utterance/infer.py +0 -0
  49. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/models/utterance/prep.py +0 -0
  50. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/models/utterance/train.py +0 -0
  51. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/models/wave2vec/__init__.py +0 -0
  52. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/models/wave2vec/infer_fa.py +0 -0
  53. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/models/whisper/__init__.py +0 -0
  54. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/models/whisper/infer_asr.py +0 -0
  55. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/models/whisper/infer_fa.py +0 -0
  56. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/__init__.py +0 -0
  57. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/analysis/__init__.py +0 -0
  58. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/analysis/eval.py +0 -0
  59. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/asr/__init__.py +0 -0
  60. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/asr/num2chinese.py +0 -0
  61. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/asr/rev.py +0 -0
  62. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/asr/tencent.py +0 -0
  63. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/asr/utils.py +0 -0
  64. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/asr/whisper.py +0 -0
  65. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/asr/whisperx.py +0 -0
  66. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/base.py +0 -0
  67. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/cleanup/__init__.py +0 -0
  68. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/cleanup/cleanup.py +0 -0
  69. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
  70. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/cleanup/parse_support.py +0 -0
  71. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/cleanup/retrace.py +0 -0
  72. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
  73. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
  74. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/cleanup/support/test.test +0 -0
  75. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/dispatch.py +0 -0
  76. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/fa/__init__.py +0 -0
  77. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/fa/wave2vec_fa.py +0 -0
  78. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/fa/whisper_fa.py +0 -0
  79. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
  80. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/morphosyntax/coref.py +0 -0
  81. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/morphosyntax/en/irr.py +0 -0
  82. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/morphosyntax/fr/apm.py +0 -0
  83. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/morphosyntax/fr/apmn.py +0 -0
  84. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
  85. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -0
  86. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/pipeline.py +0 -0
  87. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/speaker/__init__.py +0 -0
  88. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
  89. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/translate/__init__.py +0 -0
  90. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/translate/gtrans.py +0 -0
  91. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/translate/seamless.py +0 -0
  92. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/translate/utils.py +0 -0
  93. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/utr/__init__.py +0 -0
  94. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/utr/rev_utr.py +0 -0
  95. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/utr/utils.py +0 -0
  96. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/utr/whisper_utr.py +0 -0
  97. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/utterance/__init__.py +0 -0
  98. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/pipelines/utterance/ud_utterance.py +0 -0
  99. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/tests/__init__.py +0 -0
  100. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/tests/conftest.py +0 -0
  101. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
  102. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
  103. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
  104. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
  105. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
  106. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
  107. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
  108. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
  109. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
  110. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
  111. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
  112. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
  113. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/tests/pipelines/fixures.py +0 -0
  114. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/tests/pipelines/test_pipeline.py +0 -0
  115. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
  116. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/tests/test_document.py +0 -0
  117. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/utils/__init__.py +0 -0
  118. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/utils/config.py +0 -0
  119. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/utils/dp.py +0 -0
  120. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/batchalign/utils/utils.py +0 -0
  121. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/setup.cfg +0 -0
  122. {batchalignhk-0.7.18.post6 → batchalignhk-0.7.18.post8}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: BatchalignHK
3
- Version: 0.7.18.post6
3
+ Version: 0.7.18.post8
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: BatchalignHK
3
- Version: 0.7.18.post6
3
+ Version: 0.7.18.post8
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -230,6 +230,8 @@ def translate(ctx, in_dir, out_dir, **kwargs):
230
230
  @common_options
231
231
  @click.option("--retokenize/--keeptokens",
232
232
  default=False, help="Retokenize the main line to fit the UD tokenizations.")
233
+ @click.option("--skipmultilang/--multilang",
234
+ default=False, help="skip code switching")
233
235
  @click.option("--lexicon",
234
236
  type=click.Path(exists=True,
235
237
  file_okay=True, dir_okay=False),
@@ -253,7 +255,9 @@ def morphotag(ctx, in_dir, out_dir, **kwargs):
253
255
  doc.ba_special_["special_mor_notation"] = True
254
256
  return (
255
257
  doc,
256
- {"retokenize": kwargs["retokenize"], "mwt": mwt}
258
+ {"retokenize": kwargs["retokenize"],
259
+ "skipmultilang": kwargs["skipmultilang"],
260
+ "mwt": mwt}
257
261
  )
258
262
 
259
263
  def writer(doc, output):
@@ -153,6 +153,7 @@ class Utterance(BaseModel):
153
153
  tier: Tier = Field(default=Tier())
154
154
  content: Sentence
155
155
  text: Optional[str] = Field(default=None)
156
+ override_lang: Optional[str] = Field(default=None)
156
157
  translation: Optional[str] = Field(default=None)
157
158
  time: Optional[Tuple[int,int]] = Field(default=None)
158
159
  custom_dependencies: List[CustomLine] = Field(default=[])
@@ -300,6 +300,7 @@ def chat_parse_doc(lines, special_mor=False):
300
300
  content=line.strip()))
301
301
 
302
302
  # parse the actual utterance
303
+ multilingual = re.findall(r"^\[- (\w+)\]", text)
303
304
  parsed, delim = chat_parse_utterance(text, mor, gra, wor, additional)
304
305
 
305
306
  # get the timing of the utterance
@@ -313,7 +314,8 @@ def chat_parse_doc(lines, special_mor=False):
313
314
  "text": text,
314
315
  "delim": delim,
315
316
  "custom_dependencies": additional,
316
- "translation": translation
317
+ "translation": translation,
318
+ "override_lang": None if len(multilingual) == 0 else multilingual[0]
317
319
  })
318
320
 
319
321
  timing = re.findall(rf"\x15(\d+)_(\d+)\x15", text)
@@ -462,7 +462,10 @@ def parse_sentence(sentence, delimiter=".", special_forms=[], lang="$nospecial$"
462
462
  # specivl forms: recall the special form marker is xbxxx
463
463
  if "xbxxx" in word.text.strip():
464
464
  form = special_forms.pop(0)
465
- mor.append(f"{form[1].strip()}|{form[0].strip().replace(',', 'cm')}")
465
+ if form[1][0] == "s":
466
+ mor.append("L2|xxx")
467
+ else:
468
+ mor.append(f"{form[1].strip()}|{form[0].strip().replace(',', 'cm')}")
466
469
  special_form_ids.append(word.id)
467
470
  else:
468
471
  mor.append(mor_word)
@@ -707,7 +710,7 @@ def adlist_postprocessor(i, lang, adlist):
707
710
  return cpy
708
711
 
709
712
  ######
710
- def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, **kwargs):
713
+ def morphoanalyze(doc: Document, retokenize:bool, skipmultilang:bool, status_hook:callable = None, **kwargs):
711
714
  L.debug("Starting Stanza...")
712
715
  inputs = []
713
716
 
@@ -785,6 +788,8 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
785
788
  L.info(f"Stanza processing utterance {indx+1}/{len(doc.content)}")
786
789
  if not isinstance(i, Utterance):
787
790
  continue
791
+ if i.override_lang and skipmultilang:
792
+ continue
788
793
 
789
794
  # generate simplified version of the line
790
795
  line = str(i)
@@ -1011,4 +1016,4 @@ class StanzaEngine(BatchalignEngine):
1011
1016
  self.status_hook = status_hook
1012
1017
 
1013
1018
  def process(self, doc, **kwargs):
1014
- return morphoanalyze(doc, retokenize=kwargs.get("retokenize", False), status_hook=self.status_hook, mwt=kwargs.get("mwt", {}))
1019
+ return morphoanalyze(doc, retokenize=kwargs.get("retokenize", False), skipmultilang=kwargs.get("skipmultilang", False), status_hook=self.status_hook, mwt=kwargs.get("mwt", {}))
@@ -0,0 +1,3 @@
1
+ 0.7.18-post.8
2
+ April 27th, 2025
3
+ skip multilang
@@ -1,3 +0,0 @@
1
- 0.7.18-post.6
2
- April 26th, 2025
3
- Fix translation tabs