BatchalignHK 0.7.17.post17.tar.gz → 0.7.17.post19.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120)
  1. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/BatchalignHK.egg-info/PKG-INFO +2 -3
  2. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/PKG-INFO +2 -3
  3. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/cli/dispatch.py +10 -6
  4. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/utterance/cantonese_infer.py +15 -12
  5. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/morphosyntax/ud.py +4 -4
  6. batchalignhk-0.7.17.post19/batchalign/version +3 -0
  7. batchalignhk-0.7.17.post17/batchalign/version +0 -3
  8. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/BatchalignHK.egg-info/SOURCES.txt +0 -0
  9. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/BatchalignHK.egg-info/dependency_links.txt +0 -0
  10. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/BatchalignHK.egg-info/entry_points.txt +0 -0
  11. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/BatchalignHK.egg-info/requires.txt +0 -0
  12. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/BatchalignHK.egg-info/top_level.txt +0 -0
  13. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/LICENSE +0 -0
  14. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/MANIFEST.in +0 -0
  15. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/README.md +0 -0
  16. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/__init__.py +0 -0
  17. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/__main__.py +0 -0
  18. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/cli/__init__.py +0 -0
  19. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/cli/cli.py +0 -0
  20. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/constants.py +0 -0
  21. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/document.py +0 -0
  22. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/errors.py +0 -0
  23. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/formats/__init__.py +0 -0
  24. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/formats/base.py +0 -0
  25. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/formats/chat/__init__.py +0 -0
  26. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/formats/chat/file.py +0 -0
  27. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/formats/chat/generator.py +0 -0
  28. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/formats/chat/lexer.py +0 -0
  29. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/formats/chat/parser.py +0 -0
  30. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/formats/chat/utils.py +0 -0
  31. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/formats/textgrid/__init__.py +0 -0
  32. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/formats/textgrid/file.py +0 -0
  33. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/formats/textgrid/generator.py +0 -0
  34. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/formats/textgrid/parser.py +0 -0
  35. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/__init__.py +0 -0
  36. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/resolve.py +0 -0
  37. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/speaker/__init__.py +0 -0
  38. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/speaker/config.yaml +0 -0
  39. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/speaker/infer.py +0 -0
  40. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/speaker/utils.py +0 -0
  41. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/training/__init__.py +0 -0
  42. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/training/run.py +0 -0
  43. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/training/utils.py +0 -0
  44. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/utils.py +0 -0
  45. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/utterance/__init__.py +0 -0
  46. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/utterance/dataset.py +0 -0
  47. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/utterance/execute.py +0 -0
  48. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/utterance/infer.py +0 -0
  49. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/utterance/prep.py +0 -0
  50. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/utterance/train.py +0 -0
  51. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/wave2vec/__init__.py +0 -0
  52. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/wave2vec/infer_fa.py +0 -0
  53. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/whisper/__init__.py +0 -0
  54. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/whisper/infer_asr.py +0 -0
  55. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/whisper/infer_fa.py +0 -0
  56. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/__init__.py +0 -0
  57. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/analysis/__init__.py +0 -0
  58. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/analysis/eval.py +0 -0
  59. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/asr/__init__.py +0 -0
  60. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/asr/num2chinese.py +0 -0
  61. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/asr/rev.py +0 -0
  62. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/asr/tencent.py +0 -0
  63. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/asr/utils.py +0 -0
  64. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/asr/whisper.py +0 -0
  65. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/asr/whisperx.py +0 -0
  66. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/base.py +0 -0
  67. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/cleanup/__init__.py +0 -0
  68. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/cleanup/cleanup.py +0 -0
  69. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
  70. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/cleanup/parse_support.py +0 -0
  71. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/cleanup/retrace.py +0 -0
  72. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
  73. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
  74. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/cleanup/support/test.test +0 -0
  75. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/dispatch.py +0 -0
  76. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/fa/__init__.py +0 -0
  77. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/fa/wave2vec_fa.py +0 -0
  78. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/fa/whisper_fa.py +0 -0
  79. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
  80. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/morphosyntax/coref.py +0 -0
  81. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/morphosyntax/en/irr.py +0 -0
  82. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/morphosyntax/fr/apm.py +0 -0
  83. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/morphosyntax/fr/apmn.py +0 -0
  84. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
  85. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -0
  86. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/pipeline.py +0 -0
  87. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/speaker/__init__.py +0 -0
  88. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
  89. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/translate/__init__.py +0 -0
  90. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/translate/seamless.py +0 -0
  91. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/utr/__init__.py +0 -0
  92. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/utr/rev_utr.py +0 -0
  93. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/utr/utils.py +0 -0
  94. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/utr/whisper_utr.py +0 -0
  95. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/utterance/__init__.py +0 -0
  96. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/utterance/ud_utterance.py +0 -0
  97. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/tests/__init__.py +0 -0
  98. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/tests/conftest.py +0 -0
  99. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
  100. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
  101. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
  102. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
  103. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
  104. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
  105. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
  106. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
  107. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
  108. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
  109. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
  110. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
  111. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/tests/pipelines/fixures.py +0 -0
  112. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/tests/pipelines/test_pipeline.py +0 -0
  113. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
  114. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/tests/test_document.py +0 -0
  115. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/utils/__init__.py +0 -0
  116. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/utils/config.py +0 -0
  117. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/utils/dp.py +0 -0
  118. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/utils/utils.py +0 -0
  119. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/setup.cfg +0 -0
  120. {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.4
1
+ Metadata-Version: 2.2
2
2
  Name: BatchalignHK
3
- Version: 0.7.17.post17
3
+ Version: 0.7.17.post19
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -48,7 +48,6 @@ Dynamic: author-email
48
48
  Dynamic: classifier
49
49
  Dynamic: description
50
50
  Dynamic: description-content-type
51
- Dynamic: license-file
52
51
  Dynamic: provides-extra
53
52
  Dynamic: requires-dist
54
53
  Dynamic: summary
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.4
1
+ Metadata-Version: 2.2
2
2
  Name: BatchalignHK
3
- Version: 0.7.17.post17
3
+ Version: 0.7.17.post19
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -48,7 +48,6 @@ Dynamic: author-email
48
48
  Dynamic: classifier
49
49
  Dynamic: description
50
50
  Dynamic: description-content-type
51
- Dynamic: license-file
52
51
  Dynamic: provides-extra
53
52
  Dynamic: requires-dist
54
53
  Dynamic: summary
@@ -66,12 +66,16 @@ def _dispatch(command, lang, num_speakers,
66
66
 
67
67
  if kwargs.get("data"):
68
68
  url = kwargs.get("data")
69
- url = urlparse(url)
70
- if url.scheme == "":
71
- url = url._replace(scheme="http")
72
- base = os.path.basename(url.path)
73
- files.append(url)
74
- outputs.append(os.path.join(out_dir, base))
69
+ with open(url.strip()) as data:
70
+ data = data.readlines()
71
+ data = [i.strip() for i in data if i.strip() != ""]
72
+ for url in data:
73
+ url = urlparse(url)
74
+ if url.scheme == "":
75
+ url = url._replace(scheme="http")
76
+ base = os.path.basename(url.path)
77
+ files.append(url)
78
+ outputs.append(os.path.join(out_dir, base))
75
79
 
76
80
  for basedir, _, fs in os.walk(in_dir):
77
81
  for f in fs:
@@ -19,6 +19,9 @@ from transformers import DataCollatorForTokenClassification
19
19
  # tqdm
20
20
  from tqdm import tqdm
21
21
 
22
+ import logging
23
+ L = logging.getLogger("batchalign")
24
+
22
25
  # seed device and tokens
23
26
  DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
24
27
 
@@ -34,8 +37,8 @@ class BertCantoneseUtteranceModel(object):
34
37
 
35
38
  # eval mode
36
39
  self.model.eval()
37
- print(f"Model and tokenizer initialized on device: {DEVICE}")
38
- print(f"Max length set to {self.max_length} with overlap of {self.overlap}")
40
+ L.debug(f"Model and tokenizer initialized on device: {DEVICE}")
41
+ L.debug(f"Max length set to {self.max_length} with overlap of {self.overlap}")
39
42
 
40
43
  def __call__(self, passage):
41
44
  # Step 1: Clean up passage
@@ -79,14 +82,14 @@ class BertCantoneseUtteranceModel(object):
79
82
  break
80
83
 
81
84
  # Debugging: Print number of chunks and their content
82
- print(f"Created {len(chunks)} chunks based on keywords.")
85
+ L.debug(f"Created {len(chunks)} chunks based on keywords.")
83
86
  for i, chunk in enumerate(chunks):
84
- print(f"Chunk {i + 1}: {chunk[:100]}...") # Print the first 100 characters of each chunk
87
+ L.debug(f"Chunk {i + 1}: {chunk[:100]}...") # Print the first 100 characters of each chunk
85
88
 
86
89
  # Step 3: Process each chunk and restore punctuation
87
90
  final_passage = []
88
91
  for chunk_index, chunk in enumerate(chunks):
89
- print(f"Processing chunk {chunk_index + 1}/{len(chunks)}...")
92
+ L.debug(f"Processing chunk {chunk_index + 1}/{len(chunks)}...")
90
93
 
91
94
  # Step 3.1: Split chunk by characters (Chinese tokenization)
92
95
  tokenized_chunk = list(chunk) # Simply split by characters for Chinese text
@@ -103,7 +106,7 @@ class BertCantoneseUtteranceModel(object):
103
106
  # Pass it through the model
104
107
  res = self.model(**tokd).logits
105
108
  except Exception as e:
106
- print(f"Error during model inference: {e}")
109
+ L.error(f"Error during model inference: {e}")
107
110
  return []
108
111
 
109
112
  # Argmax for classification
@@ -152,7 +155,7 @@ class BertCantoneseUtteranceModel(object):
152
155
  # Step 4: Join processed chunks together into the final passage
153
156
  final_passage = ' '.join(final_passage)
154
157
 
155
- print("Text processing completed. Generating final output...")
158
+ L.info("Text processing completed. Generating final output...")
156
159
 
157
160
  # Optionally, tokenize the final text into sentences based on punctuation
158
161
  def custom_sent_tokenize(text):
@@ -164,12 +167,12 @@ class BertCantoneseUtteranceModel(object):
164
167
  parts = re.split(sentence_endings, text)
165
168
 
166
169
  # Debug: Output the parts after splitting
167
- print(f"Parts after splitting: {parts}")
170
+ L.debug(f"Parts after splitting: {parts}")
168
171
 
169
172
  # Combine parts and punctuation together
170
173
  for i in range(0, len(parts) - 1, 2):
171
174
  sentence = parts[i] + parts[i + 1] # Join sentence with punctuation
172
- print(f"Sentence formed: {sentence}") # Debug: Output the current sentence
175
+ L.debug(f"Sentence formed: {sentence}") # Debug: Output the current sentence
173
176
 
174
177
  if sentence.strip(): # Only add non-empty sentences (check for non-whitespace content)
175
178
  split_passage.append(sentence)
@@ -177,18 +180,18 @@ class BertCantoneseUtteranceModel(object):
177
180
  # If the last part doesn't have punctuation, we handle it here
178
181
  if len(parts) % 2 != 0: # If there's no punctuation at the end
179
182
  last_part = parts[-1].strip()
180
- print(f"Last part without punctuation: {last_part}") # Debug: Output the last part
183
+ L.debug(f"Last part without punctuation: {last_part}") # Debug: Output the last part
181
184
 
182
185
  if last_part: # Only add non-empty sentences
183
186
  split_passage.append(last_part)
184
187
 
185
188
  # Final output
186
- print(f"Final split passage: {split_passage}")
189
+ L.debug(f"Final split passage: {split_passage}")
187
190
  return split_passage
188
191
 
189
192
  split_passage = custom_sent_tokenize(final_passage)
190
193
 
191
194
  # Debugging: Output the sentences after splitting
192
- print(f"Final sentences: {split_passage}")
195
+ L.debug(f"Final sentences: {split_passage}")
193
196
 
194
197
  return split_passage
@@ -462,7 +462,7 @@ def parse_sentence(sentence, delimiter=".", special_forms=[], lang="$nospecial$"
462
462
  # specivl forms: recall the special form marker is xbxxx
463
463
  if "xbxxx" in word.text.strip():
464
464
  form = special_forms.pop(0)
465
- mor.append(f"x|{form.strip().replace(',', 'cm')}")
465
+ mor.append(f"{form[1].strip()}|{form[0].strip().replace(',', 'cm')}")
466
466
  special_form_ids.append(word.id)
467
467
  else:
468
468
  mor.append(mor_word)
@@ -555,7 +555,6 @@ def parse_sentence(sentence, delimiter=".", special_forms=[], lang="$nospecial$"
555
555
  # add a deliminator
556
556
  mor_str = mor_str + " " + delimiter
557
557
 
558
-
559
558
  mor_str = mor_str.replace("<UNK>", "")
560
559
  gra_str = gra_str.replace("<UNK>", "")
561
560
 
@@ -843,7 +842,7 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
843
842
  special_forms_cleaned = []
844
843
  for form in special_forms:
845
844
  line_cut = line_cut.replace(form, "xbxxx")
846
- special_forms_cleaned.append(re.sub(r"@[\w\:]+", "", form).strip())
845
+ special_forms_cleaned.append(form.split("@"))
847
846
 
848
847
  # if line cut is still nothing, we get very angry
849
848
  if line_cut == "":
@@ -942,7 +941,7 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
942
941
  if ut[i.payload].text != ",":
943
942
  ut[i.payload].morphology = [Morphology(
944
943
  lemma = sents[0].tokens[i.payload].text if len(sents) > 0 and len(sents[0].tokens) > i.payload and sents[0].tokens[i.payload].text != "xbxxx" else ut[i.payload].text,
945
- pos = "x",
944
+ pos = ut[i.payload].morphology[0].pos if (ut[i.payload].morphology and len(ut[i.payload].morphology) > 0) else "x",
946
945
  feats = ""
947
946
  )]
948
947
  poses = [i.morphology[0].pos.upper() for i in ut
@@ -999,6 +998,7 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
999
998
  content.dependency = form.dependency
1000
999
 
1001
1000
  except Exception as e:
1001
+ raise e
1002
1002
  pass
1003
1003
  # warnings.warn(f"Utterance failed parsing, skipping ud tagging... line='{line}', error='{e}'.\n")
1004
1004
 
@@ -0,0 +1,3 @@
1
+ 0.7.17-post.19
2
+ April 06th, 2025
3
+ Japanese form fixes
@@ -1,3 +0,0 @@
1
- 0.7.17-post.17
2
- March 26th, 2025
3
- better tencent ASR