karaoke-gen 0.75.54__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of karaoke-gen might be problematic. Click here for more details.

Files changed (287) hide show
  1. karaoke_gen/__init__.py +38 -0
  2. karaoke_gen/audio_fetcher.py +1614 -0
  3. karaoke_gen/audio_processor.py +790 -0
  4. karaoke_gen/config.py +83 -0
  5. karaoke_gen/file_handler.py +387 -0
  6. karaoke_gen/instrumental_review/__init__.py +45 -0
  7. karaoke_gen/instrumental_review/analyzer.py +408 -0
  8. karaoke_gen/instrumental_review/editor.py +322 -0
  9. karaoke_gen/instrumental_review/models.py +171 -0
  10. karaoke_gen/instrumental_review/server.py +475 -0
  11. karaoke_gen/instrumental_review/static/index.html +1529 -0
  12. karaoke_gen/instrumental_review/waveform.py +409 -0
  13. karaoke_gen/karaoke_finalise/__init__.py +1 -0
  14. karaoke_gen/karaoke_finalise/karaoke_finalise.py +1833 -0
  15. karaoke_gen/karaoke_gen.py +1026 -0
  16. karaoke_gen/lyrics_processor.py +474 -0
  17. karaoke_gen/metadata.py +160 -0
  18. karaoke_gen/pipeline/__init__.py +87 -0
  19. karaoke_gen/pipeline/base.py +215 -0
  20. karaoke_gen/pipeline/context.py +230 -0
  21. karaoke_gen/pipeline/executors/__init__.py +21 -0
  22. karaoke_gen/pipeline/executors/local.py +159 -0
  23. karaoke_gen/pipeline/executors/remote.py +257 -0
  24. karaoke_gen/pipeline/stages/__init__.py +27 -0
  25. karaoke_gen/pipeline/stages/finalize.py +202 -0
  26. karaoke_gen/pipeline/stages/render.py +165 -0
  27. karaoke_gen/pipeline/stages/screens.py +139 -0
  28. karaoke_gen/pipeline/stages/separation.py +191 -0
  29. karaoke_gen/pipeline/stages/transcription.py +191 -0
  30. karaoke_gen/resources/AvenirNext-Bold.ttf +0 -0
  31. karaoke_gen/resources/Montserrat-Bold.ttf +0 -0
  32. karaoke_gen/resources/Oswald-Bold.ttf +0 -0
  33. karaoke_gen/resources/Oswald-SemiBold.ttf +0 -0
  34. karaoke_gen/resources/Zurich_Cn_BT_Bold.ttf +0 -0
  35. karaoke_gen/style_loader.py +531 -0
  36. karaoke_gen/utils/__init__.py +18 -0
  37. karaoke_gen/utils/bulk_cli.py +492 -0
  38. karaoke_gen/utils/cli_args.py +432 -0
  39. karaoke_gen/utils/gen_cli.py +978 -0
  40. karaoke_gen/utils/remote_cli.py +3268 -0
  41. karaoke_gen/video_background_processor.py +351 -0
  42. karaoke_gen/video_generator.py +424 -0
  43. karaoke_gen-0.75.54.dist-info/METADATA +718 -0
  44. karaoke_gen-0.75.54.dist-info/RECORD +287 -0
  45. karaoke_gen-0.75.54.dist-info/WHEEL +4 -0
  46. karaoke_gen-0.75.54.dist-info/entry_points.txt +5 -0
  47. karaoke_gen-0.75.54.dist-info/licenses/LICENSE +21 -0
  48. lyrics_transcriber/__init__.py +10 -0
  49. lyrics_transcriber/cli/__init__.py +0 -0
  50. lyrics_transcriber/cli/cli_main.py +285 -0
  51. lyrics_transcriber/core/__init__.py +0 -0
  52. lyrics_transcriber/core/config.py +50 -0
  53. lyrics_transcriber/core/controller.py +594 -0
  54. lyrics_transcriber/correction/__init__.py +0 -0
  55. lyrics_transcriber/correction/agentic/__init__.py +9 -0
  56. lyrics_transcriber/correction/agentic/adapter.py +71 -0
  57. lyrics_transcriber/correction/agentic/agent.py +313 -0
  58. lyrics_transcriber/correction/agentic/feedback/aggregator.py +12 -0
  59. lyrics_transcriber/correction/agentic/feedback/collector.py +17 -0
  60. lyrics_transcriber/correction/agentic/feedback/retention.py +24 -0
  61. lyrics_transcriber/correction/agentic/feedback/store.py +76 -0
  62. lyrics_transcriber/correction/agentic/handlers/__init__.py +24 -0
  63. lyrics_transcriber/correction/agentic/handlers/ambiguous.py +44 -0
  64. lyrics_transcriber/correction/agentic/handlers/background_vocals.py +68 -0
  65. lyrics_transcriber/correction/agentic/handlers/base.py +51 -0
  66. lyrics_transcriber/correction/agentic/handlers/complex_multi_error.py +46 -0
  67. lyrics_transcriber/correction/agentic/handlers/extra_words.py +74 -0
  68. lyrics_transcriber/correction/agentic/handlers/no_error.py +42 -0
  69. lyrics_transcriber/correction/agentic/handlers/punctuation.py +44 -0
  70. lyrics_transcriber/correction/agentic/handlers/registry.py +60 -0
  71. lyrics_transcriber/correction/agentic/handlers/repeated_section.py +44 -0
  72. lyrics_transcriber/correction/agentic/handlers/sound_alike.py +126 -0
  73. lyrics_transcriber/correction/agentic/models/__init__.py +5 -0
  74. lyrics_transcriber/correction/agentic/models/ai_correction.py +31 -0
  75. lyrics_transcriber/correction/agentic/models/correction_session.py +30 -0
  76. lyrics_transcriber/correction/agentic/models/enums.py +38 -0
  77. lyrics_transcriber/correction/agentic/models/human_feedback.py +30 -0
  78. lyrics_transcriber/correction/agentic/models/learning_data.py +26 -0
  79. lyrics_transcriber/correction/agentic/models/observability_metrics.py +28 -0
  80. lyrics_transcriber/correction/agentic/models/schemas.py +46 -0
  81. lyrics_transcriber/correction/agentic/models/utils.py +19 -0
  82. lyrics_transcriber/correction/agentic/observability/__init__.py +5 -0
  83. lyrics_transcriber/correction/agentic/observability/langfuse_integration.py +35 -0
  84. lyrics_transcriber/correction/agentic/observability/metrics.py +46 -0
  85. lyrics_transcriber/correction/agentic/observability/performance.py +19 -0
  86. lyrics_transcriber/correction/agentic/prompts/__init__.py +2 -0
  87. lyrics_transcriber/correction/agentic/prompts/classifier.py +227 -0
  88. lyrics_transcriber/correction/agentic/providers/__init__.py +6 -0
  89. lyrics_transcriber/correction/agentic/providers/base.py +36 -0
  90. lyrics_transcriber/correction/agentic/providers/circuit_breaker.py +145 -0
  91. lyrics_transcriber/correction/agentic/providers/config.py +73 -0
  92. lyrics_transcriber/correction/agentic/providers/constants.py +24 -0
  93. lyrics_transcriber/correction/agentic/providers/health.py +28 -0
  94. lyrics_transcriber/correction/agentic/providers/langchain_bridge.py +212 -0
  95. lyrics_transcriber/correction/agentic/providers/model_factory.py +209 -0
  96. lyrics_transcriber/correction/agentic/providers/response_cache.py +218 -0
  97. lyrics_transcriber/correction/agentic/providers/response_parser.py +111 -0
  98. lyrics_transcriber/correction/agentic/providers/retry_executor.py +127 -0
  99. lyrics_transcriber/correction/agentic/router.py +35 -0
  100. lyrics_transcriber/correction/agentic/workflows/__init__.py +5 -0
  101. lyrics_transcriber/correction/agentic/workflows/consensus_workflow.py +24 -0
  102. lyrics_transcriber/correction/agentic/workflows/correction_graph.py +59 -0
  103. lyrics_transcriber/correction/agentic/workflows/feedback_workflow.py +24 -0
  104. lyrics_transcriber/correction/anchor_sequence.py +919 -0
  105. lyrics_transcriber/correction/corrector.py +760 -0
  106. lyrics_transcriber/correction/feedback/__init__.py +2 -0
  107. lyrics_transcriber/correction/feedback/schemas.py +107 -0
  108. lyrics_transcriber/correction/feedback/store.py +236 -0
  109. lyrics_transcriber/correction/handlers/__init__.py +0 -0
  110. lyrics_transcriber/correction/handlers/base.py +52 -0
  111. lyrics_transcriber/correction/handlers/extend_anchor.py +149 -0
  112. lyrics_transcriber/correction/handlers/levenshtein.py +189 -0
  113. lyrics_transcriber/correction/handlers/llm.py +293 -0
  114. lyrics_transcriber/correction/handlers/llm_providers.py +60 -0
  115. lyrics_transcriber/correction/handlers/no_space_punct_match.py +154 -0
  116. lyrics_transcriber/correction/handlers/relaxed_word_count_match.py +85 -0
  117. lyrics_transcriber/correction/handlers/repeat.py +88 -0
  118. lyrics_transcriber/correction/handlers/sound_alike.py +259 -0
  119. lyrics_transcriber/correction/handlers/syllables_match.py +252 -0
  120. lyrics_transcriber/correction/handlers/word_count_match.py +80 -0
  121. lyrics_transcriber/correction/handlers/word_operations.py +187 -0
  122. lyrics_transcriber/correction/operations.py +352 -0
  123. lyrics_transcriber/correction/phrase_analyzer.py +435 -0
  124. lyrics_transcriber/correction/text_utils.py +30 -0
  125. lyrics_transcriber/frontend/.gitignore +23 -0
  126. lyrics_transcriber/frontend/.yarn/releases/yarn-4.7.0.cjs +935 -0
  127. lyrics_transcriber/frontend/.yarnrc.yml +3 -0
  128. lyrics_transcriber/frontend/README.md +50 -0
  129. lyrics_transcriber/frontend/REPLACE_ALL_FUNCTIONALITY.md +210 -0
  130. lyrics_transcriber/frontend/__init__.py +25 -0
  131. lyrics_transcriber/frontend/eslint.config.js +28 -0
  132. lyrics_transcriber/frontend/index.html +18 -0
  133. lyrics_transcriber/frontend/package.json +42 -0
  134. lyrics_transcriber/frontend/public/android-chrome-192x192.png +0 -0
  135. lyrics_transcriber/frontend/public/android-chrome-512x512.png +0 -0
  136. lyrics_transcriber/frontend/public/apple-touch-icon.png +0 -0
  137. lyrics_transcriber/frontend/public/favicon-16x16.png +0 -0
  138. lyrics_transcriber/frontend/public/favicon-32x32.png +0 -0
  139. lyrics_transcriber/frontend/public/favicon.ico +0 -0
  140. lyrics_transcriber/frontend/public/nomad-karaoke-logo.png +0 -0
  141. lyrics_transcriber/frontend/src/App.tsx +214 -0
  142. lyrics_transcriber/frontend/src/api.ts +254 -0
  143. lyrics_transcriber/frontend/src/components/AIFeedbackModal.tsx +77 -0
  144. lyrics_transcriber/frontend/src/components/AddLyricsModal.tsx +114 -0
  145. lyrics_transcriber/frontend/src/components/AgenticCorrectionMetrics.tsx +204 -0
  146. lyrics_transcriber/frontend/src/components/AudioPlayer.tsx +180 -0
  147. lyrics_transcriber/frontend/src/components/CorrectedWordWithActions.tsx +167 -0
  148. lyrics_transcriber/frontend/src/components/CorrectionAnnotationModal.tsx +359 -0
  149. lyrics_transcriber/frontend/src/components/CorrectionDetailCard.tsx +281 -0
  150. lyrics_transcriber/frontend/src/components/CorrectionMetrics.tsx +162 -0
  151. lyrics_transcriber/frontend/src/components/DurationTimelineView.tsx +257 -0
  152. lyrics_transcriber/frontend/src/components/EditActionBar.tsx +68 -0
  153. lyrics_transcriber/frontend/src/components/EditModal.tsx +702 -0
  154. lyrics_transcriber/frontend/src/components/EditTimelineSection.tsx +496 -0
  155. lyrics_transcriber/frontend/src/components/EditWordList.tsx +379 -0
  156. lyrics_transcriber/frontend/src/components/FileUpload.tsx +77 -0
  157. lyrics_transcriber/frontend/src/components/FindReplaceModal.tsx +467 -0
  158. lyrics_transcriber/frontend/src/components/Header.tsx +413 -0
  159. lyrics_transcriber/frontend/src/components/LyricsAnalyzer.tsx +1387 -0
  160. lyrics_transcriber/frontend/src/components/LyricsSynchronizer/SyncControls.tsx +185 -0
  161. lyrics_transcriber/frontend/src/components/LyricsSynchronizer/TimelineCanvas.tsx +704 -0
  162. lyrics_transcriber/frontend/src/components/LyricsSynchronizer/UpcomingWordsBar.tsx +80 -0
  163. lyrics_transcriber/frontend/src/components/LyricsSynchronizer/index.tsx +905 -0
  164. lyrics_transcriber/frontend/src/components/MetricsDashboard.tsx +51 -0
  165. lyrics_transcriber/frontend/src/components/ModeSelectionModal.tsx +127 -0
  166. lyrics_transcriber/frontend/src/components/ModeSelector.tsx +67 -0
  167. lyrics_transcriber/frontend/src/components/ModelSelector.tsx +23 -0
  168. lyrics_transcriber/frontend/src/components/PreviewVideoSection.tsx +144 -0
  169. lyrics_transcriber/frontend/src/components/ReferenceView.tsx +268 -0
  170. lyrics_transcriber/frontend/src/components/ReplaceAllLyricsModal.tsx +336 -0
  171. lyrics_transcriber/frontend/src/components/ReviewChangesModal.tsx +354 -0
  172. lyrics_transcriber/frontend/src/components/SegmentDetailsModal.tsx +64 -0
  173. lyrics_transcriber/frontend/src/components/TimelineEditor.tsx +376 -0
  174. lyrics_transcriber/frontend/src/components/TimingOffsetModal.tsx +131 -0
  175. lyrics_transcriber/frontend/src/components/TranscriptionView.tsx +256 -0
  176. lyrics_transcriber/frontend/src/components/WordDivider.tsx +187 -0
  177. lyrics_transcriber/frontend/src/components/shared/components/HighlightedText.tsx +379 -0
  178. lyrics_transcriber/frontend/src/components/shared/components/SourceSelector.tsx +56 -0
  179. lyrics_transcriber/frontend/src/components/shared/components/Word.tsx +87 -0
  180. lyrics_transcriber/frontend/src/components/shared/constants.ts +20 -0
  181. lyrics_transcriber/frontend/src/components/shared/hooks/useWordClick.ts +180 -0
  182. lyrics_transcriber/frontend/src/components/shared/styles.ts +13 -0
  183. lyrics_transcriber/frontend/src/components/shared/types.js +2 -0
  184. lyrics_transcriber/frontend/src/components/shared/types.ts +129 -0
  185. lyrics_transcriber/frontend/src/components/shared/utils/keyboardHandlers.ts +177 -0
  186. lyrics_transcriber/frontend/src/components/shared/utils/localStorage.ts +78 -0
  187. lyrics_transcriber/frontend/src/components/shared/utils/referenceLineCalculator.ts +75 -0
  188. lyrics_transcriber/frontend/src/components/shared/utils/segmentOperations.ts +360 -0
  189. lyrics_transcriber/frontend/src/components/shared/utils/timingUtils.ts +110 -0
  190. lyrics_transcriber/frontend/src/components/shared/utils/wordUtils.ts +22 -0
  191. lyrics_transcriber/frontend/src/hooks/useManualSync.ts +435 -0
  192. lyrics_transcriber/frontend/src/main.tsx +17 -0
  193. lyrics_transcriber/frontend/src/theme.ts +177 -0
  194. lyrics_transcriber/frontend/src/types/global.d.ts +9 -0
  195. lyrics_transcriber/frontend/src/types.js +2 -0
  196. lyrics_transcriber/frontend/src/types.ts +199 -0
  197. lyrics_transcriber/frontend/src/validation.ts +132 -0
  198. lyrics_transcriber/frontend/src/vite-env.d.ts +1 -0
  199. lyrics_transcriber/frontend/tsconfig.app.json +26 -0
  200. lyrics_transcriber/frontend/tsconfig.json +25 -0
  201. lyrics_transcriber/frontend/tsconfig.node.json +23 -0
  202. lyrics_transcriber/frontend/tsconfig.tsbuildinfo +1 -0
  203. lyrics_transcriber/frontend/update_version.js +11 -0
  204. lyrics_transcriber/frontend/vite.config.d.ts +2 -0
  205. lyrics_transcriber/frontend/vite.config.js +10 -0
  206. lyrics_transcriber/frontend/vite.config.ts +11 -0
  207. lyrics_transcriber/frontend/web_assets/android-chrome-192x192.png +0 -0
  208. lyrics_transcriber/frontend/web_assets/android-chrome-512x512.png +0 -0
  209. lyrics_transcriber/frontend/web_assets/apple-touch-icon.png +0 -0
  210. lyrics_transcriber/frontend/web_assets/assets/index-BECn1o8Q.js +43288 -0
  211. lyrics_transcriber/frontend/web_assets/assets/index-BECn1o8Q.js.map +1 -0
  212. lyrics_transcriber/frontend/web_assets/favicon-16x16.png +0 -0
  213. lyrics_transcriber/frontend/web_assets/favicon-32x32.png +0 -0
  214. lyrics_transcriber/frontend/web_assets/favicon.ico +0 -0
  215. lyrics_transcriber/frontend/web_assets/index.html +18 -0
  216. lyrics_transcriber/frontend/web_assets/nomad-karaoke-logo.png +0 -0
  217. lyrics_transcriber/frontend/yarn.lock +3752 -0
  218. lyrics_transcriber/lyrics/__init__.py +0 -0
  219. lyrics_transcriber/lyrics/base_lyrics_provider.py +211 -0
  220. lyrics_transcriber/lyrics/file_provider.py +95 -0
  221. lyrics_transcriber/lyrics/genius.py +384 -0
  222. lyrics_transcriber/lyrics/lrclib.py +231 -0
  223. lyrics_transcriber/lyrics/musixmatch.py +156 -0
  224. lyrics_transcriber/lyrics/spotify.py +290 -0
  225. lyrics_transcriber/lyrics/user_input_provider.py +44 -0
  226. lyrics_transcriber/output/__init__.py +0 -0
  227. lyrics_transcriber/output/ass/__init__.py +21 -0
  228. lyrics_transcriber/output/ass/ass.py +2088 -0
  229. lyrics_transcriber/output/ass/ass_specs.txt +732 -0
  230. lyrics_transcriber/output/ass/config.py +180 -0
  231. lyrics_transcriber/output/ass/constants.py +23 -0
  232. lyrics_transcriber/output/ass/event.py +94 -0
  233. lyrics_transcriber/output/ass/formatters.py +132 -0
  234. lyrics_transcriber/output/ass/lyrics_line.py +265 -0
  235. lyrics_transcriber/output/ass/lyrics_screen.py +252 -0
  236. lyrics_transcriber/output/ass/section_detector.py +89 -0
  237. lyrics_transcriber/output/ass/section_screen.py +106 -0
  238. lyrics_transcriber/output/ass/style.py +187 -0
  239. lyrics_transcriber/output/cdg.py +619 -0
  240. lyrics_transcriber/output/cdgmaker/__init__.py +0 -0
  241. lyrics_transcriber/output/cdgmaker/cdg.py +262 -0
  242. lyrics_transcriber/output/cdgmaker/composer.py +2260 -0
  243. lyrics_transcriber/output/cdgmaker/config.py +151 -0
  244. lyrics_transcriber/output/cdgmaker/images/instrumental.png +0 -0
  245. lyrics_transcriber/output/cdgmaker/images/intro.png +0 -0
  246. lyrics_transcriber/output/cdgmaker/pack.py +507 -0
  247. lyrics_transcriber/output/cdgmaker/render.py +346 -0
  248. lyrics_transcriber/output/cdgmaker/transitions/centertexttoplogobottomtext.png +0 -0
  249. lyrics_transcriber/output/cdgmaker/transitions/circlein.png +0 -0
  250. lyrics_transcriber/output/cdgmaker/transitions/circleout.png +0 -0
  251. lyrics_transcriber/output/cdgmaker/transitions/fizzle.png +0 -0
  252. lyrics_transcriber/output/cdgmaker/transitions/largecentertexttoplogo.png +0 -0
  253. lyrics_transcriber/output/cdgmaker/transitions/rectangle.png +0 -0
  254. lyrics_transcriber/output/cdgmaker/transitions/spiral.png +0 -0
  255. lyrics_transcriber/output/cdgmaker/transitions/topleftmusicalnotes.png +0 -0
  256. lyrics_transcriber/output/cdgmaker/transitions/wipein.png +0 -0
  257. lyrics_transcriber/output/cdgmaker/transitions/wipeleft.png +0 -0
  258. lyrics_transcriber/output/cdgmaker/transitions/wipeout.png +0 -0
  259. lyrics_transcriber/output/cdgmaker/transitions/wiperight.png +0 -0
  260. lyrics_transcriber/output/cdgmaker/utils.py +132 -0
  261. lyrics_transcriber/output/countdown_processor.py +306 -0
  262. lyrics_transcriber/output/fonts/AvenirNext-Bold.ttf +0 -0
  263. lyrics_transcriber/output/fonts/DMSans-VariableFont_opsz,wght.ttf +0 -0
  264. lyrics_transcriber/output/fonts/DMSerifDisplay-Regular.ttf +0 -0
  265. lyrics_transcriber/output/fonts/Oswald-SemiBold.ttf +0 -0
  266. lyrics_transcriber/output/fonts/Zurich_Cn_BT_Bold.ttf +0 -0
  267. lyrics_transcriber/output/fonts/arial.ttf +0 -0
  268. lyrics_transcriber/output/fonts/georgia.ttf +0 -0
  269. lyrics_transcriber/output/fonts/verdana.ttf +0 -0
  270. lyrics_transcriber/output/generator.py +257 -0
  271. lyrics_transcriber/output/lrc_to_cdg.py +61 -0
  272. lyrics_transcriber/output/lyrics_file.py +102 -0
  273. lyrics_transcriber/output/plain_text.py +96 -0
  274. lyrics_transcriber/output/segment_resizer.py +431 -0
  275. lyrics_transcriber/output/subtitles.py +397 -0
  276. lyrics_transcriber/output/video.py +544 -0
  277. lyrics_transcriber/review/__init__.py +0 -0
  278. lyrics_transcriber/review/server.py +676 -0
  279. lyrics_transcriber/storage/__init__.py +0 -0
  280. lyrics_transcriber/storage/dropbox.py +225 -0
  281. lyrics_transcriber/transcribers/__init__.py +0 -0
  282. lyrics_transcriber/transcribers/audioshake.py +379 -0
  283. lyrics_transcriber/transcribers/base_transcriber.py +157 -0
  284. lyrics_transcriber/transcribers/whisper.py +330 -0
  285. lyrics_transcriber/types.py +650 -0
  286. lyrics_transcriber/utils/__init__.py +0 -0
  287. lyrics_transcriber/utils/word_utils.py +27 -0
@@ -0,0 +1,435 @@
1
+ from typing import List
2
+ import spacy
3
+ from spacy.tokens import Doc
4
+ import logging
5
+ from lyrics_transcriber.correction.text_utils import clean_text
6
+ from lyrics_transcriber.types import PhraseType, PhraseScore
7
+
8
+
9
class PhraseAnalyzer:
    """Language-agnostic phrase analyzer using spaCy.

    Scores a candidate phrase on three axes: its grammatical type
    (complete clause, partial phrase, or boundary-crossing fragment), how well
    it lines up with line/sentence breaks in the surrounding text, and how
    many linguistic units it contains.
    """

    # Length score keyed by number of linguistic units; more than three units
    # is treated as "too complex".
    _LENGTH_SCORES = {0: 0.0, 1: 0.9, 2: 1.0, 3: 0.8}
    _TOO_COMPLEX_SCORE = 0.6

    # Dependency labels allowed on every token of a valid verb phrase.
    _VERB_PHRASE_DEPS = frozenset(
        {
            "ROOT",  # Main verb
            "advmod",  # Adverbial modifier
            "dobj",  # Direct object
            "prt",  # Verb particle
            "prep",  # Preposition
            "pobj",  # Object of preposition
            "compound:prt",  # Phrasal verb particle
        }
    )

    def __init__(self, logger: logging.Logger, language_code: str = "en_core_web_sm"):
        """Initialize with specific language model and logger.

        If the model is not installed, one download attempt is made via
        ``<interpreter> -m spacy download`` before giving up.

        Args:
            logger: Logger instance to use for this analyzer
            language_code: spaCy language model to use

        Raises:
            OSError: If the model is missing and could not be downloaded.
        """
        self.logger = logger
        self.logger.info(f"Initializing PhraseAnalyzer with language model: {language_code}")
        try:
            self.nlp = spacy.load(language_code)
        except OSError:
            self.logger.info(f"Language model {language_code} not found. Attempting to download...")
            import subprocess
            import sys

            # Run the download with the interpreter executing this process so
            # the model lands in the environment we import from; a bare
            # "python" on PATH may be a different interpreter or missing.
            interpreter = sys.executable or "python"
            try:
                subprocess.check_call([interpreter, "-m", "spacy", "download", language_code])
                self.nlp = spacy.load(language_code)
                self.logger.info(f"Successfully downloaded and loaded {language_code}")
            except subprocess.CalledProcessError as e:
                self.logger.error(f"Failed to download language model: {language_code}")
                raise OSError(
                    f"Language model '{language_code}' could not be downloaded. "
                    f"Please install it manually with: python -m spacy download {language_code}"
                ) from e

    def score_phrase(self, words: List[str], context: str) -> "PhraseScore":
        """Score a phrase based on grammatical completeness and natural breaks.

        Args:
            words: List of words in the phrase
            context: Full text containing the phrase

        Returns:
            PhraseScore with phrase_type, natural_break_score, and length_score
        """
        phrase_doc = self.nlp(" ".join(words))
        context_doc = self.nlp(context)

        # Get initial phrase type based on grammar
        phrase_type = self._determine_phrase_type(phrase_doc)

        break_score = self._calculate_break_score(phrase_doc, context_doc)
        length_score = self._calculate_length_score(phrase_doc)

        # A break score of 0 means the phrase crosses a boundary (or was not
        # found in the context), which overrides the grammatical type.
        if break_score == 0.0:
            phrase_type = PhraseType.CROSS_BOUNDARY

        return PhraseScore(phrase_type=phrase_type, natural_break_score=break_score, length_score=length_score)

    def _determine_phrase_type(self, doc: "Doc") -> "PhraseType":
        """Determine the grammatical type of a phrase using spaCy's linguistic analysis.

        This method categorizes text into three types:
            1. COMPLETE: A grammatically complete clause with subject and predicate
               Examples: "I love you", "the cat sleeps"

            2. PARTIAL: A valid but incomplete grammatical unit, which can be:
               a) Noun phrase, e.g. "the big cat"
               b) Verb phrase, e.g. "running fast"
               c) Prepositional phrase, e.g. "in my heart"
               d) Adverb phrase, e.g. "très rapidement" (French: "very quickly")

            3. CROSS_BOUNDARY: Invalid grammatical structure
               Examples: "cat the big", "love but the"

        Args:
            doc: spaCy Doc object containing the parsed text

        Returns:
            PhraseType: COMPLETE, PARTIAL, or CROSS_BOUNDARY
        """
        if self.is_complete_clause(doc):
            return PhraseType.COMPLETE

        is_partial = (
            self.is_valid_noun_phrase(doc)
            or self.is_valid_verb_phrase(doc)
            or self.is_valid_prep_phrase(doc)
            or self.is_valid_adverb_phrase(doc)
        )
        # A grammatically valid partial phrase containing a period is treated
        # as spanning a sentence boundary (simple textual check).
        if is_partial and "." not in doc.text:
            return PhraseType.PARTIAL

        return PhraseType.CROSS_BOUNDARY

    def _calculate_break_score(self, phrase_doc: "Doc", context_doc: "Doc") -> float:
        """Calculate how well the phrase respects natural breaks in the text.

        Scores are based on alignment with line breaks and sentence boundaries:
            1.0 - Perfect alignment (matches full line or sentence)
            0.8-0.9 - Strong alignment (matches most of a natural unit)
            0.5-0.7 - Partial alignment (matches start or end of unit)
            0.0 - Poor alignment (crosses line/sentence boundary), or the
                  cleaned phrase does not occur in the cleaned context

        Examples from tests:
            "my heart will go on" -> 1.0 (matches full line)
            "go on and" -> 0.0 (crosses line break)
            "Hello world" -> 1.0 (matches complete sentence)
            "world How" -> 0.0 (crosses sentence boundary)
            "I wake up" -> 0.85 (strong alignment with verb phrase)
        """
        phrase_text = clean_text(phrase_doc.text)
        context_text = clean_text(context_doc.text)

        # Offsets below are positions within the *cleaned* context.
        phrase_start = context_text.find(phrase_text)
        if phrase_start == -1:
            return 0.0
        phrase_end = phrase_start + len(phrase_text)

        # Line breaks take precedence; a decisive result (0.0 or 1.0) is final.
        line_score = self.calculate_line_break_score(phrase_start, phrase_end, context_doc.text)
        if line_score in {0.0, 1.0}:
            return line_score

        # NOTE(review): the sentence check compares these cleaned-text offsets
        # with spaCy's raw-text start_char/end_char; they can diverge when
        # cleaning removes characters. Confirm this is intended.
        sentence_score = self.calculate_sentence_break_score(phrase_doc, phrase_start, phrase_end, context_doc)
        if sentence_score in {0.0, 1.0}:
            return sentence_score

        return max(line_score, sentence_score)

    def _calculate_length_score(self, doc: "Doc") -> float:
        """Calculate score based on phrase length and complexity.

        Units counted: noun chunks, tokens tagged VERB, tokens with an
        ``advmod`` dependency, and tokens with a ``prep`` dependency.

        Scoring scale:
            0.0 - No meaningful units
            0.9 - One unit (e.g., "the cat")
            1.0 - Two units (e.g., "the cat sleeps")
            0.8 - Three units (e.g., "the big cat sleeps quickly")
            0.6 - Four or more units (e.g., "the big cat sleeps soundly on the mat")
        """
        units = sum(1 for _ in doc.noun_chunks)
        units += sum(1 for token in doc if token.pos_ == "VERB")
        units += sum(1 for token in doc if token.dep_ in ("advmod", "prep"))
        return self._LENGTH_SCORES.get(units, self._TOO_COMPLEX_SCORE)

    def is_complete_clause(self, doc: "Doc") -> bool:
        """Check if the text forms a complete clause.

        Different languages mark subject-verb relationships differently:
            English/French:
                - Subject has nsubj/nsubjpass dependency
                - Verb is ROOT

            Spanish:
                - Sometimes marks pronoun as ROOT
                - Verb can be marked as flat/aux
        """
        # Standard subject-verb pattern (English/French)
        has_subject = any(token.dep_ in {"nsubj", "nsubjpass"} for token in doc)
        has_root_verb = any(token.dep_ == "ROOT" and token.pos_ == "VERB" for token in doc)
        if has_subject and has_root_verb:
            return True

        # Spanish pronoun-verb pattern: exactly two tokens, a pronoun followed
        # by a verb-like word attached as flat/aux.
        return (
            len(doc) == 2
            and doc[0].pos_ == "PRON"
            and doc[1].pos_ in {"VERB", "AUX", "ADJ"}
            and doc[1].dep_ in {"flat", "aux"}
        )

    def is_valid_noun_phrase(self, doc: "Doc") -> bool:
        """Check if the text is a valid noun phrase like "the big cat".

        Valid noun phrases:
            - "the cat" (determiner + noun)
            - "the big cat" (determiner + adjective + noun)
            - "my heart" (possessive + noun)
        """
        chunks = list(doc.noun_chunks)
        if not chunks:
            return False

        # The first noun chunk must span the entire text
        first_chunk = chunks[0]
        if first_chunk.start != 0 or first_chunk.end != len(doc):
            return False

        # Exactly one noun ROOT and no compound dependencies
        root_noun_count = sum(1 for t in doc if t.dep_ == "ROOT" and t.pos_ in {"NOUN", "PROPN"})
        has_compound = any(t.dep_ == "compound" for t in doc)
        return root_noun_count == 1 and not has_compound

    def is_valid_verb_phrase(self, doc: "Doc") -> bool:
        """Check if the text is a valid verb phrase like "running fast".

        A verb phrase must:
            1. Have a verb as its first content word (determiners and
               punctuation are skipped)
            2. Only use dependency labels from the allowed verb-phrase set
        """
        # Find the first token that is neither a determiner nor punctuation;
        # if it is a verb, the phrase necessarily contains a verb.
        first_content = next((token for token in doc if token.pos_ not in {"DET", "PUNCT"}), None)
        if first_content is None or first_content.pos_ != "VERB":
            return False

        return all(token.dep_ in self._VERB_PHRASE_DEPS for token in doc)

    def is_valid_prep_phrase(self, doc: "Doc") -> bool:
        """Check if the text is a valid prepositional phrase.

        Examples:
            - "in my heart" (English)
            - "dans la maison" (French: "in the house")
            - "en la casa" (Spanish: "in the house")

        Returns False for phrases shorter than two tokens (including empty
        input) instead of indexing into them.
        """
        # Guard before touching doc[0]: an empty phrase would raise IndexError.
        if len(doc) < 2:
            return False

        if doc[0].pos_ != "ADP":
            return False

        english_style = any(t.dep_ == "pobj" for t in doc)
        romance_style = doc[0].dep_ == "case" and any(t.dep_ == "ROOT" for t in doc)  # French/Spanish style
        return english_style or romance_style

    def is_valid_adverb_phrase(self, doc: "Doc") -> bool:
        """Check if the text is a valid adverbial phrase.

        Examples:
            - "très rapidement" (French: "very quickly")
            - "muy rápido" (Spanish: "very fast")
            - "very quickly" (English)

        Valid patterns:
            - ADV + ADV/ADJ (modifier + main adverb/adjective)
            - First word must modify second word
            - Second word must be the root
        """
        if len(doc) != 2:  # Only handle two-word phrases for now
            return False

        if not all(token.pos_ in {"ADV", "ADJ"} for token in doc):
            return False

        modifier, head = doc[0], doc[1]
        if modifier.dep_ != "advmod" or head.dep_ != "ROOT":
            return False

        # The modifier must attach to the second word
        if modifier.head != head:
            return False

        return True

    def calculate_line_break_score(self, phrase_start: int, phrase_end: int, context_text: str) -> float:
        """Calculate score based on line break alignment.

        Args:
            phrase_start: Start offset of the phrase within ``clean_text(context_text)``
            phrase_end: End offset (exclusive) within ``clean_text(context_text)``
            context_text: Raw context text, with its original line breaks

        Returns:
            1.0 if the phrase is exactly one line; 0.9 / 0.8 if it is aligned
            with a line's start or end and covers at least 70% / 30% of that
            line's length; 0.0 if a line boundary falls strictly inside the
            phrase; 0.5 otherwise.
        """
        # Lines that clean to nothing (blank or punctuation-only) take up no
        # space in clean_text(context_text), so they must not advance the
        # offset; otherwise every later line is misaligned with the caller's
        # phrase offsets.
        lines = [line for line in map(clean_text, context_text.split("\n")) if line]

        line_start = 0
        for line in lines:
            line_end = line_start + len(line)
            starts_here = phrase_start == line_start
            ends_here = phrase_end == line_end

            # Perfect match with a full line
            if starts_here and ends_here:
                return 1.0

            # Strong alignment with the start or the end of the line.
            # NOTE(review): a phrase that starts at a line start but runs past
            # the line end (coverage > 1) also lands here and scores 0.9
            # before the crossing check below; confirm that is intended.
            if starts_here or ends_here:
                coverage = (phrase_end - phrase_start) / len(line)
                if coverage >= 0.7:
                    return 0.9
                if coverage >= 0.3:
                    return 0.8

            line_start = line_end + 1  # +1 for the separator between lines

        # Cleaned lines never contain "\n", so each "\n" in the joined text
        # marks a line boundary at the same offset as the single space that
        # separates the lines in clean_text(context_text).
        joined = "\n".join(lines)
        if any(phrase_start < pos < phrase_end for pos, char in enumerate(joined) if char == "\n"):
            return 0.0

        return 0.5

    def calculate_sentence_break_score(self, phrase_doc: "Doc", phrase_start: int, phrase_end: int, context_doc: "Doc") -> float:
        """Calculate score based on sentence boundary alignment.

        Args:
            phrase_doc: Parsed phrase, inspected for a verb and a subject
            phrase_start: Start offset of the phrase, compared against each
                sentence's ``start_char``
            phrase_end: End offset of the phrase, compared against each
                sentence's ``end_char``
            context_doc: Parsed context providing the sentences

        Returns:
            1.0 for an exact sentence match; 0.85 / 0.8 / 0.7 when the phrase
            lies inside a sentence (depending on verb, subject and coverage);
            0.0 when a sentence start falls strictly inside the phrase;
            0.5 otherwise.
        """
        has_verb = any(token.pos_ == "VERB" for token in phrase_doc)
        has_subject = any(token.dep_ in {"nsubj", "nsubjpass"} for token in phrase_doc)

        for sent in context_doc.sents:
            sent_start, sent_end = sent.start_char, sent.end_char

            # Perfect match with a full sentence
            if phrase_start == sent_start and phrase_end == sent_end:
                return 1.0

            # Phrase lies entirely inside this sentence
            if sent_start <= phrase_start and phrase_end <= sent_end:
                coverage = (phrase_end - phrase_start) / (sent_end - sent_start)
                if has_verb and has_subject:
                    return 0.85
                if (has_verb and coverage > 0.3) or coverage > 0.5:
                    return 0.8
                return 0.7

        # Crosses sentence boundary
        if any(phrase_start < s.start_char < phrase_end for s in context_doc.sents):
            return 0.0

        return 0.5
@@ -0,0 +1,30 @@
1
+ import re
2
+
3
+
4
def clean_text(text: str) -> str:
    """Normalize text for comparison by stripping punctuation and extra whitespace.

    Args:
        text: Text to clean

    Returns:
        The text after, in order:
            - lowercasing
            - turning hyphens and forward slashes into spaces
            - dropping every character that is neither a word character nor
              whitespace (apostrophes and other punctuation)
            - collapsing whitespace runs to a single space and trimming both ends
    """
    # Ordered (pattern, replacement) passes applied to the lowercased text.
    substitutions = (
        (r"[-/]", " "),  # separators become word breaks
        (r"[^\w\s]", ""),  # remaining punctuation disappears
    )

    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # split() with no argument breaks on any whitespace run and discards
    # leading/trailing whitespace.
    tokens = cleaned.split()
    return " ".join(tokens)
@@ -0,0 +1,23 @@
1
+ # Logs
2
+ logs
3
+ *.log
4
+ npm-debug.log*
5
+ yarn-debug.log*
6
+ yarn-error.log*
7
+ pnpm-debug.log*
8
+ lerna-debug.log*
9
+
10
+ node_modules
11
+ dist-ssr
12
+ *.local
13
+
14
+ # Editor directories and files
15
+ .vscode/*
16
+ !.vscode/extensions.json
17
+ .idea
18
+ .DS_Store
19
+ *.suo
20
+ *.ntvs*
21
+ *.njsproj
22
+ *.sln
23
+ *.sw?