lyrics_transcriber-0.30.0-py3-none-any.whl → lyrics_transcriber-0.32.1-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.
Files changed (88)
  1. lyrics_transcriber/__init__.py +2 -1
  2. lyrics_transcriber/cli/{main.py → cli_main.py} +47 -14
  3. lyrics_transcriber/core/config.py +35 -0
  4. lyrics_transcriber/core/controller.py +164 -166
  5. lyrics_transcriber/correction/anchor_sequence.py +471 -0
  6. lyrics_transcriber/correction/corrector.py +256 -0
  7. lyrics_transcriber/correction/handlers/__init__.py +0 -0
  8. lyrics_transcriber/correction/handlers/base.py +30 -0
  9. lyrics_transcriber/correction/handlers/extend_anchor.py +91 -0
  10. lyrics_transcriber/correction/handlers/levenshtein.py +147 -0
  11. lyrics_transcriber/correction/handlers/no_space_punct_match.py +98 -0
  12. lyrics_transcriber/correction/handlers/relaxed_word_count_match.py +55 -0
  13. lyrics_transcriber/correction/handlers/repeat.py +71 -0
  14. lyrics_transcriber/correction/handlers/sound_alike.py +223 -0
  15. lyrics_transcriber/correction/handlers/syllables_match.py +182 -0
  16. lyrics_transcriber/correction/handlers/word_count_match.py +54 -0
  17. lyrics_transcriber/correction/handlers/word_operations.py +135 -0
  18. lyrics_transcriber/correction/phrase_analyzer.py +426 -0
  19. lyrics_transcriber/correction/text_utils.py +30 -0
  20. lyrics_transcriber/lyrics/base_lyrics_provider.py +125 -0
  21. lyrics_transcriber/lyrics/genius.py +73 -0
  22. lyrics_transcriber/lyrics/spotify.py +82 -0
  23. lyrics_transcriber/output/ass/__init__.py +21 -0
  24. lyrics_transcriber/output/{ass.py → ass/ass.py} +150 -690
  25. lyrics_transcriber/output/ass/ass_specs.txt +732 -0
  26. lyrics_transcriber/output/ass/config.py +37 -0
  27. lyrics_transcriber/output/ass/constants.py +23 -0
  28. lyrics_transcriber/output/ass/event.py +94 -0
  29. lyrics_transcriber/output/ass/formatters.py +132 -0
  30. lyrics_transcriber/output/ass/lyrics_line.py +219 -0
  31. lyrics_transcriber/output/ass/lyrics_screen.py +252 -0
  32. lyrics_transcriber/output/ass/section_detector.py +89 -0
  33. lyrics_transcriber/output/ass/section_screen.py +106 -0
  34. lyrics_transcriber/output/ass/style.py +187 -0
  35. lyrics_transcriber/output/cdg.py +503 -0
  36. lyrics_transcriber/output/cdgmaker/__init__.py +0 -0
  37. lyrics_transcriber/output/cdgmaker/cdg.py +262 -0
  38. lyrics_transcriber/output/cdgmaker/composer.py +1919 -0
  39. lyrics_transcriber/output/cdgmaker/config.py +151 -0
  40. lyrics_transcriber/output/cdgmaker/images/instrumental.png +0 -0
  41. lyrics_transcriber/output/cdgmaker/images/intro.png +0 -0
  42. lyrics_transcriber/output/cdgmaker/pack.py +507 -0
  43. lyrics_transcriber/output/cdgmaker/render.py +346 -0
  44. lyrics_transcriber/output/cdgmaker/transitions/centertexttoplogobottomtext.png +0 -0
  45. lyrics_transcriber/output/cdgmaker/transitions/circlein.png +0 -0
  46. lyrics_transcriber/output/cdgmaker/transitions/circleout.png +0 -0
  47. lyrics_transcriber/output/cdgmaker/transitions/fizzle.png +0 -0
  48. lyrics_transcriber/output/cdgmaker/transitions/largecentertexttoplogo.png +0 -0
  49. lyrics_transcriber/output/cdgmaker/transitions/rectangle.png +0 -0
  50. lyrics_transcriber/output/cdgmaker/transitions/spiral.png +0 -0
  51. lyrics_transcriber/output/cdgmaker/transitions/topleftmusicalnotes.png +0 -0
  52. lyrics_transcriber/output/cdgmaker/transitions/wipein.png +0 -0
  53. lyrics_transcriber/output/cdgmaker/transitions/wipeleft.png +0 -0
  54. lyrics_transcriber/output/cdgmaker/transitions/wipeout.png +0 -0
  55. lyrics_transcriber/output/cdgmaker/transitions/wiperight.png +0 -0
  56. lyrics_transcriber/output/cdgmaker/utils.py +132 -0
  57. lyrics_transcriber/output/fonts/AvenirNext-Bold.ttf +0 -0
  58. lyrics_transcriber/output/fonts/DMSans-VariableFont_opsz,wght.ttf +0 -0
  59. lyrics_transcriber/output/fonts/DMSerifDisplay-Regular.ttf +0 -0
  60. lyrics_transcriber/output/fonts/Oswald-SemiBold.ttf +0 -0
  61. lyrics_transcriber/output/fonts/Zurich_Cn_BT_Bold.ttf +0 -0
  62. lyrics_transcriber/output/fonts/arial.ttf +0 -0
  63. lyrics_transcriber/output/fonts/georgia.ttf +0 -0
  64. lyrics_transcriber/output/fonts/verdana.ttf +0 -0
  65. lyrics_transcriber/output/generator.py +140 -171
  66. lyrics_transcriber/output/lyrics_file.py +102 -0
  67. lyrics_transcriber/output/plain_text.py +91 -0
  68. lyrics_transcriber/output/segment_resizer.py +416 -0
  69. lyrics_transcriber/output/subtitles.py +328 -302
  70. lyrics_transcriber/output/video.py +219 -0
  71. lyrics_transcriber/review/__init__.py +1 -0
  72. lyrics_transcriber/review/server.py +138 -0
  73. lyrics_transcriber/storage/dropbox.py +110 -134
  74. lyrics_transcriber/transcribers/audioshake.py +171 -105
  75. lyrics_transcriber/transcribers/base_transcriber.py +149 -0
  76. lyrics_transcriber/transcribers/whisper.py +267 -133
  77. lyrics_transcriber/types.py +454 -0
  78. {lyrics_transcriber-0.30.0.dist-info → lyrics_transcriber-0.32.1.dist-info}/METADATA +14 -3
  79. lyrics_transcriber-0.32.1.dist-info/RECORD +86 -0
  80. {lyrics_transcriber-0.30.0.dist-info → lyrics_transcriber-0.32.1.dist-info}/WHEEL +1 -1
  81. lyrics_transcriber-0.32.1.dist-info/entry_points.txt +4 -0
  82. lyrics_transcriber/core/corrector.py +0 -56
  83. lyrics_transcriber/core/fetcher.py +0 -143
  84. lyrics_transcriber/storage/tokens.py +0 -116
  85. lyrics_transcriber/transcribers/base.py +0 -31
  86. lyrics_transcriber-0.30.0.dist-info/RECORD +0 -22
  87. lyrics_transcriber-0.30.0.dist-info/entry_points.txt +0 -3
  88. {lyrics_transcriber-0.30.0.dist-info → lyrics_transcriber-0.32.1.dist-info}/LICENSE +0 -0
lyrics_transcriber/correction/handlers/word_operations.py
@@ -0,0 +1,135 @@
+from typing import List, Optional, Dict
+from lyrics_transcriber.types import WordCorrection, GapSequence
+
+
+class WordOperations:
+    """Utility class for common word manipulation operations used by correction handlers."""
+
+    @staticmethod
+    def calculate_reference_positions(gap: GapSequence, sources: Optional[List[str]] = None) -> Dict[str, int]:
+        """Calculate reference positions for given sources based on preceding anchor.
+
+        Args:
+            gap: The gap sequence containing the preceding anchor
+            sources: Optional list of sources to calculate positions for. If None, uses all sources.
+
+        Returns:
+            Dictionary mapping source names to their reference positions
+        """
+        reference_positions = {}
+        if gap.preceding_anchor:
+            # If no sources specified, use all sources from reference words
+            sources_to_check = sources or list(gap.reference_words.keys())
+
+            for source in sources_to_check:
+                if source in gap.preceding_anchor.reference_positions:
+                    # Calculate position based on anchor position and offset
+                    anchor_pos = gap.preceding_anchor.reference_positions[source]
+                    ref_pos = anchor_pos + len(gap.preceding_anchor.words)
+                    reference_positions[source] = ref_pos
+        return reference_positions
+
+    @staticmethod
+    def create_word_replacement_correction(
+        original_word: str,
+        corrected_word: str,
+        original_position: int,
+        source: str,
+        confidence: float,
+        reason: str,
+        reference_positions: Optional[Dict[str, int]] = None,
+    ) -> WordCorrection:
+        """Creates a correction for replacing a single word with another word."""
+        return WordCorrection(
+            original_word=original_word,
+            corrected_word=corrected_word,
+            segment_index=0,
+            original_position=original_position,
+            confidence=confidence,
+            source=source,
+            reason=reason,
+            alternatives={},
+            reference_positions=reference_positions,
+            length=1,  # Single word replacement
+        )
+
+    @staticmethod
+    def create_word_split_corrections(
+        original_word: str,
+        reference_words: List[str],
+        original_position: int,
+        source: str,
+        confidence: float,
+        reason: str,
+        reference_positions: Optional[Dict[str, int]] = None,
+    ) -> List[WordCorrection]:
+        """Creates corrections for splitting a single word into multiple words."""
+        corrections = []
+        for split_idx, ref_word in enumerate(reference_words):
+            corrections.append(
+                WordCorrection(
+                    original_word=original_word,
+                    corrected_word=ref_word,
+                    segment_index=0,
+                    original_position=original_position,
+                    confidence=confidence,
+                    source=source,
+                    reason=reason,
+                    alternatives={},
+                    split_index=split_idx,
+                    split_total=len(reference_words),
+                    reference_positions=reference_positions,
+                    length=1,  # Each split word is length 1
+                )
+            )
+        return corrections
+
+    @staticmethod
+    def create_word_combine_corrections(
+        original_words: List[str],
+        reference_word: str,
+        original_position: int,
+        source: str,
+        confidence: float,
+        combine_reason: str,
+        delete_reason: str,
+        reference_positions: Optional[Dict[str, int]] = None,
+    ) -> List[WordCorrection]:
+        """Creates corrections for combining multiple words into a single word."""
+        corrections = []
+
+        # First word gets replaced
+        corrections.append(
+            WordCorrection(
+                original_word=original_words[0],
+                corrected_word=reference_word,
+                segment_index=0,
+                original_position=original_position,
+                confidence=confidence,
+                source=source,
+                reason=combine_reason,
+                alternatives={},
+                reference_positions=reference_positions,
+                length=len(original_words),  # Combined word spans all original words
+            )
+        )
+
+        # Additional words get marked for deletion
+        for i, word in enumerate(original_words[1:], start=1):
+            corrections.append(
+                WordCorrection(
+                    original_word=word,
+                    corrected_word="",
+                    segment_index=0,
+                    original_position=original_position + i,
+                    confidence=confidence,
+                    source=source,
+                    reason=delete_reason,
+                    alternatives={},
+                    is_deletion=True,
+                    reference_positions=reference_positions,
+                    length=1,  # Deleted words are length 1
+                )
+            )
+
+        return corrections
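To make the WordCorrection bookkeeping above concrete, here is a minimal usage sketch of the split helper. The argument values (word, position, source name) are hypothetical; only the WordOperations API added in this diff is assumed.

from lyrics_transcriber.correction.handlers.word_operations import WordOperations

# Split one transcribed word ("wanna") into two reference words ("want", "to").
corrections = WordOperations.create_word_split_corrections(
    original_word="wanna",            # hypothetical transcription error
    reference_words=["want", "to"],   # words from the reference lyrics
    original_position=7,              # hypothetical index of "wanna" in the transcription
    source="genius",                  # hypothetical lyrics provider name
    confidence=0.9,
    reason="split word to match reference lyrics",
)

# Two WordCorrection objects come back, sharing original_position but
# carrying split_index 0 and 1 with split_total=2.
for c in corrections:
    print(c.original_word, "->", c.corrected_word, f"({c.split_index + 1}/{c.split_total})")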
lyrics_transcriber/correction/phrase_analyzer.py
@@ -0,0 +1,426 @@
+from typing import List
+import spacy
+from spacy.tokens import Doc
+import logging
+from lyrics_transcriber.correction.text_utils import clean_text
+from lyrics_transcriber.types import PhraseType, PhraseScore
+
+
+class PhraseAnalyzer:
+    """Language-agnostic phrase analyzer using spaCy"""
+
+    def __init__(self, logger: logging.Logger, language_code: str = "en_core_web_sm"):
+        """Initialize with specific language model and logger
+
+        Args:
+            logger: Logger instance to use for this analyzer
+            language_code: spaCy language model to use
+        """
+        self.logger = logger
+        self.logger.info(f"Initializing PhraseAnalyzer with language model: {language_code}")
+        try:
+            self.nlp = spacy.load(language_code)
+        except OSError:
+            self.logger.error(f"Failed to load language model: {language_code}")
+            raise OSError(
+                f"Language model '{language_code}' not found. Please install it with: python -m spacy download {language_code}"
+            )
+
+    def score_phrase(self, words: List[str], context: str) -> PhraseScore:
+        """Score a phrase based on grammatical completeness and natural breaks.
+
+        Args:
+            words: List of words in the phrase
+            context: Full text containing the phrase
+
+        Returns:
+            PhraseScore with phrase_type, natural_break_score, and length_score
+        """
+        # self.logger.info(f"Scoring phrase with context length {len(context)}: {' '.join(words)}")
+
+        phrase = " ".join(words)
+        phrase_doc = self.nlp(phrase)
+        context_doc = self.nlp(context)
+
+        # Get initial phrase type based on grammar
+        phrase_type = self._determine_phrase_type(phrase_doc)
+
+        # Calculate scores
+        break_score = self._calculate_break_score(phrase_doc, context_doc)
+        length_score = self._calculate_length_score(phrase_doc)
+
+        # If break score is 0 (crosses boundary), override to CROSS_BOUNDARY
+        if break_score == 0.0:
+            phrase_type = PhraseType.CROSS_BOUNDARY
+
+        return PhraseScore(phrase_type=phrase_type, natural_break_score=break_score, length_score=length_score)
+
+    def _determine_phrase_type(self, doc: Doc) -> PhraseType:
+        """Determine the grammatical type of a phrase using spaCy's linguistic analysis.
+
+        This method categorizes text into three types:
+        1. COMPLETE: A grammatically complete clause with subject and predicate
+           Examples: "I love you", "the cat sleeps"
+           - Subject (I, the cat) + Predicate (love you, sleeps)
+
+        2. PARTIAL: A valid but incomplete grammatical unit, which can be:
+           a) Noun phrase: A group of words with a noun as the head
+              Example: "the big cat"
+              - Determiner (the) + Adjective (big) + Noun (cat)
+
+           b) Verb phrase: A group of words with a verb as the head
+              Example: "running fast"
+              - Verb (running) + Adverb (fast)
+
+           c) Prepositional phrase: Starting with a preposition
+              Example: "in my heart"
+              - Preposition (in) + Noun phrase (my heart)
+
+           d) Adverb phrase: A group of words with an adverb as the head
+              Example: "très rapidement" (French: "very quickly")
+              - Adverb (très) + Adverb (rapidement)
+
+        3. CROSS_BOUNDARY: Invalid grammatical structure
+           Examples: "cat the big", "love but the"
+           - Words in unnatural order or incomplete structures
+
+        Args:
+            doc: spaCy Doc object containing the parsed text
+
+        Returns:
+            PhraseType: COMPLETE, PARTIAL, or CROSS_BOUNDARY
+        """
+        # self.logger.debug(f"Determining phrase type for: {doc.text}")
+
+        # First check if it's a complete clause
+        if self.is_complete_clause(doc):
+            return PhraseType.COMPLETE
+
+        # Check if it's a valid partial phrase
+        if (
+            self.is_valid_noun_phrase(doc)
+            or self.is_valid_verb_phrase(doc)
+            or self.is_valid_prep_phrase(doc)
+            or self.is_valid_adverb_phrase(doc)
+        ):
+            # Additional check: if the phrase crosses sentence boundaries,
+            # it should be CROSS_BOUNDARY even if it's grammatically valid
+            if "." in doc.text:  # Simple check for sentence boundary within phrase
+                return PhraseType.CROSS_BOUNDARY
+            return PhraseType.PARTIAL
+
+        return PhraseType.CROSS_BOUNDARY
+
+    def _calculate_break_score(self, phrase_doc: Doc, context_doc: Doc) -> float:
+        """Calculate how well the phrase respects natural breaks in the text.
+
+        Scores are based on alignment with line breaks and sentence boundaries:
+        1.0 - Perfect alignment (matches full line or sentence)
+        0.8-0.9 - Strong alignment (matches most of a natural unit)
+        0.5-0.7 - Partial alignment (matches start or end of unit)
+        0.0 - Poor alignment (crosses line/sentence boundary)
+
+        Examples from tests:
+        "my heart will go on" -> 1.0 (matches full line)
+        "go on and" -> 0.0 (crosses line break)
+        "Hello world" -> 1.0 (matches complete sentence)
+        "world How" -> 0.0 (crosses sentence boundary)
+        "I wake up" -> 0.85 (strong alignment with verb phrase)
+        """
+        # Clean both texts while preserving structure
+        phrase_text = clean_text(phrase_doc.text)
+        context_text = clean_text(context_doc.text)
+
+        # Find position in cleaned text
+        phrase_start = context_text.find(phrase_text)
+
+        if phrase_start == -1:
+            return 0.0
+
+        phrase_end = phrase_start + len(phrase_text)
+
+        # Check line breaks first
+        line_score = self.calculate_line_break_score(phrase_start, phrase_end, context_doc.text)
+        if line_score in {0.0, 1.0}:  # Perfect match or crossing boundary
+            return line_score
+
+        # Then check sentence boundaries
+        sentence_score = self.calculate_sentence_break_score(phrase_doc, phrase_start, phrase_end, context_doc)
+        if sentence_score in {0.0, 1.0}:  # Perfect match or crossing boundary
+            return sentence_score
+
+        # Return the higher of the two scores
+        return max(line_score, sentence_score)
+
+    def _calculate_length_score(self, doc: Doc) -> float:
+        """Calculate score based on phrase length and complexity.
+
+        Scores are based on the number of meaningful linguistic units:
+        - Noun chunks ("the big cat", "the mat")
+        - Verbs ("sleeps")
+        - Adverbial modifiers ("soundly")
+        - Prepositional phrases ("on the mat")
+
+        Scoring scale:
+        0.0 - No meaningful units
+        0.9 - One unit (e.g., "the cat")
+        1.0 - Two units (e.g., "the cat sleeps")
+        0.8 - Three units (e.g., "the big cat sleeps quickly")
+        0.6 - Four or more units (e.g., "the big cat sleeps soundly on the mat")
+
+        Examples from tests:
+        "the cat" -> 1 unit (noun chunk) -> 0.9
+        "the cat sleeps" -> 2 units (noun chunk + verb) -> 1.0
+        "the big cat sleeps soundly on the mat" -> 4 units (noun chunk + verb + adverb + prep phrase) -> 0.6
+        """
+        # self.logger.debug(f"Calculating length score for: {doc.text}")
+        # Count meaningful linguistic units
+        units = 0
+
+        # Count noun chunks
+        units += len(list(doc.noun_chunks))
+
+        # Count verbs
+        units += len([token for token in doc if token.pos_ == "VERB"])
+
+        # Count adverbial modifiers
+        units += len([token for token in doc if token.dep_ == "advmod"])
+
+        # Count prepositional phrases
+        units += len([token for token in doc if token.dep_ == "prep"])
+
+        # Score based on complexity
+        if units == 0:
+            return 0.0
+        elif units == 1:
+            return 0.9  # Simple phrase
+        elif units == 2:
+            return 1.0  # Optimal complexity
+        elif units == 3:
+            return 0.8  # Slightly complex
+        return 0.6  # Too complex
+
+    def is_complete_clause(self, doc: Doc) -> bool:
+        """Check if the text forms a complete clause.
+
+        Different languages mark subject-verb relationships differently:
+        English/French:
+        - Subject has nsubj/nsubjpass dependency
+        - Verb is ROOT
+
+        Spanish:
+        - Sometimes marks pronoun as ROOT
+        - Verb can be marked as flat/aux
+        """
+        # self.logger.debug(f"Checking if complete clause: {doc.text}")
+        # Standard subject-verb pattern (English/French)
+        standard_pattern = any(token.dep_ in {"nsubj", "nsubjpass"} for token in doc) and any(
+            token.dep_ == "ROOT" and token.pos_ == "VERB" for token in doc
+        )
+
+        # Spanish pronoun-verb pattern
+        spanish_pattern = (
+            len(doc) == 2  # Two-word phrase
+            and doc[0].pos_ == "PRON"  # First word is pronoun
+            and doc[1].pos_ in {"VERB", "AUX", "ADJ"}  # Second word is verb-like
+            and doc[1].dep_ in {"flat", "aux"}  # Common Spanish dependencies
+        )
+
+        return standard_pattern or spanish_pattern
+
+    def is_valid_noun_phrase(self, doc: Doc) -> bool:
+        """Check if the text is a valid noun phrase like "the big cat".
+
+        Valid noun phrases:
+        - "the cat" (determiner + noun)
+        - "the big cat" (determiner + adjective + noun)
+        - "my heart" (possessive + noun)
+        """
+        # self.logger.debug(f"Checking if valid noun phrase: {doc.text}")
+        chunks = list(doc.noun_chunks)
+        if not chunks:
+            return False
+
+        # The noun phrase should be the entire text
+        chunk = chunks[0]
+        if not (chunk.start == 0 and chunk.end == len(doc)):
+            return False
+
+        # Check for valid noun phrase structure
+        root_nouns = [t for t in doc if t.dep_ == "ROOT" and t.pos_ in {"NOUN", "PROPN"}]
+        compounds = [t for t in doc if t.dep_ == "compound"]
+
+        return len(root_nouns) == 1 and len(compounds) == 0
+
+    def is_valid_verb_phrase(self, doc: Doc) -> bool:
+        """Check if the text is a valid verb phrase like "running fast".
+
+        A verb phrase must:
+        1. Contain a verb as the first content word
+        2. Only use valid verb phrase dependencies
+        3. Have correct word order (verb before modifiers)
+        """
+        # self.logger.debug(f"Checking if valid verb phrase: {doc.text}")
+        VALID_DEPS = {
+            "ROOT",  # Main verb
+            "advmod",  # Adverbial modifier
+            "dobj",  # Direct object
+            "prt",  # Verb particle
+            "prep",  # Preposition
+            "pobj",  # Object of preposition
+            "compound:prt",  # Phrasal verb particle
+        }
+
+        # Find all verbs
+        verbs = [token for token in doc if token.pos_ == "VERB"]
+        if not verbs:
+            return False
+
+        # Check if first content word is a verb
+        content_words = [token for token in doc if token.pos_ not in {"DET", "PUNCT"}]
+        if not content_words or content_words[0].pos_ != "VERB":
+            return False
+
+        # Check dependencies
+        has_valid_deps = all(token.dep_ in VALID_DEPS for token in doc)
+        return has_valid_deps
+
+    def is_valid_prep_phrase(self, doc: Doc) -> bool:
+        """Check if the text is a valid prepositional phrase.
+
+        Examples:
+        - "in my heart" (English)
+        - "dans la maison" (French: "in the house")
+        - "en la casa" (Spanish: "in the house")
+        """
+        # self.logger.debug(f"Checking if valid prep phrase: {doc.text}")
+        starts_with_prep = doc[0].pos_ == "ADP"
+        has_content = len(doc) > 1
+        has_valid_structure = any(t.dep_ == "pobj" for t in doc) or (  # English style
+            doc[0].dep_ == "case" and any(t.dep_ == "ROOT" for t in doc)
+        )  # French/Spanish style
+
+        return starts_with_prep and has_content and has_valid_structure
+
+    def is_valid_adverb_phrase(self, doc: Doc) -> bool:
+        """Check if the text is a valid adverbial phrase.
+
+        Examples:
+        - "très rapidement" (French: "very quickly")
+        - "muy rápido" (Spanish: "very fast")
+        - "very quickly" (English)
+
+        Valid patterns:
+        - ADV + ADV/ADJ (modifier + main adverb/adjective)
+        - First word must modify second word
+        - Second word must be the root
+        """
+        # self.logger.debug(f"Checking if valid adverb phrase: {doc.text}")
+        # Check basic structure
+        if len(doc) != 2:  # Only handle two-word phrases for now
+            return False
+
+        # Check parts of speech
+        has_valid_pos = all(token.pos_ in {"ADV", "ADJ"} for token in doc)
+        if not has_valid_pos:
+            return False
+
+        first_word = doc[0]
+        second_word = doc[1]
+
+        # The first word must be a modifier
+        if first_word.dep_ != "advmod":
+            return False
+
+        # The second word must be the root
+        if second_word.dep_ != "ROOT":
+            return False
+
+        # Check that the first word modifies the second
+        if first_word.head != second_word:
+            return False
+
+        return True
+
+    def calculate_line_break_score(self, phrase_start: int, phrase_end: int, context_text: str) -> float:
+        """Calculate score based on line break alignment."""
+        # Clean the context text while preserving line breaks
+        cleaned_lines = [clean_text(line) for line in context_text.split("\n")]
+        cleaned_context = "\n".join(cleaned_lines)
+
+        # Track current position in cleaned context
+        current_pos = 0
+
+        # Recalculate positions using cleaned text
+        for line in cleaned_lines:
+            if not line:  # Skip empty lines
+                current_pos += 1  # Account for newline
+                continue
+
+            line_start = current_pos
+            line_end = line_start + len(line)
+
+            # Perfect match with a full line
+            if phrase_start == line_start and phrase_end == line_end:
+                return 1.0
+
+            # Strong alignment with start of line
+            if phrase_start == line_start:
+                coverage = (phrase_end - phrase_start) / len(line)
+                if coverage >= 0.7:
+                    return 0.9
+                elif coverage >= 0.3:
+                    return 0.8
+
+            # Strong alignment with end of line
+            if phrase_end == line_end:
+                coverage = (phrase_end - phrase_start) / len(line)
+                if coverage >= 0.7:
+                    return 0.9
+                elif coverage >= 0.3:
+                    return 0.8
+
+            # Update position for next line
+            current_pos = line_end + 1  # +1 for newline
+
+        # Check if phrase crosses any line boundary
+        if any(
+            phrase_start < cleaned_context.find("\n", i) < phrase_end for i in range(len(cleaned_context)) if "\n" in cleaned_context[i:]
+        ):
+            return 0.0
+
+        return 0.5
+
+    def calculate_sentence_break_score(self, phrase_doc: Doc, phrase_start: int, phrase_end: int, context_doc: Doc) -> float:
+        """Calculate score based on sentence boundary alignment."""
+        # self.logger.debug(f"Calculating sentence break score for: {phrase_doc.text}")
+        for sent in context_doc.sents:
+            sent_start = sent.start_char
+            sent_end = sent.end_char
+
+            # Perfect match with a full sentence
+            if phrase_start == sent_start and phrase_end == sent_end:
+                return 1.0
+
+            # Strong alignment with most of a sentence
+            if phrase_start >= sent_start and phrase_end <= sent_end:
+                has_verb = any(token.pos_ == "VERB" for token in phrase_doc)
+                has_subject = any(token.dep_ in {"nsubj", "nsubjpass"} for token in phrase_doc)
+
+                phrase_len = phrase_end - phrase_start
+                sent_len = sent_end - sent_start
+                coverage = phrase_len / sent_len
+
+                if has_verb and has_subject:
+                    return 0.85
+                elif has_verb and coverage > 0.3:
+                    return 0.8
+                elif coverage > 0.5:
+                    return 0.8
+                return 0.7
+
+        # Crosses sentence boundary
+        if any(phrase_start < s.start_char < phrase_end for s in context_doc.sents):
+            return 0.0
+
+        return 0.5
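A short, hedged sketch of how PhraseAnalyzer might be driven. The lyric text and logger setup are illustrative, and the default "en_core_web_sm" model must be installed (python -m spacy download en_core_web_sm); only the score_phrase API shown above is assumed.

import logging

from lyrics_transcriber.correction.phrase_analyzer import PhraseAnalyzer

logger = logging.getLogger("lyrics_transcriber")
analyzer = PhraseAnalyzer(logger)  # loads "en_core_web_sm" by default

# Hypothetical context: two lyric lines separated by a line break.
context = "my heart will go on\nand on forever more"

score = analyzer.score_phrase(["my", "heart", "will", "go", "on"], context)
print(score.phrase_type)          # PhraseType.COMPLETE for a subject + verb clause
print(score.natural_break_score)  # 1.0 when the phrase matches a full line
print(score.length_score)

# A phrase spanning the line break ("go on and" in the docstring examples)
# gets a break score of 0.0, which score_phrase overrides to CROSS_BOUNDARY.
crossing = analyzer.score_phrase(["go", "on", "and"], context)
print(crossing.phrase_type, crossing.natural_break_score)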
lyrics_transcriber/correction/text_utils.py
@@ -0,0 +1,30 @@
+import re
+
+
+def clean_text(text: str) -> str:
+    """Clean text by removing punctuation and normalizing whitespace.
+
+    Args:
+        text: Text to clean
+
+    Returns:
+        Cleaned text with:
+        - All text converted to lowercase
+        - Hyphens and slashes converted to spaces
+        - All other punctuation removed
+        - Multiple spaces/whitespace collapsed to single space
+        - Leading/trailing whitespace removed
+    """
+    # Convert to lowercase
+    text = text.lower()
+
+    # Replace hyphens and slashes with spaces first
+    text = text.replace("-", " ").replace("/", " ")
+
+    # Remove remaining punctuation
+    text = re.sub(r"[^\w\s]", "", text)
+
+    # Normalize whitespace (collapse multiple spaces, remove leading/trailing)
+    text = " ".join(text.split())
+
+    return text
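Since clean_text is a small deterministic function, its behavior can be pinned down with a few worked examples; the input strings are illustrative, and the expected outputs follow directly from the steps above.

from lyrics_transcriber.correction.text_utils import clean_text

assert clean_text("Don't stop believin'!") == "dont stop believin"          # punctuation stripped
assert clean_text("Rock-'n'-roll / all night") == "rock n roll all night"   # hyphens/slashes become spaces
assert clean_text("  Hello,   world!  ") == "hello world"                   # whitespace normalized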