lyrics-transcriber 0.30.0-py3-none-any.whl → 0.32.1-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
- lyrics_transcriber/__init__.py +2 -1
- lyrics_transcriber/cli/{main.py → cli_main.py} +47 -14
- lyrics_transcriber/core/config.py +35 -0
- lyrics_transcriber/core/controller.py +164 -166
- lyrics_transcriber/correction/anchor_sequence.py +471 -0
- lyrics_transcriber/correction/corrector.py +256 -0
- lyrics_transcriber/correction/handlers/__init__.py +0 -0
- lyrics_transcriber/correction/handlers/base.py +30 -0
- lyrics_transcriber/correction/handlers/extend_anchor.py +91 -0
- lyrics_transcriber/correction/handlers/levenshtein.py +147 -0
- lyrics_transcriber/correction/handlers/no_space_punct_match.py +98 -0
- lyrics_transcriber/correction/handlers/relaxed_word_count_match.py +55 -0
- lyrics_transcriber/correction/handlers/repeat.py +71 -0
- lyrics_transcriber/correction/handlers/sound_alike.py +223 -0
- lyrics_transcriber/correction/handlers/syllables_match.py +182 -0
- lyrics_transcriber/correction/handlers/word_count_match.py +54 -0
- lyrics_transcriber/correction/handlers/word_operations.py +135 -0
- lyrics_transcriber/correction/phrase_analyzer.py +426 -0
- lyrics_transcriber/correction/text_utils.py +30 -0
- lyrics_transcriber/lyrics/base_lyrics_provider.py +125 -0
- lyrics_transcriber/lyrics/genius.py +73 -0
- lyrics_transcriber/lyrics/spotify.py +82 -0
- lyrics_transcriber/output/ass/__init__.py +21 -0
- lyrics_transcriber/output/{ass.py → ass/ass.py} +150 -690
- lyrics_transcriber/output/ass/ass_specs.txt +732 -0
- lyrics_transcriber/output/ass/config.py +37 -0
- lyrics_transcriber/output/ass/constants.py +23 -0
- lyrics_transcriber/output/ass/event.py +94 -0
- lyrics_transcriber/output/ass/formatters.py +132 -0
- lyrics_transcriber/output/ass/lyrics_line.py +219 -0
- lyrics_transcriber/output/ass/lyrics_screen.py +252 -0
- lyrics_transcriber/output/ass/section_detector.py +89 -0
- lyrics_transcriber/output/ass/section_screen.py +106 -0
- lyrics_transcriber/output/ass/style.py +187 -0
- lyrics_transcriber/output/cdg.py +503 -0
- lyrics_transcriber/output/cdgmaker/__init__.py +0 -0
- lyrics_transcriber/output/cdgmaker/cdg.py +262 -0
- lyrics_transcriber/output/cdgmaker/composer.py +1919 -0
- lyrics_transcriber/output/cdgmaker/config.py +151 -0
- lyrics_transcriber/output/cdgmaker/images/instrumental.png +0 -0
- lyrics_transcriber/output/cdgmaker/images/intro.png +0 -0
- lyrics_transcriber/output/cdgmaker/pack.py +507 -0
- lyrics_transcriber/output/cdgmaker/render.py +346 -0
- lyrics_transcriber/output/cdgmaker/transitions/centertexttoplogobottomtext.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/circlein.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/circleout.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/fizzle.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/largecentertexttoplogo.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/rectangle.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/spiral.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/topleftmusicalnotes.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/wipein.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/wipeleft.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/wipeout.png +0 -0
- lyrics_transcriber/output/cdgmaker/transitions/wiperight.png +0 -0
- lyrics_transcriber/output/cdgmaker/utils.py +132 -0
- lyrics_transcriber/output/fonts/AvenirNext-Bold.ttf +0 -0
- lyrics_transcriber/output/fonts/DMSans-VariableFont_opsz,wght.ttf +0 -0
- lyrics_transcriber/output/fonts/DMSerifDisplay-Regular.ttf +0 -0
- lyrics_transcriber/output/fonts/Oswald-SemiBold.ttf +0 -0
- lyrics_transcriber/output/fonts/Zurich_Cn_BT_Bold.ttf +0 -0
- lyrics_transcriber/output/fonts/arial.ttf +0 -0
- lyrics_transcriber/output/fonts/georgia.ttf +0 -0
- lyrics_transcriber/output/fonts/verdana.ttf +0 -0
- lyrics_transcriber/output/generator.py +140 -171
- lyrics_transcriber/output/lyrics_file.py +102 -0
- lyrics_transcriber/output/plain_text.py +91 -0
- lyrics_transcriber/output/segment_resizer.py +416 -0
- lyrics_transcriber/output/subtitles.py +328 -302
- lyrics_transcriber/output/video.py +219 -0
- lyrics_transcriber/review/__init__.py +1 -0
- lyrics_transcriber/review/server.py +138 -0
- lyrics_transcriber/storage/dropbox.py +110 -134
- lyrics_transcriber/transcribers/audioshake.py +171 -105
- lyrics_transcriber/transcribers/base_transcriber.py +149 -0
- lyrics_transcriber/transcribers/whisper.py +267 -133
- lyrics_transcriber/types.py +454 -0
- {lyrics_transcriber-0.30.0.dist-info → lyrics_transcriber-0.32.1.dist-info}/METADATA +14 -3
- lyrics_transcriber-0.32.1.dist-info/RECORD +86 -0
- {lyrics_transcriber-0.30.0.dist-info → lyrics_transcriber-0.32.1.dist-info}/WHEEL +1 -1
- lyrics_transcriber-0.32.1.dist-info/entry_points.txt +4 -0
- lyrics_transcriber/core/corrector.py +0 -56
- lyrics_transcriber/core/fetcher.py +0 -143
- lyrics_transcriber/storage/tokens.py +0 -116
- lyrics_transcriber/transcribers/base.py +0 -31
- lyrics_transcriber-0.30.0.dist-info/RECORD +0 -22
- lyrics_transcriber-0.30.0.dist-info/entry_points.txt +0 -3
- {lyrics_transcriber-0.30.0.dist-info → lyrics_transcriber-0.32.1.dist-info}/LICENSE +0 -0
--- /dev/null
+++ b/lyrics_transcriber/correction/handlers/word_operations.py
@@ -0,0 +1,135 @@
+from typing import List, Optional, Dict
+from lyrics_transcriber.types import WordCorrection, GapSequence
+
+
+class WordOperations:
+    """Utility class for common word manipulation operations used by correction handlers."""
+
+    @staticmethod
+    def calculate_reference_positions(gap: GapSequence, sources: Optional[List[str]] = None) -> Dict[str, int]:
+        """Calculate reference positions for given sources based on preceding anchor.
+
+        Args:
+            gap: The gap sequence containing the preceding anchor
+            sources: Optional list of sources to calculate positions for. If None, uses all sources.
+
+        Returns:
+            Dictionary mapping source names to their reference positions
+        """
+        reference_positions = {}
+        if gap.preceding_anchor:
+            # If no sources specified, use all sources from reference words
+            sources_to_check = sources or list(gap.reference_words.keys())
+
+            for source in sources_to_check:
+                if source in gap.preceding_anchor.reference_positions:
+                    # Calculate position based on anchor position and offset
+                    anchor_pos = gap.preceding_anchor.reference_positions[source]
+                    ref_pos = anchor_pos + len(gap.preceding_anchor.words)
+                    reference_positions[source] = ref_pos
+        return reference_positions
+
+    @staticmethod
+    def create_word_replacement_correction(
+        original_word: str,
+        corrected_word: str,
+        original_position: int,
+        source: str,
+        confidence: float,
+        reason: str,
+        reference_positions: Optional[Dict[str, int]] = None,
+    ) -> WordCorrection:
+        """Creates a correction for replacing a single word with another word."""
+        return WordCorrection(
+            original_word=original_word,
+            corrected_word=corrected_word,
+            segment_index=0,
+            original_position=original_position,
+            confidence=confidence,
+            source=source,
+            reason=reason,
+            alternatives={},
+            reference_positions=reference_positions,
+            length=1,  # Single word replacement
+        )
+
+    @staticmethod
+    def create_word_split_corrections(
+        original_word: str,
+        reference_words: List[str],
+        original_position: int,
+        source: str,
+        confidence: float,
+        reason: str,
+        reference_positions: Optional[Dict[str, int]] = None,
+    ) -> List[WordCorrection]:
+        """Creates corrections for splitting a single word into multiple words."""
+        corrections = []
+        for split_idx, ref_word in enumerate(reference_words):
+            corrections.append(
+                WordCorrection(
+                    original_word=original_word,
+                    corrected_word=ref_word,
+                    segment_index=0,
+                    original_position=original_position,
+                    confidence=confidence,
+                    source=source,
+                    reason=reason,
+                    alternatives={},
+                    split_index=split_idx,
+                    split_total=len(reference_words),
+                    reference_positions=reference_positions,
+                    length=1,  # Each split word is length 1
+                )
+            )
+        return corrections
+
+    @staticmethod
+    def create_word_combine_corrections(
+        original_words: List[str],
+        reference_word: str,
+        original_position: int,
+        source: str,
+        confidence: float,
+        combine_reason: str,
+        delete_reason: str,
+        reference_positions: Optional[Dict[str, int]] = None,
+    ) -> List[WordCorrection]:
+        """Creates corrections for combining multiple words into a single word."""
+        corrections = []
+
+        # First word gets replaced
+        corrections.append(
+            WordCorrection(
+                original_word=original_words[0],
+                corrected_word=reference_word,
+                segment_index=0,
+                original_position=original_position,
+                confidence=confidence,
+                source=source,
+                reason=combine_reason,
+                alternatives={},
+                reference_positions=reference_positions,
+                length=len(original_words),  # Combined word spans all original words
+            )
+        )
+
+        # Additional words get marked for deletion
+        for i, word in enumerate(original_words[1:], start=1):
+            corrections.append(
+                WordCorrection(
+                    original_word=word,
+                    corrected_word="",
+                    segment_index=0,
+                    original_position=original_position + i,
+                    confidence=confidence,
+                    source=source,
+                    reason=delete_reason,
+                    alternatives={},
+                    is_deletion=True,
+                    reference_positions=reference_positions,
+                    length=1,  # Deleted words are length 1
+                )
+            )
+
+        return corrections
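The three `create_*` helpers above differ only in how many `WordCorrection` objects they emit: one for a replacement, one per reference word for a split, and one replacement plus one deletion per extra word for a combine. A minimal usage sketch (not part of the diff; the gap scenario, position, and source name are invented for illustration):

```python
from lyrics_transcriber.correction.handlers.word_operations import WordOperations

# Hypothetical scenario: the transcription heard "gonna" at position 7,
# but the reference lyrics from the "genius" source read "going to".
corrections = WordOperations.create_word_split_corrections(
    original_word="gonna",
    reference_words=["going", "to"],
    original_position=7,
    source="genius",
    confidence=0.9,
    reason="split word to match reference lyrics",
)

for c in corrections:
    # Each split word keeps its index and the total split size so that
    # downstream consumers can reassemble the one-to-many mapping.
    print(c.corrected_word, c.split_index, c.split_total)  # going 0 2 / to 1 2
```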
--- /dev/null
+++ b/lyrics_transcriber/correction/phrase_analyzer.py
@@ -0,0 +1,426 @@
+from typing import List
+import spacy
+from spacy.tokens import Doc
+import logging
+from lyrics_transcriber.correction.text_utils import clean_text
+from lyrics_transcriber.types import PhraseType, PhraseScore
+
+
+class PhraseAnalyzer:
+    """Language-agnostic phrase analyzer using spaCy"""
+
+    def __init__(self, logger: logging.Logger, language_code: str = "en_core_web_sm"):
+        """Initialize with specific language model and logger
+
+        Args:
+            logger: Logger instance to use for this analyzer
+            language_code: spaCy language model to use
+        """
+        self.logger = logger
+        self.logger.info(f"Initializing PhraseAnalyzer with language model: {language_code}")
+        try:
+            self.nlp = spacy.load(language_code)
+        except OSError:
+            self.logger.error(f"Failed to load language model: {language_code}")
+            raise OSError(
+                f"Language model '{language_code}' not found. " f"Please install it with: python -m spacy download {language_code}"
+            )
+
+    def score_phrase(self, words: List[str], context: str) -> PhraseScore:
+        """Score a phrase based on grammatical completeness and natural breaks.
+
+        Args:
+            words: List of words in the phrase
+            context: Full text containing the phrase
+
+        Returns:
+            PhraseScore with phrase_type, natural_break_score, and length_score
+        """
+        # self.logger.info(f"Scoring phrase with context length {len(context)}: {' '.join(words)}")
+
+        phrase = " ".join(words)
+        phrase_doc = self.nlp(phrase)
+        context_doc = self.nlp(context)
+
+        # Get initial phrase type based on grammar
+        phrase_type = self._determine_phrase_type(phrase_doc)
+
+        # Calculate scores
+        break_score = self._calculate_break_score(phrase_doc, context_doc)
+        length_score = self._calculate_length_score(phrase_doc)
+
+        # If break score is 0 (crosses boundary), override to CROSS_BOUNDARY
+        if break_score == 0.0:
+            phrase_type = PhraseType.CROSS_BOUNDARY
+
+        return PhraseScore(phrase_type=phrase_type, natural_break_score=break_score, length_score=length_score)
+
+    def _determine_phrase_type(self, doc: Doc) -> PhraseType:
+        """Determine the grammatical type of a phrase using SpaCy's linguistic analysis.
+
+        This method categorizes text into three types:
+        1. COMPLETE: A grammatically complete clause with subject and predicate
+           Examples: "I love you", "the cat sleeps"
+           - Subject (I, the cat) + Predicate (love you, sleeps)
+
+        2. PARTIAL: A valid but incomplete grammatical unit, which can be:
+           a) Noun phrase: A group of words with a noun as the head
+              Example: "the big cat"
+              - Determiner (the) + Adjective (big) + Noun (cat)
+
+           b) Verb phrase: A group of words with a verb as the head
+              Example: "running fast"
+              - Verb (running) + Adverb (fast)
+
+           c) Prepositional phrase: Starting with a preposition
+              Example: "in my heart"
+              - Preposition (in) + Noun phrase (my heart)
+
+           d) Adverb phrase: A group of words with an adverb as the head
+              Example: "très rapidement" (French: "very quickly")
+              - Adverb (très) + Adverb (rapidement)
+
+        3. CROSS_BOUNDARY: Invalid grammatical structure
+           Examples: "cat the big", "love but the"
+           - Words in unnatural order or incomplete structures
+
+        Args:
+            doc: SpaCy Doc object containing the parsed text
+
+        Returns:
+            PhraseType: COMPLETE, PARTIAL, or CROSS_BOUNDARY
+        """
+        # self.logger.debug(f"Determining phrase type for: {doc.text}")
+
+        # First check if it's a complete clause
+        if self.is_complete_clause(doc):
+            return PhraseType.COMPLETE
+
+        # Check if it's a valid partial phrase
+        if (
+            self.is_valid_noun_phrase(doc)
+            or self.is_valid_verb_phrase(doc)
+            or self.is_valid_prep_phrase(doc)
+            or self.is_valid_adverb_phrase(doc)
+        ):
+            # Additional check: if the phrase crosses sentence boundaries,
+            # it should be CROSS_BOUNDARY even if it's grammatically valid
+            if "." in doc.text:  # Simple check for sentence boundary within phrase
+                return PhraseType.CROSS_BOUNDARY
+            return PhraseType.PARTIAL
+
+        return PhraseType.CROSS_BOUNDARY
+
+    def _calculate_break_score(self, phrase_doc: Doc, context_doc: Doc) -> float:
+        """Calculate how well the phrase respects natural breaks in the text.
+
+        Scores are based on alignment with line breaks and sentence boundaries:
+        1.0 - Perfect alignment (matches full line or sentence)
+        0.8-0.9 - Strong alignment (matches most of a natural unit)
+        0.5-0.7 - Partial alignment (matches start or end of unit)
+        0.0 - Poor alignment (crosses line/sentence boundary)
+
+        Examples from tests:
+            "my heart will go on" -> 1.0 (matches full line)
+            "go on and" -> 0.0 (crosses line break)
+            "Hello world" -> 1.0 (matches complete sentence)
+            "world How" -> 0.0 (crosses sentence boundary)
+            "I wake up" -> 0.85 (strong alignment with verb phrase)
+        """
+        # Clean both texts while preserving structure
+        phrase_text = clean_text(phrase_doc.text)
+        context_text = clean_text(context_doc.text)
+
+        # Find position in cleaned text
+        phrase_start = context_text.find(phrase_text)
+
+        if phrase_start == -1:
+            return 0.0
+
+        phrase_end = phrase_start + len(phrase_text)
+
+        # Check line breaks first
+        line_score = self.calculate_line_break_score(phrase_start, phrase_end, context_doc.text)
+        if line_score in {0.0, 1.0}:  # Perfect match or crossing boundary
+            return line_score
+
+        # Then check sentence boundaries
+        sentence_score = self.calculate_sentence_break_score(phrase_doc, phrase_start, phrase_end, context_doc)
+        if sentence_score in {0.0, 1.0}:  # Perfect match or crossing boundary
+            return sentence_score
+
+        # Return the higher of the two scores
+        return max(line_score, sentence_score)
+
+    def _calculate_length_score(self, doc: Doc) -> float:
+        """Calculate score based on phrase length and complexity.
+
+        Scores are based on the number of meaningful linguistic units:
+        - Noun chunks ("the big cat", "the mat")
+        - Verbs ("sleeps")
+        - Adverbial modifiers ("soundly")
+        - Prepositional phrases ("on the mat")
+
+        Scoring scale:
+        0.0 - No meaningful units
+        0.9 - One unit (e.g., "the cat")
+        1.0 - Two units (e.g., "the cat sleeps")
+        0.8 - Three units (e.g., "the big cat sleeps quickly")
+        0.6 - Four or more units (e.g., "the big cat sleeps soundly on the mat")
+
+        Examples from tests:
+            "the cat" -> 1 unit (noun chunk) -> 0.9
+            "the cat sleeps" -> 2 units (noun chunk + verb) -> 1.0
+            "the big cat sleeps soundly on the mat" -> 4 units (noun chunk + verb + adverb + prep phrase) -> 0.6
+        """
+        # self.logger.debug(f"Calculating length score for: {doc.text}")
+        # Count meaningful linguistic units
+        units = 0
+
+        # Count noun chunks
+        units += len(list(doc.noun_chunks))
+
+        # Count verbs
+        units += len([token for token in doc if token.pos_ == "VERB"])
+
+        # Count adverbial modifiers
+        units += len([token for token in doc if token.dep_ == "advmod"])
+
+        # Count prepositional phrases
+        units += len([token for token in doc if token.dep_ == "prep"])
+
+        # Score based on complexity
+        if units == 0:
+            return 0.0
+        elif units == 1:
+            return 0.9  # Simple phrase
+        elif units == 2:
+            return 1.0  # Optimal complexity
+        elif units == 3:
+            return 0.8  # Slightly complex
+        return 0.6  # Too complex
+
+    def is_complete_clause(self, doc: Doc) -> bool:
+        """Check if the text forms a complete clause.
+
+        Different languages mark subject-verb relationships differently:
+        English/French:
+        - Subject has nsubj/nsubjpass dependency
+        - Verb is ROOT
+
+        Spanish:
+        - Sometimes marks pronoun as ROOT
+        - Verb can be marked as flat/aux
+        """
+        # self.logger.debug(f"Checking if complete clause: {doc.text}")
+        # Standard subject-verb pattern (English/French)
+        standard_pattern = any(token.dep_ in {"nsubj", "nsubjpass"} for token in doc) and any(
+            token.dep_ == "ROOT" and token.pos_ == "VERB" for token in doc
+        )
+
+        # Spanish pronoun-verb pattern
+        spanish_pattern = (
+            len(doc) == 2  # Two-word phrase
+            and doc[0].pos_ == "PRON"  # First word is pronoun
+            and doc[1].pos_ in {"VERB", "AUX", "ADJ"}  # Second word is verb-like
+            and doc[1].dep_ in {"flat", "aux"}  # Common Spanish dependencies
+        )
+
+        return standard_pattern or spanish_pattern
+
+    def is_valid_noun_phrase(self, doc: Doc) -> bool:
+        """Check if the text is a valid noun phrase like "the big cat".
+
+        Valid noun phrases:
+        - "the cat" (determiner + noun)
+        - "the big cat" (determiner + adjective + noun)
+        - "my heart" (possessive + noun)
+        """
+        # self.logger.debug(f"Checking if valid noun phrase: {doc.text}")
+        chunks = list(doc.noun_chunks)
+        if not chunks:
+            return False
+
+        # The noun phrase should be the entire text
+        chunk = chunks[0]
+        if not (chunk.start == 0 and chunk.end == len(doc)):
+            return False
+
+        # Check for valid noun phrase structure
+        root_nouns = [t for t in doc if t.dep_ == "ROOT" and t.pos_ in {"NOUN", "PROPN"}]
+        compounds = [t for t in doc if t.dep_ == "compound"]
+
+        return len(root_nouns) == 1 and len(compounds) == 0
+
+    def is_valid_verb_phrase(self, doc: Doc) -> bool:
+        """Check if the text is a valid verb phrase like "running fast".
+
+        A verb phrase must:
+        1. Contain a verb as the first content word
+        2. Only use valid verb phrase dependencies
+        3. Have correct word order (verb before modifiers)
+        """
+        # self.logger.debug(f"Checking if valid verb phrase: {doc.text}")
+        VALID_DEPS = {
+            "ROOT",  # Main verb
+            "advmod",  # Adverbial modifier
+            "dobj",  # Direct object
+            "prt",  # Verb particle
+            "prep",  # Preposition
+            "pobj",  # Object of preposition
+            "compound:prt",  # Phrasal verb particle
+        }
+
+        # Find all verbs
+        verbs = [token for token in doc if token.pos_ == "VERB"]
+        if not verbs:
+            return False
+
+        # Check if first content word is a verb
+        content_words = [token for token in doc if token.pos_ not in {"DET", "PUNCT"}]
+        if not content_words or content_words[0].pos_ != "VERB":
+            return False
+
+        # Check dependencies
+        has_valid_deps = all(token.dep_ in VALID_DEPS for token in doc)
+        return has_valid_deps
+
+    def is_valid_prep_phrase(self, doc: Doc) -> bool:
+        """Check if the text is a valid prepositional phrase.
+
+        Examples:
+        - "in my heart" (English)
+        - "dans la maison" (French: "in the house")
+        - "en la casa" (Spanish: "in the house")
+        """
+        # self.logger.debug(f"Checking if valid prep phrase: {doc.text}")
+        starts_with_prep = doc[0].pos_ == "ADP"
+        has_content = len(doc) > 1
+        has_valid_structure = any(t.dep_ == "pobj" for t in doc) or (  # English style
+            doc[0].dep_ == "case" and any(t.dep_ == "ROOT" for t in doc)
+        )  # French/Spanish style
+
+        return starts_with_prep and has_content and has_valid_structure
+
+    def is_valid_adverb_phrase(self, doc: Doc) -> bool:
+        """Check if the text is a valid adverbial phrase.
+
+        Examples:
+        - "très rapidement" (French: "very quickly")
+        - "muy rápido" (Spanish: "very fast")
+        - "very quickly" (English)
+
+        Valid patterns:
+        - ADV + ADV/ADJ (modifier + main adverb/adjective)
+        - First word must modify second word
+        - Second word must be the root
+        """
+        # self.logger.debug(f"Checking if valid adverb phrase: {doc.text}")
+        # Check basic structure
+        if len(doc) != 2:  # Only handle two-word phrases for now
+            return False
+
+        # Check parts of speech
+        has_valid_pos = all(token.pos_ in {"ADV", "ADJ"} for token in doc)
+        if not has_valid_pos:
+            return False
+
+        first_word = doc[0]
+        second_word = doc[1]
+
+        # The first word must be a modifier
+        if first_word.dep_ != "advmod":
+            return False
+
+        # The second word must be the root
+        if second_word.dep_ != "ROOT":
+            return False
+
+        # Check that the first word modifies the second
+        if first_word.head != second_word:
+            return False
+
+        return True
+
+    def calculate_line_break_score(self, phrase_start: int, phrase_end: int, context_text: str) -> float:
+        """Calculate score based on line break alignment."""
+        # Clean the context text while preserving line breaks
+        cleaned_lines = [clean_text(line) for line in context_text.split("\n")]
+        cleaned_context = "\n".join(cleaned_lines)
+
+        # Track current position in cleaned context
+        current_pos = 0
+
+        # Recalculate positions using cleaned text
+        for line in cleaned_lines:
+            if not line:  # Skip empty lines
+                current_pos += 1  # Account for newline
+                continue
+
+            line_start = current_pos
+            line_end = line_start + len(line)
+
+            # Perfect match with a full line
+            if phrase_start == line_start and phrase_end == line_end:
+                return 1.0
+
+            # Strong alignment with start of line
+            if phrase_start == line_start:
+                coverage = (phrase_end - phrase_start) / len(line)
+                if coverage >= 0.7:
+                    return 0.9
+                elif coverage >= 0.3:
+                    return 0.8
+
+            # Strong alignment with end of line
+            if phrase_end == line_end:
+                coverage = (phrase_end - phrase_start) / len(line)
+                if coverage >= 0.7:
+                    return 0.9
+                elif coverage >= 0.3:
+                    return 0.8
+
+            # Update position for next line
+            current_pos = line_end + 1  # +1 for newline
+
+        # Check if phrase crosses any line boundary
+        if any(
+            phrase_start < cleaned_context.find("\n", i) < phrase_end for i in range(len(cleaned_context)) if "\n" in cleaned_context[i:]
+        ):
+            return 0.0
+
+        return 0.5
+
+    def calculate_sentence_break_score(self, phrase_doc: Doc, phrase_start: int, phrase_end: int, context_doc: Doc) -> float:
+        """Calculate score based on sentence boundary alignment."""
+        # self.logger.debug(f"Calculating sentence break score for: {phrase_doc.text}")
+        for sent in context_doc.sents:
+            sent_start = sent.start_char
+            sent_end = sent.end_char
+
+            # Perfect match with a full sentence
+            if phrase_start == sent_start and phrase_end == sent_end:
+                return 1.0
+
+            # Strong alignment with most of a sentence
+            if phrase_start >= sent_start and phrase_end <= sent_end:
+                has_verb = any(token.pos_ == "VERB" for token in phrase_doc)
+                has_subject = any(token.dep_ in {"nsubj", "nsubjpass"} for token in phrase_doc)
+
+                phrase_len = phrase_end - phrase_start
+                sent_len = sent_end - sent_start
+                coverage = phrase_len / sent_len
+
+                if has_verb and has_subject:
+                    return 0.85
+                elif has_verb and coverage > 0.3:
+                    return 0.8
+                elif coverage > 0.5:
+                    return 0.8
+                return 0.7
+
+        # Crosses sentence boundary
+        if any(phrase_start < s.start_char < phrase_end for s in context_doc.sents):
+            return 0.0
+
+        return 0.5
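Taken together, `score_phrase` combines three signals: the grammatical type from `_determine_phrase_type`, boundary alignment from `_calculate_break_score`, and unit-count complexity from `_calculate_length_score`. A sketch of how a caller might drive it, assuming the default `en_core_web_sm` model is installed (the lyric text is invented; the expected values follow the docstring examples above):

```python
import logging
from lyrics_transcriber.correction.phrase_analyzer import PhraseAnalyzer

analyzer = PhraseAnalyzer(logging.getLogger(__name__))  # loads en_core_web_sm

context = "my heart will go on\nand the night goes on"
score = analyzer.score_phrase(["my", "heart", "will", "go", "on"], context)

# Matching a full line yields natural_break_score == 1.0, while a phrase
# that crosses the line break (e.g. ["go", "on", "and"]) scores 0.0 and
# is forced to PhraseType.CROSS_BOUNDARY.
print(score.phrase_type, score.natural_break_score, score.length_score)
```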
--- /dev/null
+++ b/lyrics_transcriber/correction/text_utils.py
@@ -0,0 +1,30 @@
+import re
+
+
+def clean_text(text: str) -> str:
+    """Clean text by removing punctuation and normalizing whitespace.
+
+    Args:
+        text: Text to clean
+
+    Returns:
+        Cleaned text with:
+        - All text converted to lowercase
+        - Hyphens and slashes converted to spaces
+        - All other punctuation removed
+        - Multiple spaces/whitespace collapsed to single space
+        - Leading/trailing whitespace removed
+    """
+    # Convert to lowercase
+    text = text.lower()
+
+    # Replace hyphens and slashes with spaces first
+    text = text.replace("-", " ").replace("/", " ")
+
+    # Remove remaining punctuation
+    text = re.sub(r"[^\w\s]", "", text)
+
+    # Normalize whitespace (collapse multiple spaces, remove leading/trailing)
+    text = " ".join(text.split())
+
+    return text
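`clean_text` is the normalization primitive the phrase analyzer relies on when comparing phrases against their context. Its behavior is fully determined by the code above; only the sample strings below are made up:

```python
from lyrics_transcriber.correction.text_utils import clean_text

assert clean_text("Hey-Now/Then!!") == "hey now then"          # separators -> spaces, punctuation dropped
assert clean_text("  So   many\tspaces ") == "so many spaces"  # whitespace collapsed and trimmed
```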