karaoke-gen 0.71.27__py3-none-any.whl → 0.71.42__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- karaoke_gen/file_handler.py +192 -0
- karaoke_gen/instrumental_review/__init__.py +45 -0
- karaoke_gen/instrumental_review/analyzer.py +408 -0
- karaoke_gen/instrumental_review/editor.py +322 -0
- karaoke_gen/instrumental_review/models.py +171 -0
- karaoke_gen/instrumental_review/server.py +1181 -0
- karaoke_gen/instrumental_review/waveform.py +409 -0
- karaoke_gen/utils/cli_args.py +5 -0
- karaoke_gen/utils/gen_cli.py +186 -0
- karaoke_gen/utils/remote_cli.py +629 -69
- {karaoke_gen-0.71.27.dist-info → karaoke_gen-0.71.42.dist-info}/METADATA +4 -1
- {karaoke_gen-0.71.27.dist-info → karaoke_gen-0.71.42.dist-info}/RECORD +16 -10
- lyrics_transcriber/correction/anchor_sequence.py +226 -350
- {karaoke_gen-0.71.27.dist-info → karaoke_gen-0.71.42.dist-info}/WHEEL +0 -0
- {karaoke_gen-0.71.27.dist-info → karaoke_gen-0.71.42.dist-info}/entry_points.txt +0 -0
- {karaoke_gen-0.71.27.dist-info → karaoke_gen-0.71.42.dist-info}/licenses/LICENSE +0 -0

lyrics_transcriber/correction/anchor_sequence.py
@@ -1,9 +1,8 @@
 import threading
 import time
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Set, Tuple, Union
 import logging
 from tqdm import tqdm
-from multiprocessing import Pool, cpu_count
 from functools import partial
 from pathlib import Path
 import json
@@ -64,8 +63,70 @@ class AnchorSequenceFinder:
         # self.logger.debug(f"_find_ngrams called with {len(words)} words, n={n}")
         return [(words[i : i + n], i) for i in range(len(words) - n + 1)]
 
+    def _build_ngram_index(
+        self,
+        references: Dict[str, List[str]],
+        n: int
+    ) -> Dict[Tuple[str, ...], Dict[str, List[int]]]:
+        """
+        Build a hash-based index mapping n-grams to their positions in each reference.
+
+        Args:
+            references: Dict mapping source names to lists of cleaned words
+            n: The n-gram length to index
+
+        Returns:
+            Dict mapping n-gram tuples to {source: [positions]} dicts
+        """
+        index: Dict[Tuple[str, ...], Dict[str, List[int]]] = {}
+
+        for source, words in references.items():
+            for i in range(len(words) - n + 1):
+                ngram_tuple = tuple(words[i:i + n])
+                if ngram_tuple not in index:
+                    index[ngram_tuple] = {}
+                if source not in index[ngram_tuple]:
+                    index[ngram_tuple][source] = []
+                index[ngram_tuple][source].append(i)
+
+        return index
+
+    def _find_matching_sources_indexed(
+        self,
+        ngram: List[str],
+        ngram_index: Dict[Tuple[str, ...], Dict[str, List[int]]]
+    ) -> Dict[str, int]:
+        """
+        Find which sources contain the given n-gram using pre-built index (O(1) lookup).
+
+        Args:
+            ngram: List of words to find
+            ngram_index: Pre-built index from _build_ngram_index()
+
+        Returns:
+            Dict mapping source names to first unused position
+        """
+        matches = {}
+        ngram_tuple = tuple(ngram)
+
+        # O(1) lookup in the index
+        if ngram_tuple not in ngram_index:
+            return matches
+
+        source_positions = ngram_index[ngram_tuple]
+
+        # For each source that contains this n-gram, find first unused position
+        for source, positions in source_positions.items():
+            used = self.used_positions.get(source, set())
+            for pos in positions:
+                if pos not in used:
+                    matches[source] = pos
+                    break
+
+        return matches
+
     def _find_matching_sources(self, ngram: List[str], references: Dict[str, List[str]], n: int) -> Dict[str, int]:
-        """Find which sources contain the given n-gram and at what positions."""
+        """Find which sources contain the given n-gram and at what positions (legacy O(n) method)."""
         # self.logger.debug(f"_find_matching_sources called for ngram: '{' '.join(ngram)}'")
         matches = {}
         all_positions = {source: [] for source in references}
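
The new `_build_ngram_index` / `_find_matching_sources_indexed` pair above replaces repeated linear scans of every reference source with a single dictionary keyed by n-gram tuples. A minimal standalone sketch of the same idea, using toy data (the source names and lyrics are illustrative, not taken from the package):

```python
from typing import Dict, List, Tuple

def build_ngram_index(references: Dict[str, List[str]], n: int) -> Dict[Tuple[str, ...], Dict[str, List[int]]]:
    """Map each n-gram tuple to {source: [positions]} so later lookups are O(1)."""
    index: Dict[Tuple[str, ...], Dict[str, List[int]]] = {}
    for source, words in references.items():
        for i in range(len(words) - n + 1):
            index.setdefault(tuple(words[i:i + n]), {}).setdefault(source, []).append(i)
    return index

refs = {
    "genius": "you are my sunshine my only sunshine".split(),
    "musixmatch": "you are my sunshine".split(),
}
index = build_ngram_index(refs, n=3)
print(index[("you", "are", "my")])        # {'genius': [0], 'musixmatch': [0]}
print(index[("my", "only", "sunshine")])  # {'genius': [4]}
```
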
@@ -193,121 +254,59 @@ class AnchorSequenceFinder:
         ref_words: Dict[str, List[Word]],
         min_sources: int,
     ) -> List[AnchorSequence]:
-        """Process a single n-gram length to find matching sequences
+        """Process a single n-gram length to find matching sequences using hash-based index."""
         self.logger.debug(f"🔍 N-GRAM {n}: Starting processing with {len(trans_words)} transcription words")
-        self.logger.debug(f"🔍 N-GRAM {n}: Reference sources: {list(ref_texts_clean.keys())}")
-        self.logger.debug(f"🔍 N-GRAM {n}: Max iterations limit: {self.max_iterations_per_ngram}")
 
         candidate_anchors = []
-
-        used_trans_positions = set()
-
-        iteration_count = 0
-        last_progress_check = 0
-        last_anchor_count = 0
-        stagnation_count = 0
+        used_trans_positions: Set[int] = set()
 
-
+        # Build hash-based index for O(1) lookups
+        ngram_index = self._build_ngram_index(ref_texts_clean, n)
+        self.logger.debug(f"🔍 N-GRAM {n}: Built index with {len(ngram_index)} unique n-grams")
 
-        # Generate n-grams from transcribed text
+        # Generate n-grams from transcribed text
         trans_ngrams = self._find_ngrams(trans_words, n)
-        self.logger.debug(f"🔍 N-GRAM {n}:
-
-        # Process all n-grams efficiently in multiple passes
-        found_new_match = True
-        while found_new_match and iteration_count < self.max_iterations_per_ngram:
-            found_new_match = False
-            iteration_count += 1
-            anchors_found_this_iteration = 0
-
-            # Log every 10th iteration to track progress
-            if iteration_count % 10 == 0:
-                self.logger.debug(f"🔍 N-GRAM {n}: Iteration {iteration_count}, anchors found: {len(candidate_anchors)}")
-
-            # Check for progress stagnation every N iterations
-            if iteration_count - last_progress_check >= self.progress_check_interval:
-                current_anchor_count = len(candidate_anchors)
-                if current_anchor_count == last_anchor_count:
-                    stagnation_count += 1
-                    self.logger.debug(f"🔍 N-GRAM {n}: Stagnation check {stagnation_count}/3 at iteration {iteration_count}")
-                    if stagnation_count >= 3:  # No progress for 3 consecutive checks
-                        self.logger.debug(f"🔍 N-GRAM {n}: ⏹️ Early termination due to stagnation after {iteration_count} iterations")
-                        break
-                else:
-                    stagnation_count = 0  # Reset stagnation counter
-
-                last_anchor_count = current_anchor_count
-                last_progress_check = iteration_count
-
-                self.logger.debug(f"🔍 N-GRAM {n}: iteration {iteration_count}, anchors: {current_anchor_count}, stagnation: {stagnation_count}")
-
-            # Process all n-grams in this iteration
-            for ngram, trans_pos in trans_ngrams:
-                # Skip if we've already used this transcription position
-                if trans_pos in used_trans_positions:
-                    continue
+        self.logger.debug(f"🔍 N-GRAM {n}: Processing {len(trans_ngrams)} transcription n-grams")
 
-
-
-
-
-
-                    self.logger.error(f"🔍 N-GRAM {n}: ❌ Mismatch between ngram and actual words at position {trans_pos}:")
-                    self.logger.error(f"🔍 N-GRAM {n}: Ngram words: {ngram_words}")
-                    self.logger.error(f"🔍 N-GRAM {n}: Actual words: {actual_words}")
-                    self.logger.error(f"🔍 N-GRAM {n}: Full trans_words: {trans_words}")
-                    self.logger.error(f"🔍 N-GRAM {n}: Full all_words: {[w.text for w in all_words]}")
-                    raise AssertionError(
-                        f"Ngram words don't match actual words at position {trans_pos}. "
-                        f"This should never happen as trans_words should be derived from all_words."
-                    )
+        # Single pass through all transcription n-grams
+        for ngram, trans_pos in trans_ngrams:
+            # Skip if we've already used this transcription position
+            if trans_pos in used_trans_positions:
+                continue
 
-
-
-                # Log successful match
-                if len(candidate_anchors) < 5:  # Only log first few matches to avoid spam
-                    self.logger.debug(f"🔍 N-GRAM {n}: ✅ Found match: '{' '.join(ngram)}' at pos {trans_pos} with {len(matches)} sources")
-
-                # Get Word IDs for transcribed words
-                transcribed_word_ids = [w.id for w in all_words[trans_pos : trans_pos + n]]
-
-                # Get Word IDs for reference words
-                reference_word_ids = {source: [w.id for w in ref_words[source][pos : pos + n]] for source, pos in matches.items()}
-
-                # Mark positions as used
-                for source, pos in matches.items():
-                    used_positions[source].add(pos)
-                used_trans_positions.add(trans_pos)
-
-                anchor = AnchorSequence(
-                    id=WordUtils.generate_id(),
-                    transcribed_word_ids=transcribed_word_ids,
-                    transcription_position=trans_pos,
-                    reference_positions=matches,
-                    reference_word_ids=reference_word_ids,
-                    confidence=len(matches) / len(ref_texts_clean),
-                )
-                candidate_anchors.append(anchor)
-                anchors_found_this_iteration += 1
-                found_new_match = True
-
-                # For efficiency, if we have very low iteration limits, find one match per iteration
-                if self.max_iterations_per_ngram <= 10:
-                    break
+            # Use indexed lookup (O(1) instead of O(n))
+            matches = self._find_matching_sources_indexed(ngram, ngram_index)
 
-
-
-
-
-
-
-
-
+            if len(matches) >= min_sources:
+                # Get Word IDs for transcribed words
+                transcribed_word_ids = [w.id for w in all_words[trans_pos : trans_pos + n]]
+
+                # Get Word IDs for reference words
+                reference_word_ids = {
+                    source: [w.id for w in ref_words[source][pos : pos + n]]
+                    for source, pos in matches.items()
+                }
+
+                # Mark transcription position as used
+                used_trans_positions.add(trans_pos)
+
+                # Mark reference positions as used
+                for source, pos in matches.items():
+                    if source not in self.used_positions:
+                        self.used_positions[source] = set()
+                    self.used_positions[source].add(pos)
 
-
-
+                anchor = AnchorSequence(
+                    id=WordUtils.generate_id(),
+                    transcribed_word_ids=transcribed_word_ids,
+                    transcription_position=trans_pos,
+                    reference_positions=matches,
+                    reference_word_ids=reference_word_ids,
+                    confidence=len(matches) / len(ref_texts_clean),
+                )
+                candidate_anchors.append(anchor)
 
-        self.logger.debug(f"🔍 N-GRAM {n}:
+        self.logger.debug(f"🔍 N-GRAM {n}: Found {len(candidate_anchors)} anchors")
         return candidate_anchors
 
     def find_anchors(
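
The rewritten `_process_ngram_length` above is a single greedy pass: each transcription n-gram is looked up once in the index, and matched positions are consumed so later n-grams cannot reuse them. A compressed, standalone sketch of that control flow (plain tuples stand in for `AnchorSequence` objects; names are illustrative):

```python
def single_pass(trans_ngrams, ngram_index, used_positions, min_sources=1):
    """One index lookup per transcription n-gram, consuming positions as they match."""
    anchors = []
    used_trans = set()
    for ngram, trans_pos in trans_ngrams:
        if trans_pos in used_trans:
            continue
        matches = {}
        for source, positions in ngram_index.get(tuple(ngram), {}).items():
            used = used_positions.setdefault(source, set())
            for pos in positions:
                if pos not in used:
                    matches[source] = pos
                    break
        if len(matches) >= min_sources:
            used_trans.add(trans_pos)
            for source, pos in matches.items():
                used_positions[source].add(pos)
            anchors.append((trans_pos, matches))
    return anchors
```
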
@@ -320,17 +319,18 @@ class AnchorSequenceFinder:
         start_time = time.time()
 
         try:
-            self.logger.info(f"🔍 ANCHOR SEARCH: Starting
-            self.logger.
+            self.logger.info(f"🔍 ANCHOR SEARCH: Starting find_anchors with timeout {self.timeout_seconds}s")
+            self.logger.info(f"🔍 ANCHOR SEARCH: Transcribed text length: {len(transcribed)}")
+            self.logger.info(f"🔍 ANCHOR SEARCH: Reference sources: {list(references.keys())}")
 
             cache_key = self._get_cache_key(transcribed, references, transcription_result)
             cache_path = self.cache_dir / f"anchors_{cache_key}.json"
-            self.logger.
+            self.logger.info(f"🔍 ANCHOR SEARCH: Cache key: {cache_key}")
 
             # Try to load from cache
-            self.logger.
+            self.logger.info(f"🔍 ANCHOR SEARCH: Checking cache at {cache_path}")
             if cached_data := self._load_from_cache(cache_path):
-                self.logger.info("🔍 ANCHOR SEARCH: ✅ Cache hit
+                self.logger.info("🔍 ANCHOR SEARCH: ✅ Cache hit! Loading anchors from cache")
                 try:
                     # Convert cached_data to dictionary before logging
                     if cached_data:
@@ -347,25 +347,27 @@ class AnchorSequenceFinder:
 
             # If not in cache or cache format invalid, perform the computation
             self.logger.info(f"🔍 ANCHOR SEARCH: Cache miss - computing anchors")
-
+
+            # Reset used positions for fresh computation
+            self.used_positions = {}
 
             # Check timeout before starting computation
             self._check_timeout(start_time, "anchor computation initialization")
-            self.logger.
+            self.logger.info(f"🔍 ANCHOR SEARCH: ✅ Timeout check passed - initialization")
 
             # Get all words from transcription
-            self.logger.
+            self.logger.info(f"🔍 ANCHOR SEARCH: Extracting words from transcription result...")
             all_words = []
             for segment in transcription_result.result.segments:
                 all_words.extend(segment.words)
-            self.logger.
+            self.logger.info(f"🔍 ANCHOR SEARCH: ✅ Extracted {len(all_words)} words from transcription")
 
             # Clean and split texts
-            self.logger.
+            self.logger.info(f"🔍 ANCHOR SEARCH: Cleaning transcription words...")
             trans_words = [w.text.lower().strip('.,?!"\n') for w in all_words]
-            self.logger.
+            self.logger.info(f"🔍 ANCHOR SEARCH: ✅ Cleaned {len(trans_words)} transcription words")
 
-            self.logger.
+            self.logger.info(f"🔍 ANCHOR SEARCH: Processing reference sources...")
             ref_texts_clean = {
                 source: self._clean_text(" ".join(w.text for s in lyrics.segments for w in s.words)).split()
                 for source, lyrics in references.items()
@@ -373,14 +375,14 @@ class AnchorSequenceFinder:
             ref_words = {source: [w for s in lyrics.segments for w in s.words] for source, lyrics in references.items()}
 
             for source, words in ref_texts_clean.items():
-                self.logger.
+                self.logger.info(f"🔍 ANCHOR SEARCH: Reference '{source}': {len(words)} words")
 
             # Check timeout after preprocessing
             self._check_timeout(start_time, "anchor computation preprocessing")
-            self.logger.
+            self.logger.info(f"🔍 ANCHOR SEARCH: ✅ Timeout check passed - preprocessing")
 
             # Filter out very short reference sources for n-gram length calculation
-            self.logger.
+            self.logger.info(f"🔍 ANCHOR SEARCH: Calculating n-gram lengths...")
             valid_ref_lengths = [
                 len(words) for words in ref_texts_clean.values()
                 if len(words) >= self.min_sequence_length
@@ -393,10 +395,10 @@ class AnchorSequenceFinder:
             # Calculate max length using only valid reference sources
             max_length = min(len(trans_words), min(valid_ref_lengths))
             n_gram_lengths = range(max_length, self.min_sequence_length - 1, -1)
-            self.logger.
+            self.logger.info(f"🔍 ANCHOR SEARCH: N-gram lengths to process: {list(n_gram_lengths)} (max_length: {max_length})")
 
             # Process n-gram lengths in parallel with timeout
-            self.logger.
+            self.logger.info(f"🔍 ANCHOR SEARCH: Setting up parallel processing...")
             process_length_partial = partial(
                 self._process_ngram_length,
                 trans_words=trans_words,
@@ -406,139 +408,61 @@ class AnchorSequenceFinder:
                 min_sources=self.min_sources,
             )
 
-            # Process n-gram lengths
+            # Process n-gram lengths sequentially (single-threaded for cloud compatibility)
             candidate_anchors = []
-            pool_timeout = max(60, self.timeout_seconds // 2) if self.timeout_seconds > 0 else 300  # Use half the total timeout for pool operations
 
-            # Check timeout before
-            self._check_timeout(start_time, "
-            self.logger.
+            # Check timeout before processing
+            self._check_timeout(start_time, "n-gram processing start")
+            self.logger.info(f"🔍 ANCHOR SEARCH: Starting sequential n-gram processing ({len(n_gram_lengths)} lengths)")
 
-
-
-
-
-
-
-
-
-                # Submit all jobs first
-                self.logger.debug(f"🔍 ANCHOR SEARCH: Submitting {len(n_gram_lengths)} n-gram processing jobs...")
-                async_results = []
-                for i, n in enumerate(n_gram_lengths):
-                    self.logger.debug(f"🔍 ANCHOR SEARCH: Submitting job {i+1}/{len(n_gram_lengths)} for n-gram length {n}")
-                    async_result = pool.apply_async(process_length_partial, (n,))
-                    async_results.append(async_result)
-
-                self.logger.debug(f"🔍 ANCHOR SEARCH: ✅ All {len(async_results)} jobs submitted")
-
-                # Collect results with individual timeouts
-                batch_results = []
-                batch_size = 10
-
-                for i, async_result in enumerate(async_results):
-                    n_gram_length = n_gram_lengths[i]
-                    try:
-                        # Check remaining time for pool timeout (more lenient than overall timeout)
+            batch_size = 10
+            batch_results = []
+
+            for i, n in enumerate(n_gram_lengths):
+                try:
+                    # Check timeout periodically
+                    if self.timeout_seconds > 0:
                         elapsed_time = time.time() - start_time
-
-
-
-
-
-
-
-                        result = async_result.get(timeout=individual_timeout)
-                        results.append(result)
-
-                        # Batch logging - collect info for batched logging
-                        batch_results.append((n_gram_length, len(result)))
-
-                        # Log progress every batch_size results or on the last result (at DEBUG level)
-                        if (i + 1) % batch_size == 0 or (i + 1) == len(async_results):
-                            total_anchors_in_batch = sum(anchor_count for _, anchor_count in batch_results)
-                            n_gram_ranges = [str(ng) for ng, _ in batch_results]
-                            range_str = f"{n_gram_ranges[0]}-{n_gram_ranges[-1]}" if len(n_gram_ranges) > 1 else n_gram_ranges[0]
-                            self.logger.debug(f"🔍 ANCHOR SEARCH: Completed n-gram lengths {range_str} ({i+1-len(batch_results)+1}-{i+1}/{len(async_results)}) - found {total_anchors_in_batch} anchors")
-                            batch_results = []  # Reset batch
-
-                    except Exception as e:
-                        self.logger.warning(f"🔍 ANCHOR SEARCH: ⚠️ n-gram length {n_gram_length} failed or timed out: {str(e)}")
-                        results.append([])  # Add empty result to maintain order
-
-                        # Add failed result to batch for logging
-                        batch_results.append((n_gram_length, 0))
-
-                        # If we're running short on time, trigger fallback early
-                        if self.timeout_seconds > 0 and (time.time() - start_time) > (self.timeout_seconds * 0.8):
-                            self.logger.warning(f"🔍 ANCHOR SEARCH: ⚠️ Approaching timeout limit, triggering early fallback")
-                            # Raise exception to trigger fallback to sequential processing
-                            raise Exception("Parallel processing timeout, triggering fallback")
-
-                self.logger.debug(f"🔍 ANCHOR SEARCH: Parallel processing completed, combining results...")
-                for anchors in results:
+                        if elapsed_time > self.timeout_seconds:
+                            self.logger.warning(f"🔍 ANCHOR SEARCH: ⏰ Timeout reached at n-gram {n}, stopping")
+                            break
+
+                    anchors = self._process_ngram_length(
+                        n, trans_words, all_words, ref_texts_clean, ref_words, self.min_sources
+                    )
                     candidate_anchors.extend(anchors)
-
-
-
-
-
-
-
-
-
-
-
-            except Exception as e:
-                self.logger.error(f"🔍 ANCHOR SEARCH: ❌ Parallel processing failed: {str(e)}")
-                # Fall back to sequential processing with timeout checks
-                self.logger.info("🔍 ANCHOR SEARCH: Falling back to sequential processing")
-                for n in n_gram_lengths:
-                    try:
-                        # Check timeout more leniently during sequential processing
-                        if self.timeout_seconds > 0:
-                            elapsed_time = time.time() - start_time
-                            # Allow more time for sequential processing (up to 2x the original timeout)
-                            if elapsed_time > (self.timeout_seconds * 2.0):
-                                self.logger.warning(f"🔍 ANCHOR SEARCH: ⏰ Sequential processing timeout for n-gram {n}")
-                                break
-
-                        self.logger.debug(f"🔍 ANCHOR SEARCH: Sequential processing n-gram length {n}")
+
+                    # Batch logging
+                    batch_results.append((n, len(anchors)))
+
+                    # Log progress every batch_size results or on the last result
+                    if (i + 1) % batch_size == 0 or (i + 1) == len(n_gram_lengths):
+                        total_anchors_in_batch = sum(anchor_count for _, anchor_count in batch_results)
+                        n_gram_ranges = [str(ng) for ng, _ in batch_results]
+                        range_str = f"{n_gram_ranges[0]}-{n_gram_ranges[-1]}" if len(n_gram_ranges) > 1 else n_gram_ranges[0]
+                        self.logger.debug(f"🔍 ANCHOR SEARCH: Completed n-gram lengths {range_str} - found {total_anchors_in_batch} anchors")
+                        batch_results = []
 
-
-
-
-
-
-
-                        self.logger.warning(f"🔍 ANCHOR SEARCH: ⚠️ Sequential processing failed for n-gram length {n}: {str(e)}")
-                        continue
-            finally:
-                # Always ensure pool is cleaned up to avoid hangs in containerized environments
-                if pool is not None:
-                    try:
-                        self.logger.debug(f"🔍 ANCHOR SEARCH: 🧹 Final pool cleanup...")
-                        pool.terminate()
-                        pool.join(timeout=5)  # Wait max 5 seconds for workers to terminate
-                        self.logger.debug(f"🔍 ANCHOR SEARCH: ✅ Final pool cleanup completed")
-                    except Exception as cleanup_error:
-                        self.logger.debug(f"🔍 ANCHOR SEARCH: ⚠️ Pool cleanup error (ignored): {cleanup_error}")
-
-            self.logger.debug(f"🔍 ANCHOR SEARCH: Found {len(candidate_anchors)} candidate anchors in {time.time() - start_time:.1f}s")
+                except Exception as e:
+                    self.logger.warning(f"🔍 ANCHOR SEARCH: ⚠️ n-gram length {n} failed: {str(e)}")
+                    batch_results.append((n, 0))
+                    continue
+
+            self.logger.info(f"🔍 ANCHOR SEARCH: ✅ Found {len(candidate_anchors)} candidate anchors in {time.time() - start_time:.1f}s")
 
             # Check timeout before expensive filtering operation
             self._check_timeout(start_time, "overlap filtering start")
-            self.logger.
+            self.logger.info(f"🔍 ANCHOR SEARCH: 🔄 Starting overlap filtering...")
 
             filtered_anchors = self._remove_overlapping_sequences(candidate_anchors, transcribed, transcription_result)
-            self.logger.
+            self.logger.info(f"🔍 ANCHOR SEARCH: ✅ Filtering completed - {len(filtered_anchors)} final anchors")
 
             # Save to cache
-            self.logger.
+            self.logger.info(f"🔍 ANCHOR SEARCH: 💾 Saving results to cache...")
             self._save_to_cache(cache_path, filtered_anchors)
 
             total_time = time.time() - start_time
-            self.logger.info(f"🔍 ANCHOR SEARCH:
+            self.logger.info(f"🔍 ANCHOR SEARCH: 🎉 Anchor sequence computation completed successfully in {total_time:.1f}s")
 
             return filtered_anchors
 
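
With the multiprocessing pool and its sequential fallback removed, n-gram lengths are now processed in one plain loop guarded by a wall-clock check. A minimal sketch of that deadline pattern, where `process` stands in for `_process_ngram_length` (illustrative only):

```python
import time

def run_with_deadline(lengths, process, timeout_seconds=300):
    """Process items sequentially, stopping cleanly once the deadline passes."""
    start = time.time()
    results = []
    for n in lengths:
        if timeout_seconds > 0 and time.time() - start > timeout_seconds:
            break  # stop, keeping whatever was already collected
        try:
            results.extend(process(n))
        except Exception:
            continue  # a failed length is skipped, not fatal
    return results

# Toy usage: a stand-in "process" returning one fake anchor per length.
print(run_with_deadline(range(5, 2, -1), lambda n: [f"anchor-{n}"]))
# ['anchor-5', 'anchor-4', 'anchor-3']
```
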
@@ -618,13 +542,13 @@ class AnchorSequenceFinder:
         transcription_result: TranscriptionResult,
     ) -> List[ScoredAnchor]:
         """Remove overlapping sequences using phrase analysis with timeout protection."""
-        self.logger.
+        self.logger.info(f"🔍 FILTERING: Starting overlap removal for {len(anchors)} anchors")
 
         if not anchors:
-            self.logger.
+            self.logger.info(f"🔍 FILTERING: No anchors to process")
             return []
 
-        self.logger.
+        self.logger.info(f"🔍 FILTERING: Scoring {len(anchors)} anchors")
 
         # Create word map for scoring
         word_map = {w.id: w for s in transcription_result.result.segments for w in s.words}
@@ -669,123 +593,75 @@ class AnchorSequenceFinder:
 
         start_time = time.time()
 
-        #
-
-        self.logger.info(f"🔍 FILTERING: Using {num_processes} processes for scoring")
-
-        # Create a partial function with the context parameter fixed
-        score_anchor_partial = partial(self._score_anchor_static, context=context)
-
-        # Use multiprocessing to score anchors in parallel with timeout
+        # Score anchors sequentially using simple rule-based scoring
+        # (Avoids expensive spaCy NLP and works in cloud environments)
         scored_anchors = []
-
-
-        scoring_pool = None
-        try:
-            self.logger.debug(f"🔍 FILTERING: Starting parallel scoring with timeout {pool_timeout}s")
-            scoring_pool = Pool(processes=num_processes)
-            # Submit scoring jobs with timeout
-            async_results = []
-            batch_size = 50
-
-            self.logger.debug(f"🔍 FILTERING: Splitting {len(anchors)} anchors into batches of {batch_size}")
-            for i in range(0, len(anchors), batch_size):
-                batch = anchors[i:i + batch_size]
-                async_result = scoring_pool.apply_async(self._score_batch_static, (batch, context))
-                async_results.append(async_result)
-
-            self.logger.debug(f"🔍 FILTERING: Submitted {len(async_results)} scoring batches")
-
-            # Collect results with timeout
-            for i, async_result in enumerate(async_results):
-                try:
-                    self.logger.debug(f"🔍 FILTERING: Collecting batch {i+1}/{len(async_results)}")
-                    batch_results = async_result.get(timeout=pool_timeout)
-                    scored_anchors.extend(batch_results)
-                    self.logger.debug(f"🔍 FILTERING: Completed scoring batch {i+1}/{len(async_results)}")
-                except Exception as e:
-                    self.logger.warning(f"🔍 FILTERING: ⚠️ Scoring batch {i+1} failed or timed out: {str(e)}")
-                    # Add basic scores for failed batch
-                    start_idx = i * batch_size
-                    end_idx = min((i + 1) * batch_size, len(anchors))
-                    for j in range(start_idx, end_idx):
-                        if j < len(anchors):
-                            try:
-                                phrase_score = PhraseScore(
-                                    total_score=1.0,
-                                    natural_break_score=1.0,
-                                    phrase_type=PhraseType.COMPLETE
-                                )
-                                scored_anchors.append(ScoredAnchor(anchor=anchors[j], phrase_score=phrase_score))
-                            except:
-                                continue
-
-            # Explicitly cleanup pool to avoid hangs in containerized environments
-            self.logger.debug(f"🔍 FILTERING: Cleaning up scoring pool...")
-            scoring_pool.close()
-            scoring_pool.terminate()
-            self.logger.debug(f"🔍 FILTERING: Scoring pool cleanup completed")
-
-        except Exception as e:
-            self.logger.warning(f"🔍 FILTERING: ❌ Parallel scoring failed: {str(e)}, falling back to basic scoring")
-            # Fall back to basic scoring
-            for anchor in anchors:
-                try:
-                    phrase_score = PhraseScore(
-                        total_score=1.0,
-                        natural_break_score=1.0,
-                        phrase_type=PhraseType.COMPLETE
-                    )
-                    scored_anchors.append(ScoredAnchor(anchor=anchor, phrase_score=phrase_score))
-                except:
-                    continue
-        finally:
-            # Always ensure scoring pool is cleaned up to avoid hangs
-            if scoring_pool is not None:
-                try:
-                    self.logger.debug(f"🔍 FILTERING: Final scoring pool cleanup...")
-                    scoring_pool.terminate()
-                    scoring_pool.join(timeout=5)  # Wait max 5 seconds for workers to terminate
-                    self.logger.debug(f"🔍 FILTERING: Final scoring pool cleanup completed")
-                except Exception as cleanup_error:
-                    self.logger.debug(f"🔍 FILTERING: Scoring pool cleanup error (ignored): {cleanup_error}")
+        self.logger.debug(f"🔍 FILTERING: Scoring {len(anchors)} anchors sequentially")
 
-
-
+        for i, anchor in enumerate(anchors):
+            try:
+                # Simple rule-based scoring based on anchor properties
+                phrase_score = self._simple_score_anchor(anchor)
+                scored_anchors.append(ScoredAnchor(anchor=anchor, phrase_score=phrase_score))
+            except Exception as e:
+                # Fallback to default score on error
+                self.logger.debug(f"🔍 FILTERING: Scoring failed for anchor {i}: {e}")
+                phrase_score = PhraseScore(
+                    phrase_type=PhraseType.COMPLETE,
+                    natural_break_score=1.0,
+                    length_score=1.0
+                )
+                scored_anchors.append(ScoredAnchor(anchor=anchor, phrase_score=phrase_score))
 
-
+        scoring_time = time.time() - start_time
+        self.logger.debug(f"🔍 FILTERING: Scoring completed in {scoring_time:.2f}s, scored {len(scored_anchors)} anchors")
+
+        # Sort anchors by priority (highest first)
         self.logger.debug(f"🔍 FILTERING: Sorting anchors by priority...")
         scored_anchors.sort(key=self._get_sequence_priority, reverse=True)
-        self.logger.debug(f"🔍 FILTERING: Sorting completed")
 
+        # O(N) overlap filtering using covered positions set
         self.logger.debug(f"🔍 FILTERING: Filtering {len(scored_anchors)} overlapping sequences")
         filtered_scored = []
-
-        for i, scored_anchor in enumerate(scored_anchors):
-            # Check timeout every 100 anchors using our timeout mechanism (more lenient)
-            if i % 100 == 0 and i > 0:
-                # Only check timeout if we're significantly over the limit
-                if self.timeout_seconds > 0:
-                    elapsed_time = time.time() - start_time
-                    # Use a more lenient timeout for filtering (allow 50% more time)
-                    if elapsed_time > (self.timeout_seconds * 1.5):
-                        self.logger.warning(f"🔍 FILTERING: ⏰ Filtering timed out, returning {len(filtered_scored)} anchors out of {len(scored_anchors)}")
-                        break
-
-                self.logger.debug(f"🔍 FILTERING: Progress: {i}/{len(scored_anchors)} processed, {len(filtered_scored)} kept")
-
-            overlaps = False
-            for existing in filtered_scored:
-                if self._sequences_overlap(scored_anchor.anchor, existing.anchor):
-                    overlaps = True
-                    break
+        covered_positions: Set[int] = set()
 
-
+        for scored_anchor in scored_anchors:
+            anchor = scored_anchor.anchor
+            start_pos = anchor.transcription_position
+            end_pos = start_pos + anchor.length
+
+            # Check if any position in this anchor's range is already covered
+            anchor_positions = set(range(start_pos, end_pos))
+            if not anchor_positions & covered_positions:  # No overlap with covered
                 filtered_scored.append(scored_anchor)
+                covered_positions.update(anchor_positions)
 
-        self.logger.debug(f"🔍 FILTERING:
+        self.logger.debug(f"🔍 FILTERING: Kept {len(filtered_scored)} non-overlapping anchors out of {len(scored_anchors)}")
         return filtered_scored
 
+    def _simple_score_anchor(self, anchor: AnchorSequence) -> PhraseScore:
+        """
+        Simple rule-based scoring for anchors without expensive NLP.
+
+        Scoring criteria:
+        - Longer sequences are preferred (length_score)
+        - Sequences matching more reference sources are preferred (natural_break_score)
+        - All sequences treated as COMPLETE type for simplicity
+        """
+        # Length score: normalize to 0-1 range (3-15 words typical)
+        length = anchor.length
+        length_score = min(1.0, (length - 2) / 10.0)  # 3 words = 0.1, 12 words = 1.0
+
+        # Source match score: more sources = higher score
+        num_sources = len(anchor.reference_positions)
+        natural_break_score = min(1.0, num_sources / 3.0)  # 1 source = 0.33, 3+ sources = 1.0
+
+        return PhraseScore(
+            phrase_type=PhraseType.COMPLETE,
+            natural_break_score=natural_break_score,
+            length_score=length_score
+        )
+
     @staticmethod
     def _score_anchor_static(anchor: AnchorSequence, context: str) -> ScoredAnchor:
         """Static version of _score_anchor for multiprocessing compatibility."""
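
Scoring is now two arithmetic heuristics (for a 7-word anchor matched by 2 of 3 sources: length_score = min(1.0, (7 - 2) / 10) = 0.5 and natural_break_score = min(1.0, 2 / 3) ≈ 0.67), and overlap filtering drops from pairwise comparisons to a single pass over a set of covered word positions. A toy illustration of that covered-set filter, with (start, length) tuples standing in for scored anchors already sorted by priority:

```python
def filter_non_overlapping(spans):
    """Keep a span only if none of its positions were claimed by an earlier (higher-priority) span."""
    covered = set()
    kept = []
    for start, length in spans:
        positions = set(range(start, start + length))
        if not positions & covered:
            kept.append((start, length))
            covered.update(positions)
    return kept

# (0, 5) claims positions 0-4, so (3, 4) is dropped; (8, 2) does not overlap and is kept.
print(filter_non_overlapping([(0, 5), (3, 4), (8, 2)]))  # [(0, 5), (8, 2)]
```
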
@@ -816,9 +692,9 @@ class AnchorSequenceFinder:
         except Exception:
             # Add basic score for failed anchor
             phrase_score = PhraseScore(
-
+                phrase_type=PhraseType.COMPLETE,
                 natural_break_score=1.0,
-
+                length_score=1.0
             )
             scored_anchors.append(ScoredAnchor(anchor=anchor, phrase_score=phrase_score))
 