karaoke-gen 0.71.27__py3-none-any.whl → 0.75.16__py3-none-any.whl

This diff shows the content of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (39)
  1. karaoke_gen/__init__.py +32 -1
  2. karaoke_gen/audio_fetcher.py +476 -56
  3. karaoke_gen/audio_processor.py +11 -3
  4. karaoke_gen/file_handler.py +192 -0
  5. karaoke_gen/instrumental_review/__init__.py +45 -0
  6. karaoke_gen/instrumental_review/analyzer.py +408 -0
  7. karaoke_gen/instrumental_review/editor.py +322 -0
  8. karaoke_gen/instrumental_review/models.py +171 -0
  9. karaoke_gen/instrumental_review/server.py +475 -0
  10. karaoke_gen/instrumental_review/static/index.html +1506 -0
  11. karaoke_gen/instrumental_review/waveform.py +409 -0
  12. karaoke_gen/karaoke_finalise/karaoke_finalise.py +62 -1
  13. karaoke_gen/karaoke_gen.py +114 -1
  14. karaoke_gen/lyrics_processor.py +81 -4
  15. karaoke_gen/utils/bulk_cli.py +3 -0
  16. karaoke_gen/utils/cli_args.py +9 -2
  17. karaoke_gen/utils/gen_cli.py +379 -2
  18. karaoke_gen/utils/remote_cli.py +1126 -77
  19. {karaoke_gen-0.71.27.dist-info → karaoke_gen-0.75.16.dist-info}/METADATA +7 -1
  20. {karaoke_gen-0.71.27.dist-info → karaoke_gen-0.75.16.dist-info}/RECORD +38 -26
  21. lyrics_transcriber/correction/anchor_sequence.py +226 -350
  22. lyrics_transcriber/frontend/package.json +1 -1
  23. lyrics_transcriber/frontend/src/components/Header.tsx +38 -12
  24. lyrics_transcriber/frontend/src/components/LyricsAnalyzer.tsx +17 -3
  25. lyrics_transcriber/frontend/src/components/LyricsSynchronizer/SyncControls.tsx +185 -0
  26. lyrics_transcriber/frontend/src/components/LyricsSynchronizer/TimelineCanvas.tsx +704 -0
  27. lyrics_transcriber/frontend/src/components/LyricsSynchronizer/UpcomingWordsBar.tsx +80 -0
  28. lyrics_transcriber/frontend/src/components/LyricsSynchronizer/index.tsx +905 -0
  29. lyrics_transcriber/frontend/src/components/ModeSelectionModal.tsx +127 -0
  30. lyrics_transcriber/frontend/src/components/ReplaceAllLyricsModal.tsx +190 -542
  31. lyrics_transcriber/frontend/tsconfig.tsbuildinfo +1 -1
  32. lyrics_transcriber/frontend/web_assets/assets/{index-DdJTDWH3.js → index-COYImAcx.js} +1722 -489
  33. lyrics_transcriber/frontend/web_assets/assets/index-COYImAcx.js.map +1 -0
  34. lyrics_transcriber/frontend/web_assets/index.html +1 -1
  35. lyrics_transcriber/review/server.py +5 -5
  36. lyrics_transcriber/frontend/web_assets/assets/index-DdJTDWH3.js.map +0 -1
  37. {karaoke_gen-0.71.27.dist-info → karaoke_gen-0.75.16.dist-info}/WHEEL +0 -0
  38. {karaoke_gen-0.71.27.dist-info → karaoke_gen-0.75.16.dist-info}/entry_points.txt +0 -0
  39. {karaoke_gen-0.71.27.dist-info → karaoke_gen-0.75.16.dist-info}/licenses/LICENSE +0 -0
@@ -1,9 +1,8 @@
  import threading
  import time
- from typing import Any, Dict, List, Optional, Tuple, Union
+ from typing import Any, Dict, List, Optional, Set, Tuple, Union
  import logging
  from tqdm import tqdm
- from multiprocessing import Pool, cpu_count
  from functools import partial
  from pathlib import Path
  import json
@@ -64,8 +63,70 @@ class AnchorSequenceFinder:
  # self.logger.debug(f"_find_ngrams called with {len(words)} words, n={n}")
  return [(words[i : i + n], i) for i in range(len(words) - n + 1)]

+ def _build_ngram_index(
+ self,
+ references: Dict[str, List[str]],
+ n: int
+ ) -> Dict[Tuple[str, ...], Dict[str, List[int]]]:
+ """
+ Build a hash-based index mapping n-grams to their positions in each reference.
+
+ Args:
+ references: Dict mapping source names to lists of cleaned words
+ n: The n-gram length to index
+
+ Returns:
+ Dict mapping n-gram tuples to {source: [positions]} dicts
+ """
+ index: Dict[Tuple[str, ...], Dict[str, List[int]]] = {}
+
+ for source, words in references.items():
+ for i in range(len(words) - n + 1):
+ ngram_tuple = tuple(words[i:i + n])
+ if ngram_tuple not in index:
+ index[ngram_tuple] = {}
+ if source not in index[ngram_tuple]:
+ index[ngram_tuple][source] = []
+ index[ngram_tuple][source].append(i)
+
+ return index
+
+ def _find_matching_sources_indexed(
+ self,
+ ngram: List[str],
+ ngram_index: Dict[Tuple[str, ...], Dict[str, List[int]]]
+ ) -> Dict[str, int]:
+ """
+ Find which sources contain the given n-gram using pre-built index (O(1) lookup).
+
+ Args:
+ ngram: List of words to find
+ ngram_index: Pre-built index from _build_ngram_index()
+
+ Returns:
+ Dict mapping source names to first unused position
+ """
+ matches = {}
+ ngram_tuple = tuple(ngram)
+
+ # O(1) lookup in the index
+ if ngram_tuple not in ngram_index:
+ return matches
+
+ source_positions = ngram_index[ngram_tuple]
+
+ # For each source that contains this n-gram, find first unused position
+ for source, positions in source_positions.items():
+ used = self.used_positions.get(source, set())
+ for pos in positions:
+ if pos not in used:
+ matches[source] = pos
+ break
+
+ return matches
+
  def _find_matching_sources(self, ngram: List[str], references: Dict[str, List[str]], n: int) -> Dict[str, int]:
- """Find which sources contain the given n-gram and at what positions."""
+ """Find which sources contain the given n-gram and at what positions (legacy O(n) method)."""
  # self.logger.debug(f"_find_matching_sources called for ngram: '{' '.join(ngram)}'")
  matches = {}
  all_positions = {source: [] for source in references}
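
Note: the two helpers added in the hunk above trade a per-n-gram scan of every reference for a single pre-built lookup table. A minimal standalone sketch of that index shape, using hypothetical lyric fragments and a module-level function rather than the package's class method:

    from typing import Dict, List, Tuple

    def build_ngram_index(references: Dict[str, List[str]], n: int) -> Dict[Tuple[str, ...], Dict[str, List[int]]]:
        """Map each n-gram tuple to {source: [positions]}, mirroring the shape described above."""
        index: Dict[Tuple[str, ...], Dict[str, List[int]]] = {}
        for source, words in references.items():
            for i in range(len(words) - n + 1):
                key = tuple(words[i:i + n])
                index.setdefault(key, {}).setdefault(source, []).append(i)
        return index

    refs = {
        "source_a": "hello darkness my old friend".split(),
        "source_b": "hello darkness my old friend hello".split(),
    }
    index = build_ngram_index(refs, n=2)
    print(index[("hello", "darkness")])  # {'source_a': [0], 'source_b': [0]} - one dict lookup per n-gram
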
@@ -193,121 +254,59 @@ class AnchorSequenceFinder:
  ref_words: Dict[str, List[Word]],
  min_sources: int,
  ) -> List[AnchorSequence]:
- """Process a single n-gram length to find matching sequences with timeout and early termination."""
+ """Process a single n-gram length to find matching sequences using hash-based index."""
  self.logger.debug(f"🔍 N-GRAM {n}: Starting processing with {len(trans_words)} transcription words")
- self.logger.debug(f"🔍 N-GRAM {n}: Reference sources: {list(ref_texts_clean.keys())}")
- self.logger.debug(f"🔍 N-GRAM {n}: Max iterations limit: {self.max_iterations_per_ngram}")

  candidate_anchors = []
- used_positions = {source: set() for source in ref_texts_clean.keys()}
- used_trans_positions = set()
-
- iteration_count = 0
- last_progress_check = 0
- last_anchor_count = 0
- stagnation_count = 0
+ used_trans_positions: Set[int] = set()

- self.logger.debug(f"🔍 N-GRAM {n}: Processing n-gram length {n} with max {self.max_iterations_per_ngram} iterations")
+ # Build hash-based index for O(1) lookups
+ ngram_index = self._build_ngram_index(ref_texts_clean, n)
+ self.logger.debug(f"🔍 N-GRAM {n}: Built index with {len(ngram_index)} unique n-grams")

- # Generate n-grams from transcribed text once
+ # Generate n-grams from transcribed text
  trans_ngrams = self._find_ngrams(trans_words, n)
- self.logger.debug(f"🔍 N-GRAM {n}: Generated {len(trans_ngrams)} n-grams for processing")
-
- # Process all n-grams efficiently in multiple passes
- found_new_match = True
- while found_new_match and iteration_count < self.max_iterations_per_ngram:
- found_new_match = False
- iteration_count += 1
- anchors_found_this_iteration = 0
-
- # Log every 10th iteration to track progress
- if iteration_count % 10 == 0:
- self.logger.debug(f"🔍 N-GRAM {n}: Iteration {iteration_count}, anchors found: {len(candidate_anchors)}")
-
- # Check for progress stagnation every N iterations
- if iteration_count - last_progress_check >= self.progress_check_interval:
- current_anchor_count = len(candidate_anchors)
- if current_anchor_count == last_anchor_count:
- stagnation_count += 1
- self.logger.debug(f"🔍 N-GRAM {n}: Stagnation check {stagnation_count}/3 at iteration {iteration_count}")
- if stagnation_count >= 3: # No progress for 3 consecutive checks
- self.logger.debug(f"🔍 N-GRAM {n}: ⏹️ Early termination due to stagnation after {iteration_count} iterations")
- break
- else:
- stagnation_count = 0 # Reset stagnation counter
-
- last_anchor_count = current_anchor_count
- last_progress_check = iteration_count
-
- self.logger.debug(f"🔍 N-GRAM {n}: iteration {iteration_count}, anchors: {current_anchor_count}, stagnation: {stagnation_count}")
-
- # Process all n-grams in this iteration
- for ngram, trans_pos in trans_ngrams:
- # Skip if we've already used this transcription position
- if trans_pos in used_trans_positions:
- continue
+ self.logger.debug(f"🔍 N-GRAM {n}: Processing {len(trans_ngrams)} transcription n-grams")

- # Get the actual words from the transcription at this position
- actual_words = [w.text.lower().strip('.,?!"\n') for w in all_words[trans_pos : trans_pos + n]]
- ngram_words = [w.lower() for w in ngram]
-
- if actual_words != ngram_words:
- self.logger.error(f"🔍 N-GRAM {n}: ❌ Mismatch between ngram and actual words at position {trans_pos}:")
- self.logger.error(f"🔍 N-GRAM {n}: Ngram words: {ngram_words}")
- self.logger.error(f"🔍 N-GRAM {n}: Actual words: {actual_words}")
- self.logger.error(f"🔍 N-GRAM {n}: Full trans_words: {trans_words}")
- self.logger.error(f"🔍 N-GRAM {n}: Full all_words: {[w.text for w in all_words]}")
- raise AssertionError(
- f"Ngram words don't match actual words at position {trans_pos}. "
- f"This should never happen as trans_words should be derived from all_words."
- )
+ # Single pass through all transcription n-grams
+ for ngram, trans_pos in trans_ngrams:
+ # Skip if we've already used this transcription position
+ if trans_pos in used_trans_positions:
+ continue

- matches = self._find_matching_sources(ngram, ref_texts_clean, n)
- if len(matches) >= min_sources:
- # Log successful match
- if len(candidate_anchors) < 5: # Only log first few matches to avoid spam
- self.logger.debug(f"🔍 N-GRAM {n}: ✅ Found match: '{' '.join(ngram)}' at pos {trans_pos} with {len(matches)} sources")
-
- # Get Word IDs for transcribed words
- transcribed_word_ids = [w.id for w in all_words[trans_pos : trans_pos + n]]
-
- # Get Word IDs for reference words
- reference_word_ids = {source: [w.id for w in ref_words[source][pos : pos + n]] for source, pos in matches.items()}
-
- # Mark positions as used
- for source, pos in matches.items():
- used_positions[source].add(pos)
- used_trans_positions.add(trans_pos)
-
- anchor = AnchorSequence(
- id=WordUtils.generate_id(),
- transcribed_word_ids=transcribed_word_ids,
- transcription_position=trans_pos,
- reference_positions=matches,
- reference_word_ids=reference_word_ids,
- confidence=len(matches) / len(ref_texts_clean),
- )
- candidate_anchors.append(anchor)
- anchors_found_this_iteration += 1
- found_new_match = True
-
- # For efficiency, if we have very low iteration limits, find one match per iteration
- if self.max_iterations_per_ngram <= 10:
- break
+ # Use indexed lookup (O(1) instead of O(n))
+ matches = self._find_matching_sources_indexed(ngram, ngram_index)

- # Log progress for this iteration
- if anchors_found_this_iteration > 0:
- self.logger.debug(f"🔍 N-GRAM {n}: Found {anchors_found_this_iteration} anchors in iteration {iteration_count}")
-
- # Early termination if we've found enough anchors or processed all positions
- if len(used_trans_positions) >= len(trans_ngrams) or len(candidate_anchors) >= len(trans_ngrams):
- self.logger.debug(f"🔍 N-GRAM {n}: ⏹️ Early termination - processed all positions after {iteration_count} iterations")
- break
+ if len(matches) >= min_sources:
+ # Get Word IDs for transcribed words
+ transcribed_word_ids = [w.id for w in all_words[trans_pos : trans_pos + n]]
+
+ # Get Word IDs for reference words
+ reference_word_ids = {
+ source: [w.id for w in ref_words[source][pos : pos + n]]
+ for source, pos in matches.items()
+ }
+
+ # Mark transcription position as used
+ used_trans_positions.add(trans_pos)
+
+ # Mark reference positions as used
+ for source, pos in matches.items():
+ if source not in self.used_positions:
+ self.used_positions[source] = set()
+ self.used_positions[source].add(pos)

- if iteration_count >= self.max_iterations_per_ngram:
- self.logger.debug(f"🔍 N-GRAM {n}: ⏰ Processing terminated after reaching max iterations ({self.max_iterations_per_ngram})")
+ anchor = AnchorSequence(
+ id=WordUtils.generate_id(),
+ transcribed_word_ids=transcribed_word_ids,
+ transcription_position=trans_pos,
+ reference_positions=matches,
+ reference_word_ids=reference_word_ids,
+ confidence=len(matches) / len(ref_texts_clean),
+ )
+ candidate_anchors.append(anchor)

- self.logger.debug(f"🔍 N-GRAM {n}: Completed processing after {iteration_count} iterations, found {len(candidate_anchors)} anchors")
+ self.logger.debug(f"🔍 N-GRAM {n}: Found {len(candidate_anchors)} anchors")
  return candidate_anchors

  def find_anchors(
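
One subtlety in the hunk above: used reference positions now live on the finder (self.used_positions) instead of being local to _process_ngram_length, so a position claimed while matching a longer n-gram stays claimed when shorter n-grams are processed later. A small hypothetical sketch of that claim-once behaviour (toy names standing in for the instance state, not the package's API):

    from typing import Dict, Set

    used_positions: Dict[str, Set[int]] = {}

    def claim(source: str, pos: int) -> bool:
        """Return True only the first time a (source, position) pair is claimed."""
        taken = used_positions.setdefault(source, set())
        if pos in taken:
            return False
        taken.add(pos)
        return True

    print(claim("source_a", 10))  # True  - claimed while matching a 5-gram
    print(claim("source_a", 10))  # False - a later 3-gram cannot reuse that word
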
@@ -320,17 +319,18 @@ class AnchorSequenceFinder:
  start_time = time.time()

  try:
- self.logger.info(f"🔍 ANCHOR SEARCH: Starting anchor search (timeout: {self.timeout_seconds}s, sources: {list(references.keys())})")
- self.logger.debug(f"🔍 ANCHOR SEARCH: Transcribed text length: {len(transcribed)}")
+ self.logger.info(f"🔍 ANCHOR SEARCH: Starting find_anchors with timeout {self.timeout_seconds}s")
+ self.logger.info(f"🔍 ANCHOR SEARCH: Transcribed text length: {len(transcribed)}")
+ self.logger.info(f"🔍 ANCHOR SEARCH: Reference sources: {list(references.keys())}")

  cache_key = self._get_cache_key(transcribed, references, transcription_result)
  cache_path = self.cache_dir / f"anchors_{cache_key}.json"
- self.logger.debug(f"🔍 ANCHOR SEARCH: Cache key: {cache_key}")
+ self.logger.info(f"🔍 ANCHOR SEARCH: Cache key: {cache_key}")

  # Try to load from cache
- self.logger.debug(f"🔍 ANCHOR SEARCH: Checking cache at {cache_path}")
+ self.logger.info(f"🔍 ANCHOR SEARCH: Checking cache at {cache_path}")
  if cached_data := self._load_from_cache(cache_path):
- self.logger.info("🔍 ANCHOR SEARCH: ✅ Cache hit - loading anchors from cache")
+ self.logger.info("🔍 ANCHOR SEARCH: ✅ Cache hit! Loading anchors from cache")
  try:
  # Convert cached_data to dictionary before logging
  if cached_data:
@@ -347,25 +347,27 @@ class AnchorSequenceFinder:

  # If not in cache or cache format invalid, perform the computation
  self.logger.info(f"🔍 ANCHOR SEARCH: Cache miss - computing anchors")
- self.logger.debug(f"🔍 ANCHOR SEARCH: Finding anchor sequences for transcription with length {len(transcribed)}")
+
+ # Reset used positions for fresh computation
+ self.used_positions = {}

  # Check timeout before starting computation
  self._check_timeout(start_time, "anchor computation initialization")
- self.logger.debug(f"🔍 ANCHOR SEARCH: ✅ Timeout check passed - initialization")
+ self.logger.info(f"🔍 ANCHOR SEARCH: ✅ Timeout check passed - initialization")

  # Get all words from transcription
- self.logger.debug(f"🔍 ANCHOR SEARCH: Extracting words from transcription result...")
+ self.logger.info(f"🔍 ANCHOR SEARCH: Extracting words from transcription result...")
  all_words = []
  for segment in transcription_result.result.segments:
  all_words.extend(segment.words)
- self.logger.debug(f"🔍 ANCHOR SEARCH: ✅ Extracted {len(all_words)} words from transcription")
+ self.logger.info(f"🔍 ANCHOR SEARCH: ✅ Extracted {len(all_words)} words from transcription")

  # Clean and split texts
- self.logger.debug(f"🔍 ANCHOR SEARCH: Cleaning transcription words...")
+ self.logger.info(f"🔍 ANCHOR SEARCH: Cleaning transcription words...")
  trans_words = [w.text.lower().strip('.,?!"\n') for w in all_words]
- self.logger.debug(f"🔍 ANCHOR SEARCH: ✅ Cleaned {len(trans_words)} transcription words")
+ self.logger.info(f"🔍 ANCHOR SEARCH: ✅ Cleaned {len(trans_words)} transcription words")

- self.logger.debug(f"🔍 ANCHOR SEARCH: Processing reference sources...")
+ self.logger.info(f"🔍 ANCHOR SEARCH: Processing reference sources...")
  ref_texts_clean = {
  source: self._clean_text(" ".join(w.text for s in lyrics.segments for w in s.words)).split()
  for source, lyrics in references.items()
@@ -373,14 +375,14 @@ class AnchorSequenceFinder:
  ref_words = {source: [w for s in lyrics.segments for w in s.words] for source, lyrics in references.items()}

  for source, words in ref_texts_clean.items():
- self.logger.debug(f"🔍 ANCHOR SEARCH: Reference '{source}': {len(words)} words")
+ self.logger.info(f"🔍 ANCHOR SEARCH: Reference '{source}': {len(words)} words")

  # Check timeout after preprocessing
  self._check_timeout(start_time, "anchor computation preprocessing")
- self.logger.debug(f"🔍 ANCHOR SEARCH: ✅ Timeout check passed - preprocessing")
+ self.logger.info(f"🔍 ANCHOR SEARCH: ✅ Timeout check passed - preprocessing")

  # Filter out very short reference sources for n-gram length calculation
- self.logger.debug(f"🔍 ANCHOR SEARCH: Calculating n-gram lengths...")
+ self.logger.info(f"🔍 ANCHOR SEARCH: Calculating n-gram lengths...")
  valid_ref_lengths = [
  len(words) for words in ref_texts_clean.values()
  if len(words) >= self.min_sequence_length
@@ -393,10 +395,10 @@ class AnchorSequenceFinder:
  # Calculate max length using only valid reference sources
  max_length = min(len(trans_words), min(valid_ref_lengths))
  n_gram_lengths = range(max_length, self.min_sequence_length - 1, -1)
- self.logger.debug(f"🔍 ANCHOR SEARCH: N-gram lengths to process: {list(n_gram_lengths)} (max_length: {max_length})")
+ self.logger.info(f"🔍 ANCHOR SEARCH: N-gram lengths to process: {list(n_gram_lengths)} (max_length: {max_length})")

  # Process n-gram lengths in parallel with timeout
- self.logger.debug(f"🔍 ANCHOR SEARCH: Setting up parallel processing...")
+ self.logger.info(f"🔍 ANCHOR SEARCH: Setting up parallel processing...")
  process_length_partial = partial(
  self._process_ngram_length,
  trans_words=trans_words,
@@ -406,139 +408,61 @@ class AnchorSequenceFinder:
  min_sources=self.min_sources,
  )

- # Process n-gram lengths in parallel with timeout
+ # Process n-gram lengths sequentially (single-threaded for cloud compatibility)
  candidate_anchors = []
- pool_timeout = max(60, self.timeout_seconds // 2) if self.timeout_seconds > 0 else 300 # Use half the total timeout for pool operations

- # Check timeout before parallel processing
- self._check_timeout(start_time, "parallel processing start")
- self.logger.debug(f"🔍 ANCHOR SEARCH: Timeout check passed - about to start parallel processing")
+ # Check timeout before processing
+ self._check_timeout(start_time, "n-gram processing start")
+ self.logger.info(f"🔍 ANCHOR SEARCH: Starting sequential n-gram processing ({len(n_gram_lengths)} lengths)")

- pool = None
- try:
- num_processes = max(cpu_count() - 1, 1)
- self.logger.info(f"🔍 ANCHOR SEARCH: 🚀 Starting parallel processing ({num_processes} processes, {len(n_gram_lengths)} n-gram lengths)")
- pool = Pool(processes=num_processes)
- self.logger.debug(f"🔍 ANCHOR SEARCH: Pool created successfully")
- results = []
-
- # Submit all jobs first
- self.logger.debug(f"🔍 ANCHOR SEARCH: Submitting {len(n_gram_lengths)} n-gram processing jobs...")
- async_results = []
- for i, n in enumerate(n_gram_lengths):
- self.logger.debug(f"🔍 ANCHOR SEARCH: Submitting job {i+1}/{len(n_gram_lengths)} for n-gram length {n}")
- async_result = pool.apply_async(process_length_partial, (n,))
- async_results.append(async_result)
-
- self.logger.debug(f"🔍 ANCHOR SEARCH: ✅ All {len(async_results)} jobs submitted")
-
- # Collect results with individual timeouts
- batch_results = []
- batch_size = 10
-
- for i, async_result in enumerate(async_results):
- n_gram_length = n_gram_lengths[i]
- try:
- # Check remaining time for pool timeout (more lenient than overall timeout)
+ batch_size = 10
+ batch_results = []
+
+ for i, n in enumerate(n_gram_lengths):
+ try:
+ # Check timeout periodically
+ if self.timeout_seconds > 0:
  elapsed_time = time.time() - start_time
- remaining_time = max(10, self.timeout_seconds - elapsed_time) if self.timeout_seconds > 0 else pool_timeout
-
- self.logger.debug(f"🔍 ANCHOR SEARCH: Remaining time for n-gram {n_gram_length}: {remaining_time}s")
-
- # Use a more lenient timeout for individual results to allow fallback
- individual_timeout = min(pool_timeout, remaining_time) if self.timeout_seconds > 0 else pool_timeout
-
- result = async_result.get(timeout=individual_timeout)
- results.append(result)
-
- # Batch logging - collect info for batched logging
- batch_results.append((n_gram_length, len(result)))
-
- # Log progress every batch_size results or on the last result (at DEBUG level)
- if (i + 1) % batch_size == 0 or (i + 1) == len(async_results):
- total_anchors_in_batch = sum(anchor_count for _, anchor_count in batch_results)
- n_gram_ranges = [str(ng) for ng, _ in batch_results]
- range_str = f"{n_gram_ranges[0]}-{n_gram_ranges[-1]}" if len(n_gram_ranges) > 1 else n_gram_ranges[0]
- self.logger.debug(f"🔍 ANCHOR SEARCH: Completed n-gram lengths {range_str} ({i+1-len(batch_results)+1}-{i+1}/{len(async_results)}) - found {total_anchors_in_batch} anchors")
- batch_results = [] # Reset batch
-
- except Exception as e:
- self.logger.warning(f"🔍 ANCHOR SEARCH: ⚠️ n-gram length {n_gram_length} failed or timed out: {str(e)}")
- results.append([]) # Add empty result to maintain order
-
- # Add failed result to batch for logging
- batch_results.append((n_gram_length, 0))
-
- # If we're running short on time, trigger fallback early
- if self.timeout_seconds > 0 and (time.time() - start_time) > (self.timeout_seconds * 0.8):
- self.logger.warning(f"🔍 ANCHOR SEARCH: ⚠️ Approaching timeout limit, triggering early fallback")
- # Raise exception to trigger fallback to sequential processing
- raise Exception("Parallel processing timeout, triggering fallback")
-
- self.logger.debug(f"🔍 ANCHOR SEARCH: Parallel processing completed, combining results...")
- for anchors in results:
+ if elapsed_time > self.timeout_seconds:
+ self.logger.warning(f"🔍 ANCHOR SEARCH: ⏰ Timeout reached at n-gram {n}, stopping")
+ break
+
+ anchors = self._process_ngram_length(
+ n, trans_words, all_words, ref_texts_clean, ref_words, self.min_sources
+ )
  candidate_anchors.extend(anchors)
-
- # Explicitly cleanup pool to avoid hangs in containerized environments
- self.logger.debug(f"🔍 ANCHOR SEARCH: 🧹 Cleaning up pool...")
- pool.close()
- pool.terminate()
- self.logger.debug(f"🔍 ANCHOR SEARCH: Pool cleanup completed")
-
- except AnchorSequenceTimeoutError:
- self.logger.error(f"🔍 ANCHOR SEARCH: Parallel processing timed out")
- # Re-raise timeout errors
- raise
- except Exception as e:
- self.logger.error(f"🔍 ANCHOR SEARCH: ❌ Parallel processing failed: {str(e)}")
- # Fall back to sequential processing with timeout checks
- self.logger.info("🔍 ANCHOR SEARCH: Falling back to sequential processing")
- for n in n_gram_lengths:
- try:
- # Check timeout more leniently during sequential processing
- if self.timeout_seconds > 0:
- elapsed_time = time.time() - start_time
- # Allow more time for sequential processing (up to 2x the original timeout)
- if elapsed_time > (self.timeout_seconds * 2.0):
- self.logger.warning(f"🔍 ANCHOR SEARCH: ⏰ Sequential processing timeout for n-gram {n}")
- break
-
- self.logger.debug(f"🔍 ANCHOR SEARCH: Sequential processing n-gram length {n}")
+
+ # Batch logging
+ batch_results.append((n, len(anchors)))
+
+ # Log progress every batch_size results or on the last result
+ if (i + 1) % batch_size == 0 or (i + 1) == len(n_gram_lengths):
+ total_anchors_in_batch = sum(anchor_count for _, anchor_count in batch_results)
+ n_gram_ranges = [str(ng) for ng, _ in batch_results]
+ range_str = f"{n_gram_ranges[0]}-{n_gram_ranges[-1]}" if len(n_gram_ranges) > 1 else n_gram_ranges[0]
+ self.logger.debug(f"🔍 ANCHOR SEARCH: Completed n-gram lengths {range_str} - found {total_anchors_in_batch} anchors")
+ batch_results = []

- anchors = self._process_ngram_length(
- n, trans_words, all_words, ref_texts_clean, ref_words, self.min_sources
- )
- candidate_anchors.extend(anchors)
- self.logger.debug(f"🔍 ANCHOR SEARCH: Sequential n-gram {n} completed - found {len(anchors)} anchors")
- except Exception as e:
- self.logger.warning(f"🔍 ANCHOR SEARCH: ⚠️ Sequential processing failed for n-gram length {n}: {str(e)}")
- continue
- finally:
- # Always ensure pool is cleaned up to avoid hangs in containerized environments
- if pool is not None:
- try:
- self.logger.debug(f"🔍 ANCHOR SEARCH: 🧹 Final pool cleanup...")
- pool.terminate()
- pool.join(timeout=5) # Wait max 5 seconds for workers to terminate
- self.logger.debug(f"🔍 ANCHOR SEARCH: ✅ Final pool cleanup completed")
- except Exception as cleanup_error:
- self.logger.debug(f"🔍 ANCHOR SEARCH: ⚠️ Pool cleanup error (ignored): {cleanup_error}")
-
- self.logger.debug(f"🔍 ANCHOR SEARCH: Found {len(candidate_anchors)} candidate anchors in {time.time() - start_time:.1f}s")
+ except Exception as e:
+ self.logger.warning(f"🔍 ANCHOR SEARCH: ⚠️ n-gram length {n} failed: {str(e)}")
+ batch_results.append((n, 0))
+ continue
+
+ self.logger.info(f"🔍 ANCHOR SEARCH: ✅ Found {len(candidate_anchors)} candidate anchors in {time.time() - start_time:.1f}s")

  # Check timeout before expensive filtering operation
  self._check_timeout(start_time, "overlap filtering start")
- self.logger.debug(f"🔍 ANCHOR SEARCH: Starting overlap filtering...")
+ self.logger.info(f"🔍 ANCHOR SEARCH: 🔄 Starting overlap filtering...")

  filtered_anchors = self._remove_overlapping_sequences(candidate_anchors, transcribed, transcription_result)
- self.logger.debug(f"🔍 ANCHOR SEARCH: Filtering completed - {len(filtered_anchors)} final anchors")
+ self.logger.info(f"🔍 ANCHOR SEARCH: Filtering completed - {len(filtered_anchors)} final anchors")

  # Save to cache
- self.logger.debug(f"🔍 ANCHOR SEARCH: Saving results to cache...")
+ self.logger.info(f"🔍 ANCHOR SEARCH: 💾 Saving results to cache...")
  self._save_to_cache(cache_path, filtered_anchors)

  total_time = time.time() - start_time
- self.logger.info(f"🔍 ANCHOR SEARCH: Completed in {total_time:.1f}s - found {len(filtered_anchors)} anchors")
+ self.logger.info(f"🔍 ANCHOR SEARCH: 🎉 Anchor sequence computation completed successfully in {total_time:.1f}s")

  return filtered_anchors

@@ -618,13 +542,13 @@ class AnchorSequenceFinder:
  transcription_result: TranscriptionResult,
  ) -> List[ScoredAnchor]:
  """Remove overlapping sequences using phrase analysis with timeout protection."""
- self.logger.debug(f"🔍 FILTERING: Starting overlap removal for {len(anchors)} anchors")
+ self.logger.info(f"🔍 FILTERING: Starting overlap removal for {len(anchors)} anchors")

  if not anchors:
- self.logger.debug(f"🔍 FILTERING: No anchors to process")
+ self.logger.info(f"🔍 FILTERING: No anchors to process")
  return []

- self.logger.debug(f"🔍 FILTERING: Scoring {len(anchors)} anchors")
+ self.logger.info(f"🔍 FILTERING: Scoring {len(anchors)} anchors")

  # Create word map for scoring
  word_map = {w.id: w for s in transcription_result.result.segments for w in s.words}
@@ -669,123 +593,75 @@ class AnchorSequenceFinder:

  start_time = time.time()

- # Try different pool sizes with timeout
- num_processes = max(cpu_count() - 1, 1) # Leave one CPU free
- self.logger.info(f"🔍 FILTERING: Using {num_processes} processes for scoring")
-
- # Create a partial function with the context parameter fixed
- score_anchor_partial = partial(self._score_anchor_static, context=context)
-
- # Use multiprocessing to score anchors in parallel with timeout
+ # Score anchors sequentially using simple rule-based scoring
+ # (Avoids expensive spaCy NLP and works in cloud environments)
  scored_anchors = []
- pool_timeout = 300 # 5 minutes for scoring phase
-
- scoring_pool = None
- try:
- self.logger.debug(f"🔍 FILTERING: Starting parallel scoring with timeout {pool_timeout}s")
- scoring_pool = Pool(processes=num_processes)
- # Submit scoring jobs with timeout
- async_results = []
- batch_size = 50
-
- self.logger.debug(f"🔍 FILTERING: Splitting {len(anchors)} anchors into batches of {batch_size}")
- for i in range(0, len(anchors), batch_size):
- batch = anchors[i:i + batch_size]
- async_result = scoring_pool.apply_async(self._score_batch_static, (batch, context))
- async_results.append(async_result)
-
- self.logger.debug(f"🔍 FILTERING: Submitted {len(async_results)} scoring batches")
-
- # Collect results with timeout
- for i, async_result in enumerate(async_results):
- try:
- self.logger.debug(f"🔍 FILTERING: Collecting batch {i+1}/{len(async_results)}")
- batch_results = async_result.get(timeout=pool_timeout)
- scored_anchors.extend(batch_results)
- self.logger.debug(f"🔍 FILTERING: Completed scoring batch {i+1}/{len(async_results)}")
- except Exception as e:
- self.logger.warning(f"🔍 FILTERING: ⚠️ Scoring batch {i+1} failed or timed out: {str(e)}")
- # Add basic scores for failed batch
- start_idx = i * batch_size
- end_idx = min((i + 1) * batch_size, len(anchors))
- for j in range(start_idx, end_idx):
- if j < len(anchors):
- try:
- phrase_score = PhraseScore(
- total_score=1.0,
- natural_break_score=1.0,
- phrase_type=PhraseType.COMPLETE
- )
- scored_anchors.append(ScoredAnchor(anchor=anchors[j], phrase_score=phrase_score))
- except:
- continue
-
- # Explicitly cleanup pool to avoid hangs in containerized environments
- self.logger.debug(f"🔍 FILTERING: Cleaning up scoring pool...")
- scoring_pool.close()
- scoring_pool.terminate()
- self.logger.debug(f"🔍 FILTERING: Scoring pool cleanup completed")
-
- except Exception as e:
- self.logger.warning(f"🔍 FILTERING: ❌ Parallel scoring failed: {str(e)}, falling back to basic scoring")
- # Fall back to basic scoring
- for anchor in anchors:
- try:
- phrase_score = PhraseScore(
- total_score=1.0,
- natural_break_score=1.0,
- phrase_type=PhraseType.COMPLETE
- )
- scored_anchors.append(ScoredAnchor(anchor=anchor, phrase_score=phrase_score))
- except:
- continue
- finally:
- # Always ensure scoring pool is cleaned up to avoid hangs
- if scoring_pool is not None:
- try:
- self.logger.debug(f"🔍 FILTERING: Final scoring pool cleanup...")
- scoring_pool.terminate()
- scoring_pool.join(timeout=5) # Wait max 5 seconds for workers to terminate
- self.logger.debug(f"🔍 FILTERING: Final scoring pool cleanup completed")
- except Exception as cleanup_error:
- self.logger.debug(f"🔍 FILTERING: Scoring pool cleanup error (ignored): {cleanup_error}")
+ self.logger.debug(f"🔍 FILTERING: Scoring {len(anchors)} anchors sequentially")

- parallel_time = time.time() - start_time
- self.logger.debug(f"🔍 FILTERING: Parallel scoring completed in {parallel_time:.2f}s, scored {len(scored_anchors)} anchors")
+ for i, anchor in enumerate(anchors):
+ try:
+ # Simple rule-based scoring based on anchor properties
+ phrase_score = self._simple_score_anchor(anchor)
+ scored_anchors.append(ScoredAnchor(anchor=anchor, phrase_score=phrase_score))
+ except Exception as e:
+ # Fallback to default score on error
+ self.logger.debug(f"🔍 FILTERING: Scoring failed for anchor {i}: {e}")
+ phrase_score = PhraseScore(
+ phrase_type=PhraseType.COMPLETE,
+ natural_break_score=1.0,
+ length_score=1.0
+ )
+ scored_anchors.append(ScoredAnchor(anchor=anchor, phrase_score=phrase_score))

- # Sort and filter as before
+ scoring_time = time.time() - start_time
+ self.logger.debug(f"🔍 FILTERING: Scoring completed in {scoring_time:.2f}s, scored {len(scored_anchors)} anchors")
+
+ # Sort anchors by priority (highest first)
  self.logger.debug(f"🔍 FILTERING: Sorting anchors by priority...")
  scored_anchors.sort(key=self._get_sequence_priority, reverse=True)
- self.logger.debug(f"🔍 FILTERING: Sorting completed")

+ # O(N) overlap filtering using covered positions set
  self.logger.debug(f"🔍 FILTERING: Filtering {len(scored_anchors)} overlapping sequences")
  filtered_scored = []
-
- for i, scored_anchor in enumerate(scored_anchors):
- # Check timeout every 100 anchors using our timeout mechanism (more lenient)
- if i % 100 == 0 and i > 0:
- # Only check timeout if we're significantly over the limit
- if self.timeout_seconds > 0:
- elapsed_time = time.time() - start_time
- # Use a more lenient timeout for filtering (allow 50% more time)
- if elapsed_time > (self.timeout_seconds * 1.5):
- self.logger.warning(f"🔍 FILTERING: ⏰ Filtering timed out, returning {len(filtered_scored)} anchors out of {len(scored_anchors)}")
- break
-
- self.logger.debug(f"🔍 FILTERING: Progress: {i}/{len(scored_anchors)} processed, {len(filtered_scored)} kept")
-
- overlaps = False
- for existing in filtered_scored:
- if self._sequences_overlap(scored_anchor.anchor, existing.anchor):
- overlaps = True
- break
+ covered_positions: Set[int] = set()

- if not overlaps:
+ for scored_anchor in scored_anchors:
+ anchor = scored_anchor.anchor
+ start_pos = anchor.transcription_position
+ end_pos = start_pos + anchor.length
+
+ # Check if any position in this anchor's range is already covered
+ anchor_positions = set(range(start_pos, end_pos))
+ if not anchor_positions & covered_positions: # No overlap with covered
  filtered_scored.append(scored_anchor)
+ covered_positions.update(anchor_positions)

- self.logger.debug(f"🔍 FILTERING: Filtering completed - kept {len(filtered_scored)} non-overlapping anchors out of {len(scored_anchors)}")
+ self.logger.debug(f"🔍 FILTERING: Kept {len(filtered_scored)} non-overlapping anchors out of {len(scored_anchors)}")
  return filtered_scored

+ def _simple_score_anchor(self, anchor: AnchorSequence) -> PhraseScore:
+ """
+ Simple rule-based scoring for anchors without expensive NLP.
+
+ Scoring criteria:
+ - Longer sequences are preferred (length_score)
+ - Sequences matching more reference sources are preferred (natural_break_score)
+ - All sequences treated as COMPLETE type for simplicity
+ """
+ # Length score: normalize to 0-1 range (3-15 words typical)
+ length = anchor.length
+ length_score = min(1.0, (length - 2) / 10.0) # 3 words = 0.1, 12 words = 1.0
+
+ # Source match score: more sources = higher score
+ num_sources = len(anchor.reference_positions)
+ natural_break_score = min(1.0, num_sources / 3.0) # 1 source = 0.33, 3+ sources = 1.0
+
+ return PhraseScore(
+ phrase_type=PhraseType.COMPLETE,
+ natural_break_score=natural_break_score,
+ length_score=length_score
+ )
+
  @staticmethod
  def _score_anchor_static(anchor: AnchorSequence, context: str) -> ScoredAnchor:
  """Static version of _score_anchor for multiprocessing compatibility."""
@@ -816,9 +692,9 @@ class AnchorSequenceFinder:
  except Exception:
  # Add basic score for failed anchor
  phrase_score = PhraseScore(
- total_score=1.0,
+ phrase_type=PhraseType.COMPLETE,
  natural_break_score=1.0,
- phrase_type=PhraseType.COMPLETE
+ length_score=1.0
  )
  scored_anchors.append(ScoredAnchor(anchor=anchor, phrase_score=phrase_score))