lyrics-transcriber 0.65.1__py3-none-any.whl → 0.66.0__py3-none-any.whl

This diff compares the content of two publicly released versions of the package as published to their public registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the registry.
@@ -194,6 +194,10 @@ class AnchorSequenceFinder:
  min_sources: int,
  ) -> List[AnchorSequence]:
  """Process a single n-gram length to find matching sequences with timeout and early termination."""
+ self.logger.info(f"🔍 N-GRAM {n}: Starting processing with {len(trans_words)} transcription words")
+ self.logger.info(f"🔍 N-GRAM {n}: Reference sources: {list(ref_texts_clean.keys())}")
+ self.logger.info(f"🔍 N-GRAM {n}: Max iterations limit: {self.max_iterations_per_ngram}")
+
  candidate_anchors = []
  used_positions = {source: set() for source in ref_texts_clean.keys()}
  used_trans_positions = set()
@@ -203,20 +207,31 @@ class AnchorSequenceFinder:
  last_anchor_count = 0
  stagnation_count = 0

- self.logger.debug(f"Processing n-gram length {n} with max {self.max_iterations_per_ngram} iterations")
+ self.logger.debug(f"🔍 N-GRAM {n}: Processing n-gram length {n} with max {self.max_iterations_per_ngram} iterations")
+
+ # Generate n-grams from transcribed text once
+ trans_ngrams = self._find_ngrams(trans_words, n)
+ self.logger.info(f"🔍 N-GRAM {n}: Generated {len(trans_ngrams)} n-grams for processing")

+ # Process all n-grams efficiently in multiple passes
  found_new_match = True
  while found_new_match and iteration_count < self.max_iterations_per_ngram:
  found_new_match = False
  iteration_count += 1
+ anchors_found_this_iteration = 0
+
+ # Log every 10th iteration to track progress
+ if iteration_count % 10 == 0:
+ self.logger.debug(f"🔍 N-GRAM {n}: Iteration {iteration_count}, anchors found: {len(candidate_anchors)}")

  # Check for progress stagnation every N iterations
  if iteration_count - last_progress_check >= self.progress_check_interval:
  current_anchor_count = len(candidate_anchors)
  if current_anchor_count == last_anchor_count:
  stagnation_count += 1
+ self.logger.debug(f"🔍 N-GRAM {n}: Stagnation check {stagnation_count}/3 at iteration {iteration_count}")
  if stagnation_count >= 3: # No progress for 3 consecutive checks
- self.logger.debug(f"Early termination for n-gram length {n} due to stagnation after {iteration_count} iterations")
+ self.logger.info(f"🔍 N-GRAM {n}: ⏹️ Early termination due to stagnation after {iteration_count} iterations")
  break
  else:
  stagnation_count = 0 # Reset stagnation counter
@@ -224,11 +239,9 @@ class AnchorSequenceFinder:
  last_anchor_count = current_anchor_count
  last_progress_check = iteration_count

- self.logger.debug(f"n-gram {n}: iteration {iteration_count}, anchors: {current_anchor_count}, stagnation: {stagnation_count}")
-
- # Generate n-grams from transcribed text
- trans_ngrams = self._find_ngrams(trans_words, n)
+ self.logger.debug(f"🔍 N-GRAM {n}: iteration {iteration_count}, anchors: {current_anchor_count}, stagnation: {stagnation_count}")

+ # Process all n-grams in this iteration
  for ngram, trans_pos in trans_ngrams:
  # Skip if we've already used this transcription position
  if trans_pos in used_trans_positions:
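The two hunks above make two structural changes to `_process_ngram_length`: the transcription n-grams are generated once, before the matching loop, instead of on every iteration, and the loop now abandons an n-gram length after three consecutive progress checks with no new anchors. A minimal sketch of that control flow, with a hypothetical `find_new_anchors` callable and parameter names standing in for the class internals:

```python
def match_with_stagnation_guard(ngrams, find_new_anchors,
                                max_iterations=1000,
                                progress_check_interval=50):
    """Iterate until no new matches, the iteration cap, or three stagnant checks."""
    anchors = []
    last_count = 0
    last_check = 0
    stagnation = 0
    iteration = 0
    found_new = True

    while found_new and iteration < max_iterations:
        found_new = False
        iteration += 1

        # Periodically compare progress against the previous checkpoint.
        if iteration - last_check >= progress_check_interval:
            if len(anchors) == last_count:
                stagnation += 1
                if stagnation >= 3:  # no progress for 3 consecutive checks
                    break
            else:
                stagnation = 0
            last_count = len(anchors)
            last_check = iteration

        new_anchors = find_new_anchors(ngrams, anchors)
        if new_anchors:
            anchors.extend(new_anchors)
            found_new = True

    return anchors
```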
@@ -239,11 +252,11 @@ class AnchorSequenceFinder:
  ngram_words = [w.lower() for w in ngram]

  if actual_words != ngram_words:
- self.logger.error(f"Mismatch between ngram and actual words at position {trans_pos}:")
- self.logger.error(f"Ngram words: {ngram_words}")
- self.logger.error(f"Actual words: {actual_words}")
- self.logger.error(f"Full trans_words: {trans_words}")
- self.logger.error(f"Full all_words: {[w.text for w in all_words]}")
+ self.logger.error(f"🔍 N-GRAM {n}: ❌ Mismatch between ngram and actual words at position {trans_pos}:")
+ self.logger.error(f"🔍 N-GRAM {n}: Ngram words: {ngram_words}")
+ self.logger.error(f"🔍 N-GRAM {n}: Actual words: {actual_words}")
+ self.logger.error(f"🔍 N-GRAM {n}: Full trans_words: {trans_words}")
+ self.logger.error(f"🔍 N-GRAM {n}: Full all_words: {[w.text for w in all_words]}")
  raise AssertionError(
  f"Ngram words don't match actual words at position {trans_pos}. "
  f"This should never happen as trans_words should be derived from all_words."
@@ -251,6 +264,10 @@ class AnchorSequenceFinder:

  matches = self._find_matching_sources(ngram, ref_texts_clean, n)
  if len(matches) >= min_sources:
+ # Log successful match
+ if len(candidate_anchors) < 5: # Only log first few matches to avoid spam
+ self.logger.debug(f"🔍 N-GRAM {n}: ✅ Found match: '{' '.join(ngram)}' at pos {trans_pos} with {len(matches)} sources")
+
  # Get Word IDs for transcribed words
  transcribed_word_ids = [w.id for w in all_words[trans_pos : trans_pos + n]]

@@ -271,13 +288,26 @@ class AnchorSequenceFinder:
  confidence=len(matches) / len(ref_texts_clean),
  )
  candidate_anchors.append(anchor)
+ anchors_found_this_iteration += 1
  found_new_match = True
- break
+
+ # For efficiency, if we have very low iteration limits, find one match per iteration
+ if self.max_iterations_per_ngram <= 10:
+ break
+
+ # Log progress for this iteration
+ if anchors_found_this_iteration > 0:
+ self.logger.debug(f"🔍 N-GRAM {n}: Found {anchors_found_this_iteration} anchors in iteration {iteration_count}")
+
+ # Early termination if we've found enough anchors or processed all positions
+ if len(used_trans_positions) >= len(trans_ngrams) or len(candidate_anchors) >= len(trans_ngrams):
+ self.logger.info(f"🔍 N-GRAM {n}: ⏹️ Early termination - processed all positions after {iteration_count} iterations")
+ break

  if iteration_count >= self.max_iterations_per_ngram:
- self.logger.warning(f"n-gram length {n} processing terminated after reaching max iterations ({self.max_iterations_per_ngram})")
+ self.logger.warning(f"🔍 N-GRAM {n}: Processing terminated after reaching max iterations ({self.max_iterations_per_ngram})")

- self.logger.debug(f"Completed n-gram length {n} processing after {iteration_count} iterations, found {len(candidate_anchors)} anchors")
+ self.logger.info(f"🔍 N-GRAM {n}: ✅ Completed processing after {iteration_count} iterations, found {len(candidate_anchors)} anchors")
  return candidate_anchors

  def find_anchors(
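The hunk above also changes the inner loop from "stop after the first new anchor" to "take every matching n-gram in a pass" (the single-match `break` is now only used for very low iteration limits), and exits once every transcription start position has been consumed. A simplified sketch of that single-pass matching, assuming a `matches(words)` predicate and n-grams given as `(words, position)` pairs:

```python
def collect_anchors_single_pass(trans_ngrams, matches):
    """Greedily take every matching n-gram whose start position is still free."""
    used_positions = set()
    anchors = []

    for words, pos in trans_ngrams:
        if pos in used_positions:
            continue  # this start position is already anchored
        if matches(words):
            anchors.append((words, pos))
            used_positions.add(pos)

    return anchors
```

In the real method this pass still sits inside the outer `while` loop shown earlier; the exhaustion check (`len(used_trans_positions) >= len(trans_ngrams)`) is what lets that loop stop as soon as a pass has nothing left to claim.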
@@ -290,19 +320,25 @@ class AnchorSequenceFinder:
  start_time = time.time()

  try:
+ self.logger.info(f"🔍 ANCHOR SEARCH: Starting find_anchors with timeout {self.timeout_seconds}s")
+ self.logger.info(f"🔍 ANCHOR SEARCH: Transcribed text length: {len(transcribed)}")
+ self.logger.info(f"🔍 ANCHOR SEARCH: Reference sources: {list(references.keys())}")
+
  cache_key = self._get_cache_key(transcribed, references, transcription_result)
  cache_path = self.cache_dir / f"anchors_{cache_key}.json"
+ self.logger.info(f"🔍 ANCHOR SEARCH: Cache key: {cache_key}")

  # Try to load from cache
+ self.logger.info(f"🔍 ANCHOR SEARCH: Checking cache at {cache_path}")
  if cached_data := self._load_from_cache(cache_path):
- self.logger.info("Loading anchors from cache")
+ self.logger.info("🔍 ANCHOR SEARCH: ✅ Cache hit! Loading anchors from cache")
  try:
  # Convert cached_data to dictionary before logging
  if cached_data:
  first_anchor = {"anchor": cached_data[0].anchor.to_dict(), "phrase_score": cached_data[0].phrase_score.to_dict()}
  return cached_data
  except Exception as e:
- self.logger.error(f"Unexpected error loading cache: {type(e).__name__}: {e}")
+ self.logger.error(f"🔍 ANCHOR SEARCH: ❌ Error loading cache: {type(e).__name__}: {e}")
  if cached_data:
  try:
  first_anchor = {"anchor": cached_data[0].anchor.to_dict(), "phrase_score": cached_data[0].phrase_score.to_dict()}
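The cache path here is a content-keyed JSON file (`anchors_{cache_key}.json`), and an unreadable cache entry is treated as a miss rather than an error. A small self-contained sketch of that pattern; the key derivation and helper names below are illustrative stand-ins, not the package's actual `_get_cache_key`, `_load_from_cache`, or `_save_to_cache`:

```python
import hashlib
import json
from pathlib import Path


def cache_key(transcribed: str, references: dict) -> str:
    """Derive a stable key from the inputs that determine the result."""
    payload = json.dumps({"transcribed": transcribed, "references": references},
                         sort_keys=True)
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()[:16]


def load_cached_anchors(cache_dir: Path, key: str):
    """Return cached anchors if present and parseable, else None."""
    path = cache_dir / f"anchors_{key}.json"
    if not path.exists():
        return None
    try:
        return json.loads(path.read_text(encoding="utf-8"))
    except (OSError, json.JSONDecodeError):
        return None  # treat an unreadable cache file as a miss


def save_cached_anchors(cache_dir: Path, key: str, anchors: list) -> None:
    cache_dir.mkdir(parents=True, exist_ok=True)
    (cache_dir / f"anchors_{key}.json").write_text(json.dumps(anchors), encoding="utf-8")
```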
@@ -311,43 +347,57 @@ class AnchorSequenceFinder:
  self.logger.error("Could not serialize first cached anchor for logging")

  # If not in cache or cache format invalid, perform the computation
- self.logger.info(f"Cache miss for key {cache_key} - computing anchors with timeout {self.timeout_seconds}s")
- self.logger.info(f"Finding anchor sequences for transcription with length {len(transcribed)}")
+ self.logger.info(f"🔍 ANCHOR SEARCH: Cache miss - computing anchors with timeout {self.timeout_seconds}s")
+ self.logger.info(f"🔍 ANCHOR SEARCH: Finding anchor sequences for transcription with length {len(transcribed)}")

  # Check timeout before starting computation
  self._check_timeout(start_time, "anchor computation initialization")
+ self.logger.info(f"🔍 ANCHOR SEARCH: ✅ Timeout check passed - initialization")

  # Get all words from transcription
+ self.logger.info(f"🔍 ANCHOR SEARCH: Extracting words from transcription result...")
  all_words = []
  for segment in transcription_result.result.segments:
  all_words.extend(segment.words)
+ self.logger.info(f"🔍 ANCHOR SEARCH: ✅ Extracted {len(all_words)} words from transcription")

  # Clean and split texts
+ self.logger.info(f"🔍 ANCHOR SEARCH: Cleaning transcription words...")
  trans_words = [w.text.lower().strip('.,?!"\n') for w in all_words]
+ self.logger.info(f"🔍 ANCHOR SEARCH: ✅ Cleaned {len(trans_words)} transcription words")
+
+ self.logger.info(f"🔍 ANCHOR SEARCH: Processing reference sources...")
  ref_texts_clean = {
  source: self._clean_text(" ".join(w.text for s in lyrics.segments for w in s.words)).split()
  for source, lyrics in references.items()
  }
  ref_words = {source: [w for s in lyrics.segments for w in s.words] for source, lyrics in references.items()}
+
+ for source, words in ref_texts_clean.items():
+ self.logger.info(f"🔍 ANCHOR SEARCH: Reference '{source}': {len(words)} words")

  # Check timeout after preprocessing
  self._check_timeout(start_time, "anchor computation preprocessing")
+ self.logger.info(f"🔍 ANCHOR SEARCH: ✅ Timeout check passed - preprocessing")

  # Filter out very short reference sources for n-gram length calculation
+ self.logger.info(f"🔍 ANCHOR SEARCH: Calculating n-gram lengths...")
  valid_ref_lengths = [
  len(words) for words in ref_texts_clean.values()
  if len(words) >= self.min_sequence_length
  ]

  if not valid_ref_lengths:
- self.logger.warning("No reference sources long enough for anchor detection")
+ self.logger.warning("🔍 ANCHOR SEARCH: ❌ No reference sources long enough for anchor detection")
  return []

  # Calculate max length using only valid reference sources
  max_length = min(len(trans_words), min(valid_ref_lengths))
  n_gram_lengths = range(max_length, self.min_sequence_length - 1, -1)
+ self.logger.info(f"🔍 ANCHOR SEARCH: N-gram lengths to process: {list(n_gram_lengths)} (max_length: {max_length})")

  # Process n-gram lengths in parallel with timeout
+ self.logger.info(f"🔍 ANCHOR SEARCH: Setting up parallel processing...")
  process_length_partial = partial(
  self._process_ngram_length,
  trans_words=trans_words,
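The n-gram lengths to try run from the longest span that can exist in both the transcription and the shortest usable reference down to `min_sequence_length`, longest first. A small worked sketch of that calculation, mirroring the `max_length` / `range(...)` lines above:

```python
def ngram_lengths(trans_len: int, ref_lens: list, min_sequence_length: int) -> list:
    """Longest-first n-gram lengths shared by the transcription and usable references."""
    valid = [length for length in ref_lens if length >= min_sequence_length]
    if not valid:
        return []  # no reference source long enough for anchor detection
    max_length = min(trans_len, min(valid))
    return list(range(max_length, min_sequence_length - 1, -1))


# A 120-word transcription with references of 80 and 2 words and a minimum length of 3:
# the 2-word reference is too short to count, so the lengths run 80, 79, ..., 3.
print(ngram_lengths(120, [80, 2], 3))
```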
@@ -363,83 +413,115 @@ class AnchorSequenceFinder:

  # Check timeout before parallel processing
  self._check_timeout(start_time, "parallel processing start")
+ self.logger.info(f"🔍 ANCHOR SEARCH: ✅ Timeout check passed - about to start parallel processing")

  try:
+ self.logger.info(f"🔍 ANCHOR SEARCH: 🚀 Starting parallel processing with {max(cpu_count() - 1, 1)} processes, pool timeout: {pool_timeout}s")
  with Pool(processes=max(cpu_count() - 1, 1)) as pool:
- self.logger.debug(f"Starting parallel processing with timeout {pool_timeout}s")
+ self.logger.debug(f"🔍 ANCHOR SEARCH: Pool created successfully")
  results = []

  # Submit all jobs first
- async_results = [pool.apply_async(process_length_partial, (n,)) for n in n_gram_lengths]
+ self.logger.info(f"🔍 ANCHOR SEARCH: Submitting {len(n_gram_lengths)} n-gram processing jobs...")
+ async_results = []
+ for i, n in enumerate(n_gram_lengths):
+ self.logger.debug(f"🔍 ANCHOR SEARCH: Submitting job {i+1}/{len(n_gram_lengths)} for n-gram length {n}")
+ async_result = pool.apply_async(process_length_partial, (n,))
+ async_results.append(async_result)
+
+ self.logger.info(f"🔍 ANCHOR SEARCH: ✅ All {len(async_results)} jobs submitted")

  # Collect results with individual timeouts
  for i, async_result in enumerate(async_results):
+ n_gram_length = n_gram_lengths[i]
  try:
- # Check timeout before each result collection
- self._check_timeout(start_time, f"collecting n-gram {n_gram_lengths[i]} results")
+ self.logger.info(f"🔍 ANCHOR SEARCH: Collecting result {i+1}/{len(async_results)} for n-gram length {n_gram_length}")

- # Check remaining time for pool timeout
+ # Check remaining time for pool timeout (more lenient than overall timeout)
  elapsed_time = time.time() - start_time
  remaining_time = max(10, self.timeout_seconds - elapsed_time) if self.timeout_seconds > 0 else pool_timeout

- result = async_result.get(timeout=min(pool_timeout, remaining_time))
+ self.logger.debug(f"🔍 ANCHOR SEARCH: Remaining time for n-gram {n_gram_length}: {remaining_time}s")
+
+ # Use a more lenient timeout for individual results to allow fallback
+ individual_timeout = min(pool_timeout, remaining_time) if self.timeout_seconds > 0 else pool_timeout
+
+ result = async_result.get(timeout=individual_timeout)
  results.append(result)

- self.logger.debug(f"Completed n-gram length {n_gram_lengths[i]} ({i+1}/{len(n_gram_lengths)})")
+ self.logger.info(f"🔍 ANCHOR SEARCH: ✅ Completed n-gram length {n_gram_length} ({i+1}/{len(n_gram_lengths)}) - found {len(result)} anchors")

- except AnchorSequenceTimeoutError:
- # Re-raise timeout errors
- raise
  except Exception as e:
- self.logger.warning(f"n-gram length {n_gram_lengths[i]} failed or timed out: {str(e)}")
+ self.logger.warning(f"🔍 ANCHOR SEARCH: ⚠️ n-gram length {n_gram_length} failed or timed out: {str(e)}")
  results.append([]) # Add empty result to maintain order
+
+ # If we're running short on time, trigger fallback early
+ if self.timeout_seconds > 0 and (time.time() - start_time) > (self.timeout_seconds * 0.8):
+ self.logger.warning(f"🔍 ANCHOR SEARCH: ⚠️ Approaching timeout limit, triggering early fallback")
+ # Raise exception to trigger fallback to sequential processing
+ raise Exception("Parallel processing timeout, triggering fallback")

+ self.logger.info(f"🔍 ANCHOR SEARCH: ✅ Parallel processing completed, combining results...")
  for anchors in results:
  candidate_anchors.extend(anchors)

  except AnchorSequenceTimeoutError:
+ self.logger.error(f"🔍 ANCHOR SEARCH: ❌ Parallel processing timed out")
  # Re-raise timeout errors
  raise
  except Exception as e:
- self.logger.error(f"Parallel processing failed: {str(e)}")
+ self.logger.error(f"🔍 ANCHOR SEARCH: ❌ Parallel processing failed: {str(e)}")
  # Fall back to sequential processing with timeout checks
- self.logger.info("Falling back to sequential processing")
+ self.logger.info("🔍 ANCHOR SEARCH: 🔄 Falling back to sequential processing")
  for n in n_gram_lengths:
  try:
- # Check timeout before each n-gram length
- self._check_timeout(start_time, f"sequential processing n-gram {n}")
+ # Check timeout more leniently during sequential processing
+ if self.timeout_seconds > 0:
+ elapsed_time = time.time() - start_time
+ # Allow more time for sequential processing (up to 2x the original timeout)
+ if elapsed_time > (self.timeout_seconds * 2.0):
+ self.logger.warning(f"🔍 ANCHOR SEARCH: ⏰ Sequential processing timeout for n-gram {n}")
+ break
+
+ self.logger.info(f"🔍 ANCHOR SEARCH: 🔄 Sequential processing n-gram length {n}")

  anchors = self._process_ngram_length(
  n, trans_words, all_words, ref_texts_clean, ref_words, self.min_sources
  )
  candidate_anchors.extend(anchors)
- except AnchorSequenceTimeoutError:
- # Re-raise timeout errors
- raise
+ self.logger.info(f"🔍 ANCHOR SEARCH: ✅ Sequential n-gram {n} completed - found {len(anchors)} anchors")
  except Exception as e:
- self.logger.warning(f"Sequential processing failed for n-gram length {n}: {str(e)}")
+ self.logger.warning(f"🔍 ANCHOR SEARCH: ⚠️ Sequential processing failed for n-gram length {n}: {str(e)}")
  continue

- self.logger.info(f"Found {len(candidate_anchors)} candidate anchors in {time.time() - start_time:.1f}s")
+ self.logger.info(f"🔍 ANCHOR SEARCH: ✅ Found {len(candidate_anchors)} candidate anchors in {time.time() - start_time:.1f}s")

  # Check timeout before expensive filtering operation
  self._check_timeout(start_time, "overlap filtering start")
+ self.logger.info(f"🔍 ANCHOR SEARCH: 🔄 Starting overlap filtering...")

  filtered_anchors = self._remove_overlapping_sequences(candidate_anchors, transcribed, transcription_result)
+ self.logger.info(f"🔍 ANCHOR SEARCH: ✅ Filtering completed - {len(filtered_anchors)} final anchors")

  # Save to cache
+ self.logger.info(f"🔍 ANCHOR SEARCH: 💾 Saving results to cache...")
  self._save_to_cache(cache_path, filtered_anchors)

  total_time = time.time() - start_time
- self.logger.info(f"Anchor sequence computation completed in {total_time:.1f}s")
+ self.logger.info(f"🔍 ANCHOR SEARCH: 🎉 Anchor sequence computation completed successfully in {total_time:.1f}s")

  return filtered_anchors

  except AnchorSequenceTimeoutError:
- self.logger.error(f"Anchor sequence computation timed out after {self.timeout_seconds} seconds")
+ elapsed_time = time.time() - start_time
+ self.logger.error(f"🔍 ANCHOR SEARCH: ⏰ TIMEOUT after {elapsed_time:.1f}s (limit: {self.timeout_seconds}s)")
  raise
  except Exception as e:
- self.logger.error(f"Anchor sequence computation failed: {str(e)}")
+ elapsed_time = time.time() - start_time
+ self.logger.error(f"🔍 ANCHOR SEARCH: ❌ FAILED after {elapsed_time:.1f}s: {str(e)}")
+ self.logger.error(f"🔍 ANCHOR SEARCH: Exception type: {type(e).__name__}")
+ import traceback
+ self.logger.error(f"🔍 ANCHOR SEARCH: Traceback: {traceback.format_exc()}")
  raise
  finally:
  # No cleanup needed for time-based timeout checks
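The hunk above keeps the same overall shape: one `Pool.apply_async` job per n-gram length, each result collected with its own deadline, and a fall back to sequential processing (with a more lenient 2x budget) when the parallel path fails or runs over roughly 80% of the time budget. A condensed, self-contained sketch of that submit/collect/fallback shape, with a stand-in `process_length` worker in place of `_process_ngram_length` and the budgets passed as plain parameters:

```python
import time
from multiprocessing import Pool, cpu_count


def process_length(n: int) -> list:
    """Stand-in worker; the real code calls _process_ngram_length via functools.partial."""
    return [f"anchor-{n}"]


def run_lengths(lengths, timeout_seconds=60, pool_timeout=600):
    """Submit one job per length, collect with per-result deadlines, fall back to sequential."""
    start = time.time()
    results = []
    try:
        with Pool(processes=max(cpu_count() - 1, 1)) as pool:
            jobs = [pool.apply_async(process_length, (n,)) for n in lengths]
            for n, job in zip(lengths, jobs):
                remaining = max(10, timeout_seconds - (time.time() - start))
                try:
                    results.append(job.get(timeout=min(pool_timeout, remaining)))
                except Exception:
                    results.append([])  # keep one result slot per length
                    # Near the overall budget: bail out so the sequential fallback runs.
                    if time.time() - start > timeout_seconds * 0.8:
                        raise RuntimeError("parallel processing over budget")
    except Exception:
        results = []  # sequential fallback, with a more lenient (2x) budget
        for n in lengths:
            if time.time() - start > timeout_seconds * 2.0:
                break
            results.append(process_length(n))
    return [anchor for batch in results for anchor in batch]


if __name__ == "__main__":
    print(run_lengths([5, 4, 3]))
```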
@@ -506,16 +588,20 @@ class AnchorSequenceFinder:
  transcription_result: TranscriptionResult,
  ) -> List[ScoredAnchor]:
  """Remove overlapping sequences using phrase analysis with timeout protection."""
+ self.logger.info(f"🔍 FILTERING: Starting overlap removal for {len(anchors)} anchors")
+
  if not anchors:
+ self.logger.info(f"🔍 FILTERING: No anchors to process")
  return []

- self.logger.info(f"Scoring {len(anchors)} anchors")
+ self.logger.info(f"🔍 FILTERING: Scoring {len(anchors)} anchors")

  # Create word map for scoring
  word_map = {w.id: w for s in transcription_result.result.segments for w in s.words}
+ self.logger.debug(f"🔍 FILTERING: Created word map with {len(word_map)} words")

  # Add word map to each anchor for scoring
- for anchor in anchors:
+ for i, anchor in enumerate(anchors):
  # For backwards compatibility, only add transcribed_words if all IDs exist in word_map
  try:
  anchor.transcribed_words = [word_map[word_id] for word_id in anchor.transcribed_word_ids]
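Scoring needs `Word` objects rather than IDs, so a flat id-to-word map is built once and each anchor resolves its `transcribed_word_ids` through it, with a `KeyError` treated as "leave this anchor unresolved" for backwards compatibility. A toy sketch of that lookup with a minimal stand-in `Word` and segments simplified to plain lists of words:

```python
from dataclasses import dataclass


@dataclass
class Word:
    id: str
    text: str


def attach_words(anchor_word_ids, segments):
    """Resolve each anchor's word IDs against a flat map built from all segments."""
    word_map = {w.id: w for segment in segments for w in segment}
    resolved = []
    for word_ids in anchor_word_ids:
        try:
            resolved.append([word_map[word_id] for word_id in word_ids])
        except KeyError:
            resolved.append(None)  # an ID is missing: leave the anchor unresolved
    return resolved


segments = [[Word("a", "hello"), Word("b", "world")], [Word("c", "again")]]
print(attach_words([["a", "b"], ["b", "z"]], segments))
```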
@@ -555,7 +641,7 @@ class AnchorSequenceFinder:

  # Try different pool sizes with timeout
  num_processes = max(cpu_count() - 1, 1) # Leave one CPU free
- self.logger.info(f"Using {num_processes} processes for scoring")
+ self.logger.info(f"🔍 FILTERING: Using {num_processes} processes for scoring")

  # Create a partial function with the context parameter fixed
  score_anchor_partial = partial(self._score_anchor_static, context=context)
@@ -565,24 +651,29 @@ class AnchorSequenceFinder:

  pool_timeout = 300 # 5 minutes for scoring phase

+ self.logger.info(f"🔍 FILTERING: 🚀 Starting parallel scoring with timeout {pool_timeout}s")
  with Pool(processes=num_processes) as pool:
  # Submit scoring jobs with timeout
  async_results = []
  batch_size = 50

+ self.logger.info(f"🔍 FILTERING: Splitting {len(anchors)} anchors into batches of {batch_size}")
  for i in range(0, len(anchors), batch_size):
  batch = anchors[i:i + batch_size]
  async_result = pool.apply_async(self._score_batch_static, (batch, context))
  async_results.append(async_result)

+ self.logger.info(f"🔍 FILTERING: Submitted {len(async_results)} scoring batches")
+
  # Collect results with timeout
  for i, async_result in enumerate(async_results):
  try:
+ self.logger.debug(f"🔍 FILTERING: ⏳ Collecting batch {i+1}/{len(async_results)}")
  batch_results = async_result.get(timeout=pool_timeout)
  scored_anchors.extend(batch_results)
- self.logger.debug(f"Completed scoring batch {i+1}/{len(async_results)}")
+ self.logger.debug(f"🔍 FILTERING: ✅ Completed scoring batch {i+1}/{len(async_results)}")
  except Exception as e:
- self.logger.warning(f"Scoring batch {i+1} failed or timed out: {str(e)}")
+ self.logger.warning(f"🔍 FILTERING: ⚠️ Scoring batch {i+1} failed or timed out: {str(e)}")
  # Add basic scores for failed batch
  start_idx = i * batch_size
  end_idx = min((i + 1) * batch_size, len(anchors))
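Scoring is batched (50 anchors per job), so each `apply_async` call carries a chunk rather than a single anchor, and a failed or timed-out batch degrades to placeholder scores instead of aborting the run. A sketch of that batching pattern with a stand-in `score_batch` worker in place of `_score_batch_static`:

```python
from multiprocessing import Pool, cpu_count


def score_batch(batch: list) -> list:
    """Stand-in worker; the real code calls _score_batch_static with a scoring context."""
    return [(item, len(str(item))) for item in batch]


def score_all(anchors, batch_size=50, pool_timeout=300):
    scored = []
    with Pool(processes=max(cpu_count() - 1, 1)) as pool:
        batches = [anchors[i:i + batch_size] for i in range(0, len(anchors), batch_size)]
        jobs = [pool.apply_async(score_batch, (batch,)) for batch in batches]
        for batch, job in zip(batches, jobs):
            try:
                scored.extend(job.get(timeout=pool_timeout))
            except Exception:
                # Failed or timed-out batch: fall back to a basic placeholder score.
                scored.extend((item, 0) for item in batch)
    return scored


if __name__ == "__main__":
    print(score_all(list(range(7)), batch_size=3))
```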
@@ -599,7 +690,7 @@ class AnchorSequenceFinder:
  continue

  except Exception as e:
- self.logger.warning(f"Parallel scoring failed: {str(e)}, falling back to basic scoring")
+ self.logger.warning(f"🔍 FILTERING: ❌ Parallel scoring failed: {str(e)}, falling back to basic scoring")
  # Fall back to basic scoring
  for anchor in anchors:
  try:
@@ -613,22 +704,28 @@ class AnchorSequenceFinder:
  continue

  parallel_time = time.time() - start_time
- self.logger.info(f"Parallel scoring took {parallel_time:.2f} seconds")
+ self.logger.info(f"🔍 FILTERING: ✅ Parallel scoring completed in {parallel_time:.2f}s, scored {len(scored_anchors)} anchors")

  # Sort and filter as before
+ self.logger.info(f"🔍 FILTERING: 🔄 Sorting anchors by priority...")
  scored_anchors.sort(key=self._get_sequence_priority, reverse=True)
+ self.logger.info(f"🔍 FILTERING: ✅ Sorting completed")

- self.logger.info(f"Filtering {len(scored_anchors)} overlapping sequences")
+ self.logger.info(f"🔍 FILTERING: 🔄 Filtering {len(scored_anchors)} overlapping sequences")
  filtered_scored = []

  for i, scored_anchor in enumerate(scored_anchors):
- # Check timeout every 100 anchors using our timeout mechanism
- if i % 100 == 0:
- try:
- self._check_timeout(start_time, f"filtering anchors (processed {i}/{len(scored_anchors)})")
- except AnchorSequenceTimeoutError:
- self.logger.warning(f"Filtering timed out, returning {len(filtered_scored)} anchors out of {len(scored_anchors)}")
- break
+ # Check timeout every 100 anchors using our timeout mechanism (more lenient)
+ if i % 100 == 0 and i > 0:
+ # Only check timeout if we're significantly over the limit
+ if self.timeout_seconds > 0:
+ elapsed_time = time.time() - start_time
+ # Use a more lenient timeout for filtering (allow 50% more time)
+ if elapsed_time > (self.timeout_seconds * 1.5):
+ self.logger.warning(f"🔍 FILTERING: ⏰ Filtering timed out, returning {len(filtered_scored)} anchors out of {len(scored_anchors)}")
+ break
+
+ self.logger.debug(f"🔍 FILTERING: Progress: {i}/{len(scored_anchors)} processed, {len(filtered_scored)} kept")

  overlaps = False
  for existing in filtered_scored:
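After scoring, anchors are sorted by priority and kept greedily unless they overlap an anchor already kept, with a periodic timeout check that now only fires every 100 anchors and only at 1.5x the budget. A sketch of that greedy selection, assuming each anchor exposes a half-open `(start, end)` span and a numeric score (stand-ins for `_get_sequence_priority` and the real overlap test):

```python
import time


def filter_overlaps(scored_anchors, timeout_seconds=0, start_time=None):
    """Keep highest-priority anchors first, dropping any that overlap a kept one."""
    start_time = start_time or time.time()
    ordered = sorted(scored_anchors, key=lambda a: a["score"], reverse=True)
    kept = []

    for i, anchor in enumerate(ordered):
        # Lenient timeout: only every 100 anchors, and only at 1.5x the budget.
        if i % 100 == 0 and i > 0 and timeout_seconds > 0:
            if time.time() - start_time > timeout_seconds * 1.5:
                break

        overlaps = any(anchor["start"] < k["end"] and k["start"] < anchor["end"]
                       for k in kept)
        if not overlaps:
            kept.append(anchor)

    return kept


anchors = [{"start": 0, "end": 4, "score": 0.9},
           {"start": 2, "end": 6, "score": 0.8},
           {"start": 6, "end": 9, "score": 0.7}]
print(filter_overlaps(anchors))  # keeps (0, 4) and (6, 9); (2, 6) overlaps the first
```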
@@ -639,7 +736,7 @@ class AnchorSequenceFinder:
  if not overlaps:
  filtered_scored.append(scored_anchor)

- self.logger.info(f"Filtered down to {len(filtered_scored)} non-overlapping anchors")
+ self.logger.info(f"🔍 FILTERING: Filtering completed - kept {len(filtered_scored)} non-overlapping anchors out of {len(scored_anchors)}")
  return filtered_scored

  @staticmethod
@@ -2,7 +2,7 @@
  "name": "lyrics-transcriber-frontend",
  "private": true,
  "homepage": "https://nomadkaraoke.github.io/lyrics-transcriber-frontend",
- "version": "0.65.1",
+ "version": "0.66.0",
  "type": "module",
  "scripts": {
  "dev": "vite",
@@ -38915,7 +38915,7 @@ const theme = createTheme({
  spacing: (factor) => `${0.6 * factor}rem`
  // Further reduced from 0.8 * factor
  });
- const version = "0.65.1";
+ const version = "0.66.0";
  const packageJson = {
  version
  };
@@ -38926,4 +38926,4 @@ ReactDOM$1.createRoot(document.getElementById("root")).render(
  /* @__PURE__ */ jsxRuntimeExports.jsx(App, {})
  ] })
  );
- //# sourceMappingURL=index-BDSHneNc.js.map
+ //# sourceMappingURL=index-BMWgZ3MR.js.map