lyrics-transcriber 0.65.1__py3-none-any.whl → 0.68.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lyrics_transcriber/core/config.py +1 -1
- lyrics_transcriber/core/controller.py +22 -0
- lyrics_transcriber/correction/anchor_sequence.py +169 -59
- lyrics_transcriber/frontend/package.json +1 -1
- lyrics_transcriber/frontend/web_assets/assets/{index-BDSHneNc.js → index-D7BQUJXK.js} +2 -2
- lyrics_transcriber/frontend/web_assets/assets/{index-BDSHneNc.js.map → index-D7BQUJXK.js.map} +1 -1
- lyrics_transcriber/frontend/web_assets/index.html +1 -1
- lyrics_transcriber/lyrics/base_lyrics_provider.py +1 -1
- lyrics_transcriber/output/ass/config.py +37 -0
- lyrics_transcriber/output/ass/lyrics_line.py +1 -1
- lyrics_transcriber/output/generator.py +21 -5
- lyrics_transcriber/output/subtitles.py +2 -1
- {lyrics_transcriber-0.65.1.dist-info → lyrics_transcriber-0.68.0.dist-info}/METADATA +1 -1
- {lyrics_transcriber-0.65.1.dist-info → lyrics_transcriber-0.68.0.dist-info}/RECORD +17 -17
- {lyrics_transcriber-0.65.1.dist-info → lyrics_transcriber-0.68.0.dist-info}/LICENSE +0 -0
- {lyrics_transcriber-0.65.1.dist-info → lyrics_transcriber-0.68.0.dist-info}/WHEEL +0 -0
- {lyrics_transcriber-0.65.1.dist-info → lyrics_transcriber-0.68.0.dist-info}/entry_points.txt +0 -0
lyrics_transcriber/core/config.py:

@@ -26,7 +26,7 @@ class OutputConfig:
     """Configuration for output generation."""

     output_styles_json: str
-
+    default_max_line_length: int = 36
     styles: Dict[str, Any] = field(default_factory=dict)
     output_dir: Optional[str] = os.getcwd()
     cache_dir: str = os.getenv(
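The new field is an ordinary dataclass default that callers can override; later in this diff it serves as the fallback when no styles JSON supplies a line length. A minimal sketch of the behaviour, assuming only the field names visible in the hunk above (the class here is illustrative, not the package's real OutputConfig):

from dataclasses import dataclass, field
from typing import Any, Dict

@dataclass
class OutputConfigSketch:
    output_styles_json: str
    default_max_line_length: int = 36  # fallback line length, as added in this diff
    styles: Dict[str, Any] = field(default_factory=dict)

cfg = OutputConfigSketch(output_styles_json="styles.json")
print(cfg.default_max_line_length)  # 36 unless the caller passes a different value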
lyrics_transcriber/core/controller.py:

@@ -109,6 +109,9 @@ class LyricsTranscriber:
         # Initialize results
         self.results = LyricsControllerResult()

+        # Load styles early so lyrics providers can use them
+        self._load_styles()
+
         # Initialize components (with dependency injection)
         self.transcribers = transcribers or self._initialize_transcribers()
         self.lyrics_providers = lyrics_providers or self._initialize_lyrics_providers()
@@ -127,6 +130,20 @@ class LyricsTranscriber:
         if self.output_config.render_video:
             self.logger.info(f" Video resolution: {self.output_config.video_resolution}")

+    def _load_styles(self) -> None:
+        """Load styles from JSON file if available."""
+        if self.output_config.output_styles_json and os.path.exists(self.output_config.output_styles_json):
+            try:
+                with open(self.output_config.output_styles_json, "r") as f:
+                    self.output_config.styles = json.load(f)
+                self.logger.debug(f"Loaded output styles from: {self.output_config.output_styles_json}")
+            except Exception as e:
+                self.logger.warning(f"Failed to load output styles file: {str(e)}")
+                self.output_config.styles = {}
+        else:
+            self.logger.debug("No styles JSON file provided or file does not exist")
+            self.output_config.styles = {}
+
     def _sanitize_filename(self, filename: str) -> str:
         """Replace or remove characters that are unsafe for filenames."""
         if not filename:
@@ -189,6 +206,10 @@ class LyricsTranscriber:
         """Initialize available lyrics providers."""
         providers = {}

+        # Get max_line_length from styles if available, otherwise use config default
+        max_line_length = self.output_config.styles.get("karaoke", {}).get("max_line_length", self.output_config.default_max_line_length)
+        self.logger.info(f"Using max_line_length for lyrics providers: {max_line_length}")
+
         # Create provider config with all necessary parameters
         provider_config = LyricsProviderConfig(
             genius_api_token=self.lyrics_config.genius_api_token,
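Combined with the _load_styles change above, the effective resolution order is: karaoke.max_line_length from the styles JSON when present, otherwise the new config default of 36. A small self-contained sketch of that lookup; the JSON content is hypothetical, only the "karaoke" / "max_line_length" keys and the default of 36 come from the diff:

import json

styles = json.loads('{"karaoke": {"max_line_length": 42}}')  # stand-in for a loaded styles file
default_max_line_length = 36

max_line_length = styles.get("karaoke", {}).get("max_line_length", default_max_line_length)
print(max_line_length)  # 42 here; falls back to 36 if the section or key is missing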
@@ -197,6 +218,7 @@ class LyricsTranscriber:
             lyrics_file=self.lyrics_config.lyrics_file,
             cache_dir=self.output_config.cache_dir,
             audio_filepath=self.audio_filepath,
+            max_line_length=max_line_length,
         )

         if provider_config.lyrics_file and os.path.exists(provider_config.lyrics_file):
lyrics_transcriber/correction/anchor_sequence.py:

@@ -194,6 +194,10 @@ class AnchorSequenceFinder:
         min_sources: int,
     ) -> List[AnchorSequence]:
         """Process a single n-gram length to find matching sequences with timeout and early termination."""
+        self.logger.info(f"🔍 N-GRAM {n}: Starting processing with {len(trans_words)} transcription words")
+        self.logger.info(f"🔍 N-GRAM {n}: Reference sources: {list(ref_texts_clean.keys())}")
+        self.logger.info(f"🔍 N-GRAM {n}: Max iterations limit: {self.max_iterations_per_ngram}")
+
         candidate_anchors = []
         used_positions = {source: set() for source in ref_texts_clean.keys()}
         used_trans_positions = set()
@@ -203,20 +207,31 @@ class AnchorSequenceFinder:
         last_anchor_count = 0
         stagnation_count = 0

-        self.logger.debug(f"Processing n-gram length {n} with max {self.max_iterations_per_ngram} iterations")
+        self.logger.debug(f"🔍 N-GRAM {n}: Processing n-gram length {n} with max {self.max_iterations_per_ngram} iterations")
+
+        # Generate n-grams from transcribed text once
+        trans_ngrams = self._find_ngrams(trans_words, n)
+        self.logger.info(f"🔍 N-GRAM {n}: Generated {len(trans_ngrams)} n-grams for processing")

+        # Process all n-grams efficiently in multiple passes
         found_new_match = True
         while found_new_match and iteration_count < self.max_iterations_per_ngram:
             found_new_match = False
             iteration_count += 1
+            anchors_found_this_iteration = 0
+
+            # Log every 10th iteration to track progress
+            if iteration_count % 10 == 0:
+                self.logger.debug(f"🔍 N-GRAM {n}: Iteration {iteration_count}, anchors found: {len(candidate_anchors)}")

             # Check for progress stagnation every N iterations
             if iteration_count - last_progress_check >= self.progress_check_interval:
                 current_anchor_count = len(candidate_anchors)
                 if current_anchor_count == last_anchor_count:
                     stagnation_count += 1
+                    self.logger.debug(f"🔍 N-GRAM {n}: Stagnation check {stagnation_count}/3 at iteration {iteration_count}")
                     if stagnation_count >= 3:  # No progress for 3 consecutive checks
-                        self.logger.
+                        self.logger.info(f"🔍 N-GRAM {n}: ⏹️ Early termination due to stagnation after {iteration_count} iterations")
                         break
                 else:
                     stagnation_count = 0  # Reset stagnation counter
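The reworked loop adds two guards: a hard cap on iterations and a stagnation check that stops after three consecutive progress checks with no new anchors. A standalone sketch of that termination pattern, with a dummy producer and arbitrary placeholder numbers (only the "three stagnant checks" rule comes from the diff):

def run_until_stagnant(produce, max_iterations=100, check_interval=5, max_stagnant_checks=3):
    results = []
    last_count = 0
    stagnant_checks = 0
    for iteration in range(1, max_iterations + 1):
        results.extend(produce(iteration))
        if iteration % check_interval == 0:
            if len(results) == last_count:
                stagnant_checks += 1
                if stagnant_checks >= max_stagnant_checks:
                    break  # no progress for several consecutive checks
            else:
                stagnant_checks = 0
            last_count = len(results)
    return results

# The producer dries up after 11 items, so the loop stops well before max_iterations:
print(len(run_until_stagnant(lambda i: [i] if i < 12 else [])))  # 11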
@@ -224,11 +239,9 @@ class AnchorSequenceFinder:
                 last_anchor_count = current_anchor_count
                 last_progress_check = iteration_count

-                self.logger.debug(f"
-
-            # Generate n-grams from transcribed text
-            trans_ngrams = self._find_ngrams(trans_words, n)
+                self.logger.debug(f"🔍 N-GRAM {n}: iteration {iteration_count}, anchors: {current_anchor_count}, stagnation: {stagnation_count}")

+            # Process all n-grams in this iteration
             for ngram, trans_pos in trans_ngrams:
                 # Skip if we've already used this transcription position
                 if trans_pos in used_trans_positions:
@@ -239,11 +252,11 @@ class AnchorSequenceFinder:
                 ngram_words = [w.lower() for w in ngram]

                 if actual_words != ngram_words:
-                    self.logger.error(f"Mismatch between ngram and actual words at position {trans_pos}:")
-                    self.logger.error(f"Ngram words: {ngram_words}")
-                    self.logger.error(f"Actual words: {actual_words}")
-                    self.logger.error(f"Full trans_words: {trans_words}")
-                    self.logger.error(f"Full all_words: {[w.text for w in all_words]}")
+                    self.logger.error(f"🔍 N-GRAM {n}: ❌ Mismatch between ngram and actual words at position {trans_pos}:")
+                    self.logger.error(f"🔍 N-GRAM {n}: Ngram words: {ngram_words}")
+                    self.logger.error(f"🔍 N-GRAM {n}: Actual words: {actual_words}")
+                    self.logger.error(f"🔍 N-GRAM {n}: Full trans_words: {trans_words}")
+                    self.logger.error(f"🔍 N-GRAM {n}: Full all_words: {[w.text for w in all_words]}")
                     raise AssertionError(
                         f"Ngram words don't match actual words at position {trans_pos}. "
                         f"This should never happen as trans_words should be derived from all_words."
@@ -251,6 +264,10 @@ class AnchorSequenceFinder:

                 matches = self._find_matching_sources(ngram, ref_texts_clean, n)
                 if len(matches) >= min_sources:
+                    # Log successful match
+                    if len(candidate_anchors) < 5:  # Only log first few matches to avoid spam
+                        self.logger.debug(f"🔍 N-GRAM {n}: ✅ Found match: '{' '.join(ngram)}' at pos {trans_pos} with {len(matches)} sources")
+
                     # Get Word IDs for transcribed words
                     transcribed_word_ids = [w.id for w in all_words[trans_pos : trans_pos + n]]

@@ -271,13 +288,26 @@ class AnchorSequenceFinder:
                         confidence=len(matches) / len(ref_texts_clean),
                     )
                     candidate_anchors.append(anchor)
+                    anchors_found_this_iteration += 1
                     found_new_match = True
-
+
+                    # For efficiency, if we have very low iteration limits, find one match per iteration
+                    if self.max_iterations_per_ngram <= 10:
+                        break
+
+            # Log progress for this iteration
+            if anchors_found_this_iteration > 0:
+                self.logger.debug(f"🔍 N-GRAM {n}: Found {anchors_found_this_iteration} anchors in iteration {iteration_count}")
+
+            # Early termination if we've found enough anchors or processed all positions
+            if len(used_trans_positions) >= len(trans_ngrams) or len(candidate_anchors) >= len(trans_ngrams):
+                self.logger.info(f"🔍 N-GRAM {n}: ⏹️ Early termination - processed all positions after {iteration_count} iterations")
+                break

         if iteration_count >= self.max_iterations_per_ngram:
-            self.logger.warning(f"
+            self.logger.warning(f"🔍 N-GRAM {n}: ⏰ Processing terminated after reaching max iterations ({self.max_iterations_per_ngram})")

-        self.logger.
+        self.logger.info(f"🔍 N-GRAM {n}: ✅ Completed processing after {iteration_count} iterations, found {len(candidate_anchors)} anchors")
         return candidate_anchors

     def find_anchors(
@@ -290,19 +320,25 @@ class AnchorSequenceFinder:
         start_time = time.time()

         try:
+            self.logger.info(f"🔍 ANCHOR SEARCH: Starting find_anchors with timeout {self.timeout_seconds}s")
+            self.logger.info(f"🔍 ANCHOR SEARCH: Transcribed text length: {len(transcribed)}")
+            self.logger.info(f"🔍 ANCHOR SEARCH: Reference sources: {list(references.keys())}")
+
             cache_key = self._get_cache_key(transcribed, references, transcription_result)
             cache_path = self.cache_dir / f"anchors_{cache_key}.json"
+            self.logger.info(f"🔍 ANCHOR SEARCH: Cache key: {cache_key}")

             # Try to load from cache
+            self.logger.info(f"🔍 ANCHOR SEARCH: Checking cache at {cache_path}")
             if cached_data := self._load_from_cache(cache_path):
-                self.logger.info("Loading anchors from cache")
+                self.logger.info("🔍 ANCHOR SEARCH: ✅ Cache hit! Loading anchors from cache")
                 try:
                     # Convert cached_data to dictionary before logging
                     if cached_data:
                         first_anchor = {"anchor": cached_data[0].anchor.to_dict(), "phrase_score": cached_data[0].phrase_score.to_dict()}
                     return cached_data
                 except Exception as e:
-                    self.logger.error(f"
+                    self.logger.error(f"🔍 ANCHOR SEARCH: ❌ Error loading cache: {type(e).__name__}: {e}")
                     if cached_data:
                         try:
                             first_anchor = {"anchor": cached_data[0].anchor.to_dict(), "phrase_score": cached_data[0].phrase_score.to_dict()}
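The cache path is derived from a key over the inputs, and the walrus-operator check short-circuits the whole computation on a hit. A minimal sketch of that load-or-compute-and-save shape; the file name, cache format, and the "expensive" step are placeholders, not the package's real cache layout:

import json
from pathlib import Path

def load_from_cache(path: Path):
    # Return parsed JSON on success, None on any miss or error
    if path.exists():
        try:
            return json.loads(path.read_text())
        except Exception:
            return None
    return None

cache_path = Path("anchors_example.json")
if (cached := load_from_cache(cache_path)) is not None:
    result = cached
else:
    result = {"anchors": [1, 2, 3]}            # stand-in for the expensive computation
    cache_path.write_text(json.dumps(result))  # save for the next run
print(result)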
@@ -311,43 +347,57 @@ class AnchorSequenceFinder:
                             self.logger.error("Could not serialize first cached anchor for logging")

             # If not in cache or cache format invalid, perform the computation
-            self.logger.info(f"
-            self.logger.info(f"Finding anchor sequences for transcription with length {len(transcribed)}")
+            self.logger.info(f"🔍 ANCHOR SEARCH: ❌ Cache miss - computing anchors with timeout {self.timeout_seconds}s")
+            self.logger.info(f"🔍 ANCHOR SEARCH: Finding anchor sequences for transcription with length {len(transcribed)}")

             # Check timeout before starting computation
             self._check_timeout(start_time, "anchor computation initialization")
+            self.logger.info(f"🔍 ANCHOR SEARCH: ✅ Timeout check passed - initialization")

             # Get all words from transcription
+            self.logger.info(f"🔍 ANCHOR SEARCH: Extracting words from transcription result...")
             all_words = []
             for segment in transcription_result.result.segments:
                 all_words.extend(segment.words)
+            self.logger.info(f"🔍 ANCHOR SEARCH: ✅ Extracted {len(all_words)} words from transcription")

             # Clean and split texts
+            self.logger.info(f"🔍 ANCHOR SEARCH: Cleaning transcription words...")
             trans_words = [w.text.lower().strip('.,?!"\n') for w in all_words]
+            self.logger.info(f"🔍 ANCHOR SEARCH: ✅ Cleaned {len(trans_words)} transcription words")
+
+            self.logger.info(f"🔍 ANCHOR SEARCH: Processing reference sources...")
             ref_texts_clean = {
                 source: self._clean_text(" ".join(w.text for s in lyrics.segments for w in s.words)).split()
                 for source, lyrics in references.items()
             }
             ref_words = {source: [w for s in lyrics.segments for w in s.words] for source, lyrics in references.items()}
+
+            for source, words in ref_texts_clean.items():
+                self.logger.info(f"🔍 ANCHOR SEARCH: Reference '{source}': {len(words)} words")

             # Check timeout after preprocessing
             self._check_timeout(start_time, "anchor computation preprocessing")
+            self.logger.info(f"🔍 ANCHOR SEARCH: ✅ Timeout check passed - preprocessing")

             # Filter out very short reference sources for n-gram length calculation
+            self.logger.info(f"🔍 ANCHOR SEARCH: Calculating n-gram lengths...")
             valid_ref_lengths = [
                 len(words) for words in ref_texts_clean.values()
                 if len(words) >= self.min_sequence_length
             ]

             if not valid_ref_lengths:
-                self.logger.warning("No reference sources long enough for anchor detection")
+                self.logger.warning("🔍 ANCHOR SEARCH: ❌ No reference sources long enough for anchor detection")
                 return []

             # Calculate max length using only valid reference sources
             max_length = min(len(trans_words), min(valid_ref_lengths))
             n_gram_lengths = range(max_length, self.min_sequence_length - 1, -1)
+            self.logger.info(f"🔍 ANCHOR SEARCH: N-gram lengths to process: {list(n_gram_lengths)} (max_length: {max_length})")

             # Process n-gram lengths in parallel with timeout
+            self.logger.info(f"🔍 ANCHOR SEARCH: Setting up parallel processing...")
             process_length_partial = partial(
                 self._process_ngram_length,
                 trans_words=trans_words,
@@ -363,83 +413,128 @@ class AnchorSequenceFinder:

             # Check timeout before parallel processing
             self._check_timeout(start_time, "parallel processing start")
+            self.logger.info(f"🔍 ANCHOR SEARCH: ✅ Timeout check passed - about to start parallel processing")

             try:
+                self.logger.info(f"🔍 ANCHOR SEARCH: 🚀 Starting parallel processing with {max(cpu_count() - 1, 1)} processes, pool timeout: {pool_timeout}s")
                 with Pool(processes=max(cpu_count() - 1, 1)) as pool:
-                    self.logger.debug(f"
+                    self.logger.debug(f"🔍 ANCHOR SEARCH: Pool created successfully")
                     results = []

                     # Submit all jobs first
-
+                    self.logger.info(f"🔍 ANCHOR SEARCH: Submitting {len(n_gram_lengths)} n-gram processing jobs...")
+                    async_results = []
+                    for i, n in enumerate(n_gram_lengths):
+                        self.logger.debug(f"🔍 ANCHOR SEARCH: Submitting job {i+1}/{len(n_gram_lengths)} for n-gram length {n}")
+                        async_result = pool.apply_async(process_length_partial, (n,))
+                        async_results.append(async_result)
+
+                    self.logger.info(f"🔍 ANCHOR SEARCH: ✅ All {len(async_results)} jobs submitted")

                     # Collect results with individual timeouts
+                    batch_results = []
+                    batch_size = 10
+
                     for i, async_result in enumerate(async_results):
+                        n_gram_length = n_gram_lengths[i]
                         try:
-                            # Check timeout
-                            self._check_timeout(start_time, f"collecting n-gram {n_gram_lengths[i]} results")
-
-                            # Check remaining time for pool timeout
+                            # Check remaining time for pool timeout (more lenient than overall timeout)
                             elapsed_time = time.time() - start_time
                             remaining_time = max(10, self.timeout_seconds - elapsed_time) if self.timeout_seconds > 0 else pool_timeout

-
+                            self.logger.debug(f"🔍 ANCHOR SEARCH: Remaining time for n-gram {n_gram_length}: {remaining_time}s")
+
+                            # Use a more lenient timeout for individual results to allow fallback
+                            individual_timeout = min(pool_timeout, remaining_time) if self.timeout_seconds > 0 else pool_timeout
+
+                            result = async_result.get(timeout=individual_timeout)
                             results.append(result)

-
+                            # Batch logging - collect info for batched logging
+                            batch_results.append((n_gram_length, len(result)))
+
+                            # Log progress every batch_size results or on the last result
+                            if (i + 1) % batch_size == 0 or (i + 1) == len(async_results):
+                                total_anchors_in_batch = sum(anchor_count for _, anchor_count in batch_results)
+                                n_gram_ranges = [str(ng) for ng, _ in batch_results]
+                                range_str = f"{n_gram_ranges[0]}-{n_gram_ranges[-1]}" if len(n_gram_ranges) > 1 else n_gram_ranges[0]
+                                self.logger.info(f"🔍 ANCHOR SEARCH: ✅ Completed n-gram lengths {range_str} ({i+1-len(batch_results)+1}-{i+1}/{len(async_results)}) - found {total_anchors_in_batch} anchors total")
+                                batch_results = []  # Reset batch

-                        except AnchorSequenceTimeoutError:
-                            # Re-raise timeout errors
-                            raise
                         except Exception as e:
-                            self.logger.warning(f"n-gram length {
+                            self.logger.warning(f"🔍 ANCHOR SEARCH: ⚠️ n-gram length {n_gram_length} failed or timed out: {str(e)}")
                             results.append([])  # Add empty result to maintain order
+
+                            # Add failed result to batch for logging
+                            batch_results.append((n_gram_length, 0))
+
+                            # If we're running short on time, trigger fallback early
+                            if self.timeout_seconds > 0 and (time.time() - start_time) > (self.timeout_seconds * 0.8):
+                                self.logger.warning(f"🔍 ANCHOR SEARCH: ⚠️ Approaching timeout limit, triggering early fallback")
+                                # Raise exception to trigger fallback to sequential processing
+                                raise Exception("Parallel processing timeout, triggering fallback")

+                self.logger.info(f"🔍 ANCHOR SEARCH: ✅ Parallel processing completed, combining results...")
                 for anchors in results:
                     candidate_anchors.extend(anchors)

             except AnchorSequenceTimeoutError:
+                self.logger.error(f"🔍 ANCHOR SEARCH: ❌ Parallel processing timed out")
                 # Re-raise timeout errors
                 raise
             except Exception as e:
-                self.logger.error(f"Parallel processing failed: {str(e)}")
+                self.logger.error(f"🔍 ANCHOR SEARCH: ❌ Parallel processing failed: {str(e)}")
                 # Fall back to sequential processing with timeout checks
-                self.logger.info("Falling back to sequential processing")
+                self.logger.info("🔍 ANCHOR SEARCH: 🔄 Falling back to sequential processing")
                 for n in n_gram_lengths:
                     try:
-                        # Check timeout
-                        self.
+                        # Check timeout more leniently during sequential processing
+                        if self.timeout_seconds > 0:
+                            elapsed_time = time.time() - start_time
+                            # Allow more time for sequential processing (up to 2x the original timeout)
+                            if elapsed_time > (self.timeout_seconds * 2.0):
+                                self.logger.warning(f"🔍 ANCHOR SEARCH: ⏰ Sequential processing timeout for n-gram {n}")
+                                break
+
+                        self.logger.info(f"🔍 ANCHOR SEARCH: 🔄 Sequential processing n-gram length {n}")

                         anchors = self._process_ngram_length(
                             n, trans_words, all_words, ref_texts_clean, ref_words, self.min_sources
                         )
                         candidate_anchors.extend(anchors)
-
-                        # Re-raise timeout errors
-                        raise
+                        self.logger.info(f"🔍 ANCHOR SEARCH: ✅ Sequential n-gram {n} completed - found {len(anchors)} anchors")
                     except Exception as e:
-                        self.logger.warning(f"Sequential processing failed for n-gram length {n}: {str(e)}")
+                        self.logger.warning(f"🔍 ANCHOR SEARCH: ⚠️ Sequential processing failed for n-gram length {n}: {str(e)}")
                         continue

-            self.logger.info(f"Found {len(candidate_anchors)} candidate anchors in {time.time() - start_time:.1f}s")
+            self.logger.info(f"🔍 ANCHOR SEARCH: ✅ Found {len(candidate_anchors)} candidate anchors in {time.time() - start_time:.1f}s")

             # Check timeout before expensive filtering operation
             self._check_timeout(start_time, "overlap filtering start")
+            self.logger.info(f"🔍 ANCHOR SEARCH: 🔄 Starting overlap filtering...")

             filtered_anchors = self._remove_overlapping_sequences(candidate_anchors, transcribed, transcription_result)
+            self.logger.info(f"🔍 ANCHOR SEARCH: ✅ Filtering completed - {len(filtered_anchors)} final anchors")

             # Save to cache
+            self.logger.info(f"🔍 ANCHOR SEARCH: 💾 Saving results to cache...")
             self._save_to_cache(cache_path, filtered_anchors)

             total_time = time.time() - start_time
-            self.logger.info(f"Anchor sequence computation completed in {total_time:.1f}s")
+            self.logger.info(f"🔍 ANCHOR SEARCH: 🎉 Anchor sequence computation completed successfully in {total_time:.1f}s")

             return filtered_anchors

         except AnchorSequenceTimeoutError:
-
+            elapsed_time = time.time() - start_time
+            self.logger.error(f"🔍 ANCHOR SEARCH: ⏰ TIMEOUT after {elapsed_time:.1f}s (limit: {self.timeout_seconds}s)")
             raise
         except Exception as e:
-
+            elapsed_time = time.time() - start_time
+            self.logger.error(f"🔍 ANCHOR SEARCH: ❌ FAILED after {elapsed_time:.1f}s: {str(e)}")
+            self.logger.error(f"🔍 ANCHOR SEARCH: Exception type: {type(e).__name__}")
+            import traceback
+            self.logger.error(f"🔍 ANCHOR SEARCH: Traceback: {traceback.format_exc()}")
             raise
         finally:
             # No cleanup needed for time-based timeout checks
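The collection strategy above submits every n-gram length with apply_async, waits on each result with its own timeout, and falls back to sequential (in-process) execution when the pool fails or time runs short. A compact sketch of that shape; the worker function, job list, and timeout values are placeholders:

from multiprocessing import Pool, cpu_count

def slow_square(n):
    return n * n

if __name__ == "__main__":
    jobs = list(range(8))
    results = []
    try:
        with Pool(processes=max(cpu_count() - 1, 1)) as pool:
            async_results = [pool.apply_async(slow_square, (n,)) for n in jobs]
            for n, async_result in zip(jobs, async_results):
                try:
                    results.append(async_result.get(timeout=5))  # per-result timeout
                except Exception:
                    results.append(None)  # keep ordering; recomputed below
    except Exception:
        results = [None] * len(jobs)

    # Sequential fallback for anything the pool did not produce
    results = [slow_square(n) if r is None else r for n, r in zip(jobs, results)]
    print(results)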
@@ -506,16 +601,20 @@ class AnchorSequenceFinder:
         transcription_result: TranscriptionResult,
     ) -> List[ScoredAnchor]:
         """Remove overlapping sequences using phrase analysis with timeout protection."""
+        self.logger.info(f"🔍 FILTERING: Starting overlap removal for {len(anchors)} anchors")
+
         if not anchors:
+            self.logger.info(f"🔍 FILTERING: No anchors to process")
             return []

-        self.logger.info(f"Scoring {len(anchors)} anchors")
+        self.logger.info(f"🔍 FILTERING: Scoring {len(anchors)} anchors")

         # Create word map for scoring
         word_map = {w.id: w for s in transcription_result.result.segments for w in s.words}
+        self.logger.debug(f"🔍 FILTERING: Created word map with {len(word_map)} words")

         # Add word map to each anchor for scoring
-        for anchor in anchors:
+        for i, anchor in enumerate(anchors):
             # For backwards compatibility, only add transcribed_words if all IDs exist in word_map
             try:
                 anchor.transcribed_words = [word_map[word_id] for word_id in anchor.transcribed_word_ids]
@@ -555,7 +654,7 @@ class AnchorSequenceFinder:

         # Try different pool sizes with timeout
         num_processes = max(cpu_count() - 1, 1)  # Leave one CPU free
-        self.logger.info(f"Using {num_processes} processes for scoring")
+        self.logger.info(f"🔍 FILTERING: Using {num_processes} processes for scoring")

         # Create a partial function with the context parameter fixed
         score_anchor_partial = partial(self._score_anchor_static, context=context)
@@ -565,24 +664,29 @@ class AnchorSequenceFinder:
         pool_timeout = 300  # 5 minutes for scoring phase

         try:
+            self.logger.info(f"🔍 FILTERING: 🚀 Starting parallel scoring with timeout {pool_timeout}s")
             with Pool(processes=num_processes) as pool:
                 # Submit scoring jobs with timeout
                 async_results = []
                 batch_size = 50

+                self.logger.info(f"🔍 FILTERING: Splitting {len(anchors)} anchors into batches of {batch_size}")
                 for i in range(0, len(anchors), batch_size):
                     batch = anchors[i:i + batch_size]
                     async_result = pool.apply_async(self._score_batch_static, (batch, context))
                     async_results.append(async_result)

+                self.logger.info(f"🔍 FILTERING: Submitted {len(async_results)} scoring batches")
+
                 # Collect results with timeout
                 for i, async_result in enumerate(async_results):
                     try:
+                        self.logger.debug(f"🔍 FILTERING: ⏳ Collecting batch {i+1}/{len(async_results)}")
                         batch_results = async_result.get(timeout=pool_timeout)
                         scored_anchors.extend(batch_results)
-                        self.logger.debug(f"Completed scoring batch {i+1}/{len(async_results)}")
+                        self.logger.debug(f"🔍 FILTERING: ✅ Completed scoring batch {i+1}/{len(async_results)}")
                     except Exception as e:
-                        self.logger.warning(f"Scoring batch {i+1} failed or timed out: {str(e)}")
+                        self.logger.warning(f"🔍 FILTERING: ⚠️ Scoring batch {i+1} failed or timed out: {str(e)}")
                         # Add basic scores for failed batch
                         start_idx = i * batch_size
                         end_idx = min((i + 1) * batch_size, len(anchors))
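For the scoring phase the anchors are chunked into fixed-size batches of 50 before being submitted, so each pool task scores a batch rather than a single anchor. The slicing pattern in isolation (the item count is made up; the batch size of 50 comes from the diff):

items = list(range(237))
batch_size = 50
batches = [items[i:i + batch_size] for i in range(0, len(items), batch_size)]
print([len(b) for b in batches])  # [50, 50, 50, 50, 37]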
@@ -599,7 +703,7 @@ class AnchorSequenceFinder:
                         continue

         except Exception as e:
-            self.logger.warning(f"Parallel scoring failed: {str(e)}, falling back to basic scoring")
+            self.logger.warning(f"🔍 FILTERING: ❌ Parallel scoring failed: {str(e)}, falling back to basic scoring")
            # Fall back to basic scoring
            for anchor in anchors:
                try:
@@ -613,22 +717,28 @@ class AnchorSequenceFinder:
                    continue

        parallel_time = time.time() - start_time
-        self.logger.info(f"Parallel scoring
+        self.logger.info(f"🔍 FILTERING: ✅ Parallel scoring completed in {parallel_time:.2f}s, scored {len(scored_anchors)} anchors")

        # Sort and filter as before
+        self.logger.info(f"🔍 FILTERING: 🔄 Sorting anchors by priority...")
        scored_anchors.sort(key=self._get_sequence_priority, reverse=True)
+        self.logger.info(f"🔍 FILTERING: ✅ Sorting completed")

-        self.logger.info(f"Filtering {len(scored_anchors)} overlapping sequences")
+        self.logger.info(f"🔍 FILTERING: 🔄 Filtering {len(scored_anchors)} overlapping sequences")
        filtered_scored = []

        for i, scored_anchor in enumerate(scored_anchors):
-            # Check timeout every 100 anchors using our timeout mechanism
-            if i % 100 == 0:
-
-
-
-
-
+            # Check timeout every 100 anchors using our timeout mechanism (more lenient)
+            if i % 100 == 0 and i > 0:
+                # Only check timeout if we're significantly over the limit
+                if self.timeout_seconds > 0:
+                    elapsed_time = time.time() - start_time
+                    # Use a more lenient timeout for filtering (allow 50% more time)
+                    if elapsed_time > (self.timeout_seconds * 1.5):
+                        self.logger.warning(f"🔍 FILTERING: ⏰ Filtering timed out, returning {len(filtered_scored)} anchors out of {len(scored_anchors)}")
+                        break
+
+                self.logger.debug(f"🔍 FILTERING: Progress: {i}/{len(scored_anchors)} processed, {len(filtered_scored)} kept")

            overlaps = False
            for existing in filtered_scored:
@@ -639,7 +749,7 @@ class AnchorSequenceFinder:
            if not overlaps:
                filtered_scored.append(scored_anchor)

-        self.logger.info(f"
+        self.logger.info(f"🔍 FILTERING: ✅ Filtering completed - kept {len(filtered_scored)} non-overlapping anchors out of {len(scored_anchors)}")
        return filtered_scored

    @staticmethod
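The filtering step is a greedy pass: sort by priority, then keep an anchor only if it does not overlap anything already kept, with a lenient timeout check every 100 anchors. A sketch of the greedy non-overlap part, using illustrative (start, length, score) tuples rather than the package's ScoredAnchor objects:

anchors = [(0, 4, 0.9), (2, 3, 0.8), (6, 2, 0.7), (5, 4, 0.95)]
anchors.sort(key=lambda a: a[2], reverse=True)  # highest score first

def overlaps(a, b):
    # Half-open interval overlap on (start, length)
    return a[0] < b[0] + b[1] and b[0] < a[0] + a[1]

kept = []
for anchor in anchors:
    if not any(overlaps(anchor, existing) for existing in kept):
        kept.append(anchor)

print(kept)  # [(5, 4, 0.95), (0, 4, 0.9)]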
lyrics_transcriber/frontend/web_assets/assets/index-D7BQUJXK.js:

@@ -38915,7 +38915,7 @@ const theme = createTheme({
   spacing: (factor) => `${0.6 * factor}rem`
   // Further reduced from 0.8 * factor
 });
-const version = "0.
+const version = "0.68.0";
 const packageJson = {
   version
 };
@@ -38926,4 +38926,4 @@ ReactDOM$1.createRoot(document.getElementById("root")).render(
     /* @__PURE__ */ jsxRuntimeExports.jsx(App, {})
   ] })
 );
-//# sourceMappingURL=index-
+//# sourceMappingURL=index-D7BQUJXK.js.map