karaoke-gen 0.90.1__py3-none-any.whl → 0.99.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197)
  1. backend/.coveragerc +20 -0
  2. backend/.gitignore +37 -0
  3. backend/Dockerfile +43 -0
  4. backend/Dockerfile.base +74 -0
  5. backend/README.md +242 -0
  6. backend/__init__.py +0 -0
  7. backend/api/__init__.py +0 -0
  8. backend/api/dependencies.py +457 -0
  9. backend/api/routes/__init__.py +0 -0
  10. backend/api/routes/admin.py +835 -0
  11. backend/api/routes/audio_search.py +913 -0
  12. backend/api/routes/auth.py +348 -0
  13. backend/api/routes/file_upload.py +2112 -0
  14. backend/api/routes/health.py +409 -0
  15. backend/api/routes/internal.py +435 -0
  16. backend/api/routes/jobs.py +1629 -0
  17. backend/api/routes/review.py +652 -0
  18. backend/api/routes/themes.py +162 -0
  19. backend/api/routes/users.py +1513 -0
  20. backend/config.py +172 -0
  21. backend/main.py +157 -0
  22. backend/middleware/__init__.py +5 -0
  23. backend/middleware/audit_logging.py +124 -0
  24. backend/models/__init__.py +0 -0
  25. backend/models/job.py +519 -0
  26. backend/models/requests.py +123 -0
  27. backend/models/theme.py +153 -0
  28. backend/models/user.py +254 -0
  29. backend/models/worker_log.py +164 -0
  30. backend/pyproject.toml +29 -0
  31. backend/quick-check.sh +93 -0
  32. backend/requirements.txt +29 -0
  33. backend/run_tests.sh +60 -0
  34. backend/services/__init__.py +0 -0
  35. backend/services/audio_analysis_service.py +243 -0
  36. backend/services/audio_editing_service.py +278 -0
  37. backend/services/audio_search_service.py +702 -0
  38. backend/services/auth_service.py +630 -0
  39. backend/services/credential_manager.py +792 -0
  40. backend/services/discord_service.py +172 -0
  41. backend/services/dropbox_service.py +301 -0
  42. backend/services/email_service.py +1093 -0
  43. backend/services/encoding_interface.py +454 -0
  44. backend/services/encoding_service.py +502 -0
  45. backend/services/firestore_service.py +512 -0
  46. backend/services/flacfetch_client.py +573 -0
  47. backend/services/gce_encoding/README.md +72 -0
  48. backend/services/gce_encoding/__init__.py +22 -0
  49. backend/services/gce_encoding/main.py +589 -0
  50. backend/services/gce_encoding/requirements.txt +16 -0
  51. backend/services/gdrive_service.py +356 -0
  52. backend/services/job_logging.py +258 -0
  53. backend/services/job_manager.py +853 -0
  54. backend/services/job_notification_service.py +271 -0
  55. backend/services/langfuse_preloader.py +98 -0
  56. backend/services/local_encoding_service.py +590 -0
  57. backend/services/local_preview_encoding_service.py +407 -0
  58. backend/services/lyrics_cache_service.py +216 -0
  59. backend/services/metrics.py +413 -0
  60. backend/services/nltk_preloader.py +122 -0
  61. backend/services/packaging_service.py +287 -0
  62. backend/services/rclone_service.py +106 -0
  63. backend/services/spacy_preloader.py +65 -0
  64. backend/services/storage_service.py +209 -0
  65. backend/services/stripe_service.py +371 -0
  66. backend/services/structured_logging.py +254 -0
  67. backend/services/template_service.py +330 -0
  68. backend/services/theme_service.py +469 -0
  69. backend/services/tracing.py +543 -0
  70. backend/services/user_service.py +721 -0
  71. backend/services/worker_service.py +558 -0
  72. backend/services/youtube_service.py +112 -0
  73. backend/services/youtube_upload_service.py +445 -0
  74. backend/tests/__init__.py +4 -0
  75. backend/tests/conftest.py +224 -0
  76. backend/tests/emulator/__init__.py +7 -0
  77. backend/tests/emulator/conftest.py +109 -0
  78. backend/tests/emulator/test_e2e_cli_backend.py +1053 -0
  79. backend/tests/emulator/test_emulator_integration.py +356 -0
  80. backend/tests/emulator/test_style_loading_direct.py +436 -0
  81. backend/tests/emulator/test_worker_logs_direct.py +229 -0
  82. backend/tests/emulator/test_worker_logs_subcollection.py +443 -0
  83. backend/tests/requirements-test.txt +10 -0
  84. backend/tests/requirements.txt +6 -0
  85. backend/tests/test_admin_email_endpoints.py +411 -0
  86. backend/tests/test_api_integration.py +460 -0
  87. backend/tests/test_api_routes.py +93 -0
  88. backend/tests/test_audio_analysis_service.py +294 -0
  89. backend/tests/test_audio_editing_service.py +386 -0
  90. backend/tests/test_audio_search.py +1398 -0
  91. backend/tests/test_audio_services.py +378 -0
  92. backend/tests/test_auth_firestore.py +231 -0
  93. backend/tests/test_config_extended.py +68 -0
  94. backend/tests/test_credential_manager.py +377 -0
  95. backend/tests/test_dependencies.py +54 -0
  96. backend/tests/test_discord_service.py +244 -0
  97. backend/tests/test_distribution_services.py +820 -0
  98. backend/tests/test_dropbox_service.py +472 -0
  99. backend/tests/test_email_service.py +492 -0
  100. backend/tests/test_emulator_integration.py +322 -0
  101. backend/tests/test_encoding_interface.py +412 -0
  102. backend/tests/test_file_upload.py +1739 -0
  103. backend/tests/test_flacfetch_client.py +632 -0
  104. backend/tests/test_gdrive_service.py +524 -0
  105. backend/tests/test_instrumental_api.py +431 -0
  106. backend/tests/test_internal_api.py +343 -0
  107. backend/tests/test_job_creation_regression.py +583 -0
  108. backend/tests/test_job_manager.py +356 -0
  109. backend/tests/test_job_manager_notifications.py +329 -0
  110. backend/tests/test_job_notification_service.py +443 -0
  111. backend/tests/test_jobs_api.py +283 -0
  112. backend/tests/test_local_encoding_service.py +423 -0
  113. backend/tests/test_local_preview_encoding_service.py +567 -0
  114. backend/tests/test_main.py +87 -0
  115. backend/tests/test_models.py +918 -0
  116. backend/tests/test_packaging_service.py +382 -0
  117. backend/tests/test_requests.py +201 -0
  118. backend/tests/test_routes_jobs.py +282 -0
  119. backend/tests/test_routes_review.py +337 -0
  120. backend/tests/test_services.py +556 -0
  121. backend/tests/test_services_extended.py +112 -0
  122. backend/tests/test_spacy_preloader.py +119 -0
  123. backend/tests/test_storage_service.py +448 -0
  124. backend/tests/test_style_upload.py +261 -0
  125. backend/tests/test_template_service.py +295 -0
  126. backend/tests/test_theme_service.py +516 -0
  127. backend/tests/test_unicode_sanitization.py +522 -0
  128. backend/tests/test_upload_api.py +256 -0
  129. backend/tests/test_validate.py +156 -0
  130. backend/tests/test_video_worker_orchestrator.py +847 -0
  131. backend/tests/test_worker_log_subcollection.py +509 -0
  132. backend/tests/test_worker_logging.py +365 -0
  133. backend/tests/test_workers.py +1116 -0
  134. backend/tests/test_workers_extended.py +178 -0
  135. backend/tests/test_youtube_service.py +247 -0
  136. backend/tests/test_youtube_upload_service.py +568 -0
  137. backend/utils/test_data.py +27 -0
  138. backend/validate.py +173 -0
  139. backend/version.py +27 -0
  140. backend/workers/README.md +597 -0
  141. backend/workers/__init__.py +11 -0
  142. backend/workers/audio_worker.py +618 -0
  143. backend/workers/lyrics_worker.py +683 -0
  144. backend/workers/render_video_worker.py +483 -0
  145. backend/workers/screens_worker.py +535 -0
  146. backend/workers/style_helper.py +198 -0
  147. backend/workers/video_worker.py +1277 -0
  148. backend/workers/video_worker_orchestrator.py +701 -0
  149. backend/workers/worker_logging.py +278 -0
  150. karaoke_gen/instrumental_review/static/index.html +7 -4
  151. karaoke_gen/karaoke_finalise/karaoke_finalise.py +6 -1
  152. karaoke_gen/utils/__init__.py +163 -8
  153. karaoke_gen/video_background_processor.py +9 -4
  154. {karaoke_gen-0.90.1.dist-info → karaoke_gen-0.99.3.dist-info}/METADATA +1 -1
  155. {karaoke_gen-0.90.1.dist-info → karaoke_gen-0.99.3.dist-info}/RECORD +196 -46
  156. lyrics_transcriber/correction/agentic/agent.py +17 -6
  157. lyrics_transcriber/correction/agentic/providers/config.py +9 -5
  158. lyrics_transcriber/correction/agentic/providers/langchain_bridge.py +96 -93
  159. lyrics_transcriber/correction/agentic/providers/model_factory.py +27 -6
  160. lyrics_transcriber/correction/anchor_sequence.py +151 -37
  161. lyrics_transcriber/correction/corrector.py +192 -130
  162. lyrics_transcriber/correction/handlers/syllables_match.py +44 -2
  163. lyrics_transcriber/correction/operations.py +24 -9
  164. lyrics_transcriber/correction/phrase_analyzer.py +18 -0
  165. lyrics_transcriber/frontend/package-lock.json +2 -2
  166. lyrics_transcriber/frontend/package.json +1 -1
  167. lyrics_transcriber/frontend/src/components/AIFeedbackModal.tsx +1 -1
  168. lyrics_transcriber/frontend/src/components/CorrectedWordWithActions.tsx +11 -7
  169. lyrics_transcriber/frontend/src/components/EditActionBar.tsx +31 -5
  170. lyrics_transcriber/frontend/src/components/EditModal.tsx +28 -10
  171. lyrics_transcriber/frontend/src/components/EditTimelineSection.tsx +123 -27
  172. lyrics_transcriber/frontend/src/components/EditWordList.tsx +112 -60
  173. lyrics_transcriber/frontend/src/components/Header.tsx +90 -76
  174. lyrics_transcriber/frontend/src/components/LyricsAnalyzer.tsx +53 -31
  175. lyrics_transcriber/frontend/src/components/LyricsSynchronizer/SyncControls.tsx +44 -13
  176. lyrics_transcriber/frontend/src/components/LyricsSynchronizer/TimelineCanvas.tsx +66 -50
  177. lyrics_transcriber/frontend/src/components/LyricsSynchronizer/index.tsx +124 -30
  178. lyrics_transcriber/frontend/src/components/ReferenceView.tsx +1 -1
  179. lyrics_transcriber/frontend/src/components/TimelineEditor.tsx +12 -5
  180. lyrics_transcriber/frontend/src/components/TimingOffsetModal.tsx +3 -3
  181. lyrics_transcriber/frontend/src/components/TranscriptionView.tsx +1 -1
  182. lyrics_transcriber/frontend/src/components/WordDivider.tsx +11 -7
  183. lyrics_transcriber/frontend/src/components/shared/components/Word.tsx +4 -2
  184. lyrics_transcriber/frontend/src/hooks/useManualSync.ts +103 -1
  185. lyrics_transcriber/frontend/src/theme.ts +42 -15
  186. lyrics_transcriber/frontend/tsconfig.tsbuildinfo +1 -1
  187. lyrics_transcriber/frontend/vite.config.js +5 -0
  188. lyrics_transcriber/frontend/web_assets/assets/{index-BECn1o8Q.js → index-BSMgOq4Z.js} +6959 -5782
  189. lyrics_transcriber/frontend/web_assets/assets/index-BSMgOq4Z.js.map +1 -0
  190. lyrics_transcriber/frontend/web_assets/index.html +6 -2
  191. lyrics_transcriber/frontend/web_assets/nomad-karaoke-logo.svg +5 -0
  192. lyrics_transcriber/output/generator.py +17 -3
  193. lyrics_transcriber/output/video.py +60 -95
  194. lyrics_transcriber/frontend/web_assets/assets/index-BECn1o8Q.js.map +0 -1
  195. {karaoke_gen-0.90.1.dist-info → karaoke_gen-0.99.3.dist-info}/WHEEL +0 -0
  196. {karaoke_gen-0.90.1.dist-info → karaoke_gen-0.99.3.dist-info}/entry_points.txt +0 -0
  197. {karaoke_gen-0.90.1.dist-info → karaoke_gen-0.99.3.dist-info}/licenses/LICENSE +0 -0
--- a/lyrics_transcriber/correction/agentic/providers/langchain_bridge.py
+++ b/lyrics_transcriber/correction/agentic/providers/langchain_bridge.py
@@ -13,6 +13,7 @@ from __future__ import annotations
 
 import logging
 import os
+import threading
 import time
 from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError
 from typing import List, Dict, Any, Optional
@@ -94,11 +95,80 @@ class LangChainBridge(BaseAIProvider):
             cache_dir=self._config.cache_dir,
             enabled=cache_enabled
         )
-
-        # Lazy-initialized chat model
+
+        # Lazy-initialized chat model with thread-safe initialization
+        # Lock prevents race condition where multiple threads try to initialize simultaneously
         self._chat_model: Optional[Any] = None
-        self._warmed_up: bool = False
-
+        self._model_init_lock = threading.Lock()
+
+    def warmup(self) -> bool:
+        """Eagerly initialize the chat model.
+
+        Call this after creating the bridge to avoid lazy initialization delays
+        when multiple threads call generate_correction_proposals() simultaneously.
+
+        Returns:
+            True if model was initialized successfully, False otherwise
+        """
+        if self._chat_model is not None:
+            logger.debug(f"🤖 Model {self._model} already initialized")
+            return True
+
+        logger.info(f"🤖 Warming up model {self._model}...")
+        # Trigger initialization by calling the initialization logic directly
+        try:
+            self._ensure_model_initialized()
+            return self._chat_model is not None
+        except Exception as e:
+            logger.error(f"🤖 Warmup failed for {self._model}: {e}")
+            return False
+
+    def _ensure_model_initialized(self) -> None:
+        """Ensure the chat model is initialized (thread-safe).
+
+        This method handles the lazy initialization with proper locking.
+        It's separated out so it can be called from both warmup() and
+        generate_correction_proposals().
+        """
+        if self._chat_model is not None:
+            return
+
+        with self._model_init_lock:
+            # Double-check after acquiring lock
+            if self._chat_model is not None:
+                return
+
+            timeout = self._config.initialization_timeout_seconds
+            logger.info(f"🤖 Initializing model {self._model} with {timeout}s timeout...")
+            init_start = time.time()
+
+            try:
+                # Use ThreadPoolExecutor for cross-platform timeout
+                with ThreadPoolExecutor(max_workers=1) as executor:
+                    future = executor.submit(
+                        self._factory.create_chat_model,
+                        self._model,
+                        self._config
+                    )
+                    try:
+                        self._chat_model = future.result(timeout=timeout)
+                    except FuturesTimeoutError:
+                        raise InitializationTimeoutError(
+                            f"Model initialization timed out after {timeout}s. "
+                            f"This may indicate network issues or service unavailability."
+                        ) from None
+
+                init_elapsed = time.time() - init_start
+                logger.info(f"🤖 Model initialized in {init_elapsed:.2f}s")
+
+            except InitializationTimeoutError:
+                self._circuit_breaker.record_failure(self._model)
+                raise
+            except Exception as e:
+                self._circuit_breaker.record_failure(self._model)
+                logger.error(f"🤖 Failed to initialize chat model: {e}")
+                raise
+
     def name(self) -> str:
         """Return provider name for logging."""
        return f"langchain:{self._model}"
@@ -141,52 +211,28 @@ class LangChainBridge(BaseAIProvider):
                 "until": open_until
             }]
 
-        # Step 2: Get or create chat model with initialization timeout
-        if not self._chat_model:
-            timeout = self._config.initialization_timeout_seconds
-            logger.info(f"🤖 Initializing model {self._model} with {timeout}s timeout...")
-            init_start = time.time()
-
-            try:
-                # Use ThreadPoolExecutor for cross-platform timeout
-                with ThreadPoolExecutor(max_workers=1) as executor:
-                    future = executor.submit(
-                        self._factory.create_chat_model,
-                        self._model,
-                        self._config
-                    )
-                    try:
-                        self._chat_model = future.result(timeout=timeout)
-                    except FuturesTimeoutError:
-                        raise InitializationTimeoutError(
-                            f"Model initialization timed out after {timeout}s. "
-                            f"This may indicate network issues or service unavailability."
-                        ) from None
-
-                init_elapsed = time.time() - init_start
-                logger.info(f"🤖 Model created in {init_elapsed:.2f}s, starting warm-up...")
-
-                # Warm up the model to establish connection before real work
-                self._warm_up_model()
-
-                total_elapsed = time.time() - init_start
-                logger.info(f"🤖 Model initialization complete in {total_elapsed:.2f}s")
-
-            except InitializationTimeoutError as e:
-                self._circuit_breaker.record_failure(self._model)
-                logger.exception("🤖 Model initialization timeout")
-                return [{
-                    "error": INIT_TIMEOUT_ERROR,
-                    "message": str(e),
-                    "timeout_seconds": timeout
-                }]
-            except Exception as e:
-                self._circuit_breaker.record_failure(self._model)
-                logger.error(f"🤖 Failed to initialize chat model: {e}")
-                return [{
-                    "error": MODEL_INIT_ERROR,
-                    "message": str(e)
-                }]
+        # Step 2: Get or create chat model with thread-safe initialization
+        # Use double-checked locking to avoid race condition where multiple threads
+        # all try to initialize the model simultaneously (which caused job 2ccbdf6b
+        # to have 5 concurrent model initializations and 6+ minute delays)
+        #
+        # NOTE: For best performance, call warmup() after creating the bridge to
+        # eagerly initialize the model before parallel processing begins.
+        try:
+            self._ensure_model_initialized()
+        except InitializationTimeoutError as e:
+            logger.exception("🤖 Model initialization timeout")
+            return [{
+                "error": INIT_TIMEOUT_ERROR,
+                "message": str(e),
+                "timeout_seconds": self._config.initialization_timeout_seconds
+            }]
+        except Exception as e:
+            logger.error(f"🤖 Failed to initialize chat model: {e}")
+            return [{
+                "error": MODEL_INIT_ERROR,
+                "message": str(e)
+            }]
 
         # Step 3: Execute with retry logic
         logger.info(
@@ -271,47 +317,4 @@ class LangChainBridge(BaseAIProvider):
 
         return content
 
-    def _warm_up_model(self) -> None:
-        """Send a lightweight request to warm up the model connection.
-
-        This helps establish the REST connection and potentially warm up any
-        server-side resources before processing real correction requests.
-        The warm-up uses a timeout to fail fast if the service is unresponsive.
-        """
-        if self._warmed_up:
-            return
-
-        timeout = self._config.warmup_timeout_seconds
-        # Use print with flush=True for visibility when output is redirected
-        print(f"🔥 Warming up {self._model} connection (timeout: {timeout}s)...", flush=True)
-        logger.info(f"🔥 Warming up {self._model} connection (timeout: {timeout}s)...")
-
-        warmup_start = time.time()
-        try:
-            from langchain_core.messages import HumanMessage
-
-            # Minimal prompt that requires almost no processing
-            warm_up_prompt = 'Respond with exactly: {"status":"ready"}'
-
-            # Use ThreadPoolExecutor for timeout on warm-up call
-            with ThreadPoolExecutor(max_workers=1) as executor:
-                future = executor.submit(
-                    self._chat_model.invoke,
-                    [HumanMessage(content=warm_up_prompt)]
-                )
-                try:
-                    future.result(timeout=timeout)
-                except FuturesTimeoutError:
-                    raise TimeoutError(f"Warm-up timed out after {timeout}s") from None
-
-            elapsed = time.time() - warmup_start
-            self._warmed_up = True
-            print(f"🔥 Warm-up complete for {self._model} in {elapsed:.2f}s", flush=True)
-            logger.info(f"🔥 Warm-up complete for {self._model} in {elapsed:.2f}s")
-        except Exception as e:
-            elapsed = time.time() - warmup_start
-            # Don't fail the actual request if warm-up fails
-            # Just log and continue - the real request might still work
-            print(f"🔥 Warm-up failed for {self._model} after {elapsed:.2f}s: {e} (continuing anyway)", flush=True)
-            logger.warning(f"🔥 Warm-up failed for {self._model} after {elapsed:.2f}s: {e} (continuing anyway)")
 
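Note: both the retained initialization path and the deleted `_warm_up_model()` bound a blocking call with `ThreadPoolExecutor` plus `future.result(timeout=...)`, which works on every platform (unlike `signal.SIGALRM`, which is Unix-only). A self-contained sketch of that technique, with illustrative names:

```python
import time
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError

def slow_init() -> str:
    time.sleep(2)  # stand-in for slow, network-bound model construction
    return "model"

with ThreadPoolExecutor(max_workers=1) as executor:
    future = executor.submit(slow_init)
    try:
        model = future.result(timeout=5)  # raises FuturesTimeoutError on expiry
    except FuturesTimeoutError:
        raise TimeoutError("initialization timed out") from None
```

One subtlety: the timeout bounds only the caller's wait. The worker thread keeps running, and the executor's context exit still joins it before returning.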
--- a/lyrics_transcriber/correction/agentic/providers/model_factory.py
+++ b/lyrics_transcriber/correction/agentic/providers/model_factory.py
@@ -10,6 +10,14 @@ from .config import ProviderConfig
 
 logger = logging.getLogger(__name__)
 
+# Try to import Langfuse preloader (may not exist in standalone library usage)
+try:
+    from backend.services.langfuse_preloader import get_preloaded_langfuse_handler
+
+    _HAS_LANGFUSE_PRELOADER = True
+except ImportError:
+    _HAS_LANGFUSE_PRELOADER = False
+
 # Error message constant for TRY003 compliance
 GOOGLE_API_KEY_MISSING_ERROR = (
     "GOOGLE_API_KEY environment variable is required for Google/Gemini models. "
@@ -87,25 +95,38 @@ class ModelFactory:
 
     def _initialize_langfuse(self, model_spec: str) -> None:
         """Initialize Langfuse callback handler if keys are present.
-
+
+        First tries to use a preloaded handler (to avoid 200+ second init delay
+        on Cloud Run cold starts), then falls back to creating a new one.
+
         Langfuse reads credentials from environment variables automatically:
         - LANGFUSE_PUBLIC_KEY
-        - LANGFUSE_SECRET_KEY
+        - LANGFUSE_SECRET_KEY
         - LANGFUSE_HOST (optional)
-
+
        Args:
            model_spec: Model specification for logging
-
+
        Raises:
            RuntimeError: If Langfuse keys are set but initialization fails
        """
        public_key = os.getenv("LANGFUSE_PUBLIC_KEY")
        secret_key = os.getenv("LANGFUSE_SECRET_KEY")
-
+
        if not (public_key and secret_key):
            logger.debug("🤖 Langfuse keys not found, tracing disabled")
            return
-
+
+        # Try to use preloaded handler first (avoids 200+ second delay on Cloud Run)
+        if _HAS_LANGFUSE_PRELOADER:
+            preloaded = get_preloaded_langfuse_handler()
+            if preloaded is not None:
+                logger.info(f"🤖 Using preloaded Langfuse handler for {model_spec}")
+                self._langfuse_handler = preloaded
+                return
+
+        # Fall back to creating new handler
+        logger.info(f"🤖 Initializing Langfuse handler (not preloaded) for {model_spec}...")
         try:
             from langfuse.langchain import CallbackHandler
 
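Note: the guarded import lets this module work both inside the backend (where `backend.services.langfuse_preloader` is importable) and when the library is used standalone. A generic sketch of the optional-dependency pattern, with illustrative module and function names:

```python
# Probe for an optional module once at import time and branch on a flag,
# instead of wrapping every call site in try/except. Names are illustrative.
try:
    from some_optional_package import get_handler  # hypothetical optional dep

    _HAS_HANDLER = True
except ImportError:
    _HAS_HANDLER = False

def make_handler():
    if _HAS_HANDLER:
        handler = get_handler()
        if handler is not None:
            return handler
    return None  # caller falls back to building its own handler
```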
--- a/lyrics_transcriber/correction/anchor_sequence.py
+++ b/lyrics_transcriber/correction/anchor_sequence.py
@@ -32,19 +32,24 @@ class AnchorSequenceFinder:
         progress_check_interval: int = 50,  # Check progress every N iterations
         logger: Optional[logging.Logger] = None,
     ):
+        init_start = time.time()
         self.min_sequence_length = min_sequence_length
         self.min_sources = min_sources
         self.timeout_seconds = timeout_seconds
         self.max_iterations_per_ngram = max_iterations_per_ngram
         self.progress_check_interval = progress_check_interval
         self.logger = logger or logging.getLogger(__name__)
+
+        self.logger.info("Initializing AnchorSequenceFinder...")
         self.phrase_analyzer = PhraseAnalyzer(logger=self.logger)
         self.used_positions = {}
 
         # Initialize cache directory
         self.cache_dir = Path(cache_dir)
         self.cache_dir.mkdir(parents=True, exist_ok=True)
-        self.logger.info(f"Initialized AnchorSequenceFinder with cache dir: {self.cache_dir}, timeout: {timeout_seconds}s")
+
+        init_elapsed = time.time() - init_start
+        self.logger.info(f"Initialized AnchorSequenceFinder in {init_elapsed:.2f}s (cache: {self.cache_dir}, timeout: {timeout_seconds}s)")
 
     def _check_timeout(self, start_time: float, operation_name: str = "operation"):
         """Check if timeout has occurred and raise exception if so."""
@@ -245,6 +250,65 @@ class AnchorSequenceFinder:
             self.logger.error(f"Unexpected error loading cache: {type(e).__name__}: {e}")
             return None
 
+    def _process_ngram_length_no_state(
+        self,
+        n: int,
+        trans_words: List[str],
+        all_words: List[Word],
+        ref_texts_clean: Dict[str, List[str]],
+        ref_words: Dict[str, List[Word]],
+        min_sources: int,
+    ) -> List[AnchorSequence]:
+        """Process a single n-gram length without modifying shared state (thread-safe).
+
+        This version doesn't track used positions - overlap filtering happens later
+        in _remove_overlapping_sequences. This allows parallel processing of different
+        n-gram lengths.
+        """
+        candidate_anchors = []
+
+        # Build hash-based index for O(1) lookups
+        ngram_index = self._build_ngram_index(ref_texts_clean, n)
+
+        # Generate n-grams from transcribed text
+        trans_ngrams = self._find_ngrams(trans_words, n)
+
+        # Single pass through all transcription n-grams
+        for ngram, trans_pos in trans_ngrams:
+            # Use indexed lookup (O(1) instead of O(n))
+            ngram_tuple = tuple(ngram)
+            if ngram_tuple not in ngram_index:
+                continue
+
+            # Find matches in all sources (no used_positions check - handled later)
+            matches = {}
+            source_positions = ngram_index[ngram_tuple]
+            for source, positions in source_positions.items():
+                if positions:
+                    matches[source] = positions[0]  # Take first position
+
+            if len(matches) >= min_sources:
+                # Get Word IDs for transcribed words
+                transcribed_word_ids = [w.id for w in all_words[trans_pos : trans_pos + n]]
+
+                # Get Word IDs for reference words
+                reference_word_ids = {
+                    source: [w.id for w in ref_words[source][pos : pos + n]]
+                    for source, pos in matches.items()
+                }
+
+                anchor = AnchorSequence(
+                    id=WordUtils.generate_id(),
+                    transcribed_word_ids=transcribed_word_ids,
+                    transcription_position=trans_pos,
+                    reference_positions=matches,
+                    reference_word_ids=reference_word_ids,
+                    confidence=len(matches) / len(ref_texts_clean),
+                )
+                candidate_anchors.append(anchor)
+
+        return candidate_anchors
+
     def _process_ngram_length(
         self,
         n: int,
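Note: `_process_ngram_length_no_state()` depends on `_build_ngram_index()` for its O(1) lookups, but that method is not shown in this diff. A sketch of what such an index plausibly looks like, offered as an assumption for illustration rather than the package's actual implementation:

```python
# Hypothetical sketch: map each n-gram tuple to, per source, the positions
# where it occurs, so membership tests and lookups are O(1) per n-gram.
from collections import defaultdict
from typing import Dict, List, Tuple

def build_ngram_index(ref_texts: Dict[str, List[str]], n: int) -> Dict[Tuple[str, ...], Dict[str, List[int]]]:
    index: Dict[Tuple[str, ...], Dict[str, List[int]]] = defaultdict(lambda: defaultdict(list))
    for source, words in ref_texts.items():
        for pos in range(len(words) - n + 1):
            index[tuple(words[pos:pos + n])][source].append(pos)
    return index

# index[("hello", "world")] might yield {"genius": [3], "musixmatch": [3, 17]}
```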
@@ -408,45 +472,95 @@ class AnchorSequenceFinder:
             min_sources=self.min_sources,
         )
 
-        # Process n-gram lengths sequentially (single-threaded for cloud compatibility)
+        # Process n-gram lengths in parallel for better performance
+        # The overlap filtering at the end handles deduplication, so we don't
+        # need to track used_positions during processing
         candidate_anchors = []
-
+
         # Check timeout before processing
         self._check_timeout(start_time, "n-gram processing start")
-        self.logger.info(f"🔍 ANCHOR SEARCH: Starting sequential n-gram processing ({len(n_gram_lengths)} lengths)")
-
-        batch_size = 10
-        batch_results = []
-
-        for i, n in enumerate(n_gram_lengths):
-            try:
-                # Check timeout periodically
-                if self.timeout_seconds > 0:
-                    elapsed_time = time.time() - start_time
-                    if elapsed_time > self.timeout_seconds:
-                        self.logger.warning(f"🔍 ANCHOR SEARCH: ⏰ Timeout reached at n-gram {n}, stopping")
-                        break
-
-                anchors = self._process_ngram_length(
-                    n, trans_words, all_words, ref_texts_clean, ref_words, self.min_sources
-                )
-                candidate_anchors.extend(anchors)
-
-                # Batch logging
-                batch_results.append((n, len(anchors)))
-
-                # Log progress every batch_size results or on the last result
-                if (i + 1) % batch_size == 0 or (i + 1) == len(n_gram_lengths):
-                    total_anchors_in_batch = sum(anchor_count for _, anchor_count in batch_results)
-                    n_gram_ranges = [str(ng) for ng, _ in batch_results]
-                    range_str = f"{n_gram_ranges[0]}-{n_gram_ranges[-1]}" if len(n_gram_ranges) > 1 else n_gram_ranges[0]
-                    self.logger.debug(f"🔍 ANCHOR SEARCH: Completed n-gram lengths {range_str} - found {total_anchors_in_batch} anchors")
-                    batch_results = []
-
-            except Exception as e:
-                self.logger.warning(f"🔍 ANCHOR SEARCH: ⚠️ n-gram length {n} failed: {str(e)}")
-                batch_results.append((n, 0))
-                continue
+
+        # Determine parallelization strategy
+        import os
+        from concurrent.futures import ThreadPoolExecutor, as_completed
+
+        # Use parallel processing by default, can be disabled via env var
+        use_parallel = os.getenv("ANCHOR_SEARCH_SEQUENTIAL", "0").lower() not in {"1", "true", "yes"}
+        max_workers = int(os.getenv("ANCHOR_SEARCH_WORKERS", "4"))
+
+        if use_parallel and len(n_gram_lengths) > 1:
+            self.logger.info(f"🔍 ANCHOR SEARCH: Starting PARALLEL n-gram processing ({len(n_gram_lengths)} lengths, {max_workers} workers)")
+
+            # Process in parallel - each n-gram length is independent
+            # since we don't track used_positions during processing
+            with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                # Submit all tasks
+                future_to_n = {
+                    executor.submit(
+                        self._process_ngram_length_no_state,
+                        n, trans_words, all_words, ref_texts_clean, ref_words, self.min_sources
+                    ): n
+                    for n in n_gram_lengths
+                }
+
+                completed = 0
+                for future in as_completed(future_to_n):
+                    n = future_to_n[future]
+                    completed += 1
+
+                    # Check timeout periodically
+                    if self.timeout_seconds > 0:
+                        elapsed_time = time.time() - start_time
+                        if elapsed_time > self.timeout_seconds:
+                            self.logger.warning(f"🔍 ANCHOR SEARCH: ⏰ Timeout reached, stopping ({completed}/{len(n_gram_lengths)} completed)")
+                            # Cancel remaining futures
+                            for f in future_to_n.keys():
+                                f.cancel()
+                            break
+
+                    try:
+                        anchors = future.result()
+                        candidate_anchors.extend(anchors)
+                        if completed % 20 == 0:
+                            self.logger.debug(f"🔍 ANCHOR SEARCH: Progress {completed}/{len(n_gram_lengths)} lengths processed")
+                    except Exception as e:
+                        self.logger.warning(f"🔍 ANCHOR SEARCH: ⚠️ n-gram length {n} failed: {str(e)}")
+        else:
+            # Sequential fallback
+            self.logger.info(f"🔍 ANCHOR SEARCH: Starting sequential n-gram processing ({len(n_gram_lengths)} lengths)")
+
+            batch_size = 10
+            batch_results = []
+
+            for i, n in enumerate(n_gram_lengths):
+                try:
+                    # Check timeout periodically
+                    if self.timeout_seconds > 0:
+                        elapsed_time = time.time() - start_time
+                        if elapsed_time > self.timeout_seconds:
+                            self.logger.warning(f"🔍 ANCHOR SEARCH: ⏰ Timeout reached at n-gram {n}, stopping")
+                            break
+
+                    anchors = self._process_ngram_length(
+                        n, trans_words, all_words, ref_texts_clean, ref_words, self.min_sources
+                    )
+                    candidate_anchors.extend(anchors)
+
+                    # Batch logging
+                    batch_results.append((n, len(anchors)))
+
+                    # Log progress every batch_size results or on the last result
+                    if (i + 1) % batch_size == 0 or (i + 1) == len(n_gram_lengths):
+                        total_anchors_in_batch = sum(anchor_count for _, anchor_count in batch_results)
+                        n_gram_ranges = [str(ng) for ng, _ in batch_results]
+                        range_str = f"{n_gram_ranges[0]}-{n_gram_ranges[-1]}" if len(n_gram_ranges) > 1 else n_gram_ranges[0]
+                        self.logger.debug(f"🔍 ANCHOR SEARCH: Completed n-gram lengths {range_str} - found {total_anchors_in_batch} anchors")
+                        batch_results = []
+
+                except Exception as e:
+                    self.logger.warning(f"🔍 ANCHOR SEARCH: ⚠️ n-gram length {n} failed: {str(e)}")
+                    batch_results.append((n, 0))
+                    continue
 
         self.logger.info(f"🔍 ANCHOR SEARCH: ✅ Found {len(candidate_anchors)} candidate anchors in {time.time() - start_time:.1f}s")
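
Note: based on the environment variables read in the hunk above, callers can steer the anchor search without code changes. They are read at call time, so set them before the search runs:

```python
import os

# Force the sequential fallback ("1", "true", or "yes" disables parallelism):
os.environ["ANCHOR_SEARCH_SEQUENTIAL"] = "1"

# Or keep parallel mode and tune the thread count (default is 4):
os.environ["ANCHOR_SEARCH_WORKERS"] = "8"
```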