karaoke-gen 0.90.1 → 0.99.3 (py3-none-any.whl)
This diff compares the contents of two publicly released versions of this package, exactly as they appear in their public registry. It is provided for informational purposes only.
- backend/.coveragerc +20 -0
- backend/.gitignore +37 -0
- backend/Dockerfile +43 -0
- backend/Dockerfile.base +74 -0
- backend/README.md +242 -0
- backend/__init__.py +0 -0
- backend/api/__init__.py +0 -0
- backend/api/dependencies.py +457 -0
- backend/api/routes/__init__.py +0 -0
- backend/api/routes/admin.py +835 -0
- backend/api/routes/audio_search.py +913 -0
- backend/api/routes/auth.py +348 -0
- backend/api/routes/file_upload.py +2112 -0
- backend/api/routes/health.py +409 -0
- backend/api/routes/internal.py +435 -0
- backend/api/routes/jobs.py +1629 -0
- backend/api/routes/review.py +652 -0
- backend/api/routes/themes.py +162 -0
- backend/api/routes/users.py +1513 -0
- backend/config.py +172 -0
- backend/main.py +157 -0
- backend/middleware/__init__.py +5 -0
- backend/middleware/audit_logging.py +124 -0
- backend/models/__init__.py +0 -0
- backend/models/job.py +519 -0
- backend/models/requests.py +123 -0
- backend/models/theme.py +153 -0
- backend/models/user.py +254 -0
- backend/models/worker_log.py +164 -0
- backend/pyproject.toml +29 -0
- backend/quick-check.sh +93 -0
- backend/requirements.txt +29 -0
- backend/run_tests.sh +60 -0
- backend/services/__init__.py +0 -0
- backend/services/audio_analysis_service.py +243 -0
- backend/services/audio_editing_service.py +278 -0
- backend/services/audio_search_service.py +702 -0
- backend/services/auth_service.py +630 -0
- backend/services/credential_manager.py +792 -0
- backend/services/discord_service.py +172 -0
- backend/services/dropbox_service.py +301 -0
- backend/services/email_service.py +1093 -0
- backend/services/encoding_interface.py +454 -0
- backend/services/encoding_service.py +502 -0
- backend/services/firestore_service.py +512 -0
- backend/services/flacfetch_client.py +573 -0
- backend/services/gce_encoding/README.md +72 -0
- backend/services/gce_encoding/__init__.py +22 -0
- backend/services/gce_encoding/main.py +589 -0
- backend/services/gce_encoding/requirements.txt +16 -0
- backend/services/gdrive_service.py +356 -0
- backend/services/job_logging.py +258 -0
- backend/services/job_manager.py +853 -0
- backend/services/job_notification_service.py +271 -0
- backend/services/langfuse_preloader.py +98 -0
- backend/services/local_encoding_service.py +590 -0
- backend/services/local_preview_encoding_service.py +407 -0
- backend/services/lyrics_cache_service.py +216 -0
- backend/services/metrics.py +413 -0
- backend/services/nltk_preloader.py +122 -0
- backend/services/packaging_service.py +287 -0
- backend/services/rclone_service.py +106 -0
- backend/services/spacy_preloader.py +65 -0
- backend/services/storage_service.py +209 -0
- backend/services/stripe_service.py +371 -0
- backend/services/structured_logging.py +254 -0
- backend/services/template_service.py +330 -0
- backend/services/theme_service.py +469 -0
- backend/services/tracing.py +543 -0
- backend/services/user_service.py +721 -0
- backend/services/worker_service.py +558 -0
- backend/services/youtube_service.py +112 -0
- backend/services/youtube_upload_service.py +445 -0
- backend/tests/__init__.py +4 -0
- backend/tests/conftest.py +224 -0
- backend/tests/emulator/__init__.py +7 -0
- backend/tests/emulator/conftest.py +109 -0
- backend/tests/emulator/test_e2e_cli_backend.py +1053 -0
- backend/tests/emulator/test_emulator_integration.py +356 -0
- backend/tests/emulator/test_style_loading_direct.py +436 -0
- backend/tests/emulator/test_worker_logs_direct.py +229 -0
- backend/tests/emulator/test_worker_logs_subcollection.py +443 -0
- backend/tests/requirements-test.txt +10 -0
- backend/tests/requirements.txt +6 -0
- backend/tests/test_admin_email_endpoints.py +411 -0
- backend/tests/test_api_integration.py +460 -0
- backend/tests/test_api_routes.py +93 -0
- backend/tests/test_audio_analysis_service.py +294 -0
- backend/tests/test_audio_editing_service.py +386 -0
- backend/tests/test_audio_search.py +1398 -0
- backend/tests/test_audio_services.py +378 -0
- backend/tests/test_auth_firestore.py +231 -0
- backend/tests/test_config_extended.py +68 -0
- backend/tests/test_credential_manager.py +377 -0
- backend/tests/test_dependencies.py +54 -0
- backend/tests/test_discord_service.py +244 -0
- backend/tests/test_distribution_services.py +820 -0
- backend/tests/test_dropbox_service.py +472 -0
- backend/tests/test_email_service.py +492 -0
- backend/tests/test_emulator_integration.py +322 -0
- backend/tests/test_encoding_interface.py +412 -0
- backend/tests/test_file_upload.py +1739 -0
- backend/tests/test_flacfetch_client.py +632 -0
- backend/tests/test_gdrive_service.py +524 -0
- backend/tests/test_instrumental_api.py +431 -0
- backend/tests/test_internal_api.py +343 -0
- backend/tests/test_job_creation_regression.py +583 -0
- backend/tests/test_job_manager.py +356 -0
- backend/tests/test_job_manager_notifications.py +329 -0
- backend/tests/test_job_notification_service.py +443 -0
- backend/tests/test_jobs_api.py +283 -0
- backend/tests/test_local_encoding_service.py +423 -0
- backend/tests/test_local_preview_encoding_service.py +567 -0
- backend/tests/test_main.py +87 -0
- backend/tests/test_models.py +918 -0
- backend/tests/test_packaging_service.py +382 -0
- backend/tests/test_requests.py +201 -0
- backend/tests/test_routes_jobs.py +282 -0
- backend/tests/test_routes_review.py +337 -0
- backend/tests/test_services.py +556 -0
- backend/tests/test_services_extended.py +112 -0
- backend/tests/test_spacy_preloader.py +119 -0
- backend/tests/test_storage_service.py +448 -0
- backend/tests/test_style_upload.py +261 -0
- backend/tests/test_template_service.py +295 -0
- backend/tests/test_theme_service.py +516 -0
- backend/tests/test_unicode_sanitization.py +522 -0
- backend/tests/test_upload_api.py +256 -0
- backend/tests/test_validate.py +156 -0
- backend/tests/test_video_worker_orchestrator.py +847 -0
- backend/tests/test_worker_log_subcollection.py +509 -0
- backend/tests/test_worker_logging.py +365 -0
- backend/tests/test_workers.py +1116 -0
- backend/tests/test_workers_extended.py +178 -0
- backend/tests/test_youtube_service.py +247 -0
- backend/tests/test_youtube_upload_service.py +568 -0
- backend/utils/test_data.py +27 -0
- backend/validate.py +173 -0
- backend/version.py +27 -0
- backend/workers/README.md +597 -0
- backend/workers/__init__.py +11 -0
- backend/workers/audio_worker.py +618 -0
- backend/workers/lyrics_worker.py +683 -0
- backend/workers/render_video_worker.py +483 -0
- backend/workers/screens_worker.py +535 -0
- backend/workers/style_helper.py +198 -0
- backend/workers/video_worker.py +1277 -0
- backend/workers/video_worker_orchestrator.py +701 -0
- backend/workers/worker_logging.py +278 -0
- karaoke_gen/instrumental_review/static/index.html +7 -4
- karaoke_gen/karaoke_finalise/karaoke_finalise.py +6 -1
- karaoke_gen/utils/__init__.py +163 -8
- karaoke_gen/video_background_processor.py +9 -4
- {karaoke_gen-0.90.1.dist-info → karaoke_gen-0.99.3.dist-info}/METADATA +1 -1
- {karaoke_gen-0.90.1.dist-info → karaoke_gen-0.99.3.dist-info}/RECORD +196 -46
- lyrics_transcriber/correction/agentic/agent.py +17 -6
- lyrics_transcriber/correction/agentic/providers/config.py +9 -5
- lyrics_transcriber/correction/agentic/providers/langchain_bridge.py +96 -93
- lyrics_transcriber/correction/agentic/providers/model_factory.py +27 -6
- lyrics_transcriber/correction/anchor_sequence.py +151 -37
- lyrics_transcriber/correction/corrector.py +192 -130
- lyrics_transcriber/correction/handlers/syllables_match.py +44 -2
- lyrics_transcriber/correction/operations.py +24 -9
- lyrics_transcriber/correction/phrase_analyzer.py +18 -0
- lyrics_transcriber/frontend/package-lock.json +2 -2
- lyrics_transcriber/frontend/package.json +1 -1
- lyrics_transcriber/frontend/src/components/AIFeedbackModal.tsx +1 -1
- lyrics_transcriber/frontend/src/components/CorrectedWordWithActions.tsx +11 -7
- lyrics_transcriber/frontend/src/components/EditActionBar.tsx +31 -5
- lyrics_transcriber/frontend/src/components/EditModal.tsx +28 -10
- lyrics_transcriber/frontend/src/components/EditTimelineSection.tsx +123 -27
- lyrics_transcriber/frontend/src/components/EditWordList.tsx +112 -60
- lyrics_transcriber/frontend/src/components/Header.tsx +90 -76
- lyrics_transcriber/frontend/src/components/LyricsAnalyzer.tsx +53 -31
- lyrics_transcriber/frontend/src/components/LyricsSynchronizer/SyncControls.tsx +44 -13
- lyrics_transcriber/frontend/src/components/LyricsSynchronizer/TimelineCanvas.tsx +66 -50
- lyrics_transcriber/frontend/src/components/LyricsSynchronizer/index.tsx +124 -30
- lyrics_transcriber/frontend/src/components/ReferenceView.tsx +1 -1
- lyrics_transcriber/frontend/src/components/TimelineEditor.tsx +12 -5
- lyrics_transcriber/frontend/src/components/TimingOffsetModal.tsx +3 -3
- lyrics_transcriber/frontend/src/components/TranscriptionView.tsx +1 -1
- lyrics_transcriber/frontend/src/components/WordDivider.tsx +11 -7
- lyrics_transcriber/frontend/src/components/shared/components/Word.tsx +4 -2
- lyrics_transcriber/frontend/src/hooks/useManualSync.ts +103 -1
- lyrics_transcriber/frontend/src/theme.ts +42 -15
- lyrics_transcriber/frontend/tsconfig.tsbuildinfo +1 -1
- lyrics_transcriber/frontend/vite.config.js +5 -0
- lyrics_transcriber/frontend/web_assets/assets/{index-BECn1o8Q.js → index-BSMgOq4Z.js} +6959 -5782
- lyrics_transcriber/frontend/web_assets/assets/index-BSMgOq4Z.js.map +1 -0
- lyrics_transcriber/frontend/web_assets/index.html +6 -2
- lyrics_transcriber/frontend/web_assets/nomad-karaoke-logo.svg +5 -0
- lyrics_transcriber/output/generator.py +17 -3
- lyrics_transcriber/output/video.py +60 -95
- lyrics_transcriber/frontend/web_assets/assets/index-BECn1o8Q.js.map +0 -1
- {karaoke_gen-0.90.1.dist-info → karaoke_gen-0.99.3.dist-info}/WHEEL +0 -0
- {karaoke_gen-0.90.1.dist-info → karaoke_gen-0.99.3.dist-info}/entry_points.txt +0 -0
- {karaoke_gen-0.90.1.dist-info → karaoke_gen-0.99.3.dist-info}/licenses/LICENSE +0 -0
lyrics_transcriber/correction/agentic/providers/langchain_bridge.py:

@@ -13,6 +13,7 @@ from __future__ import annotations
 
 import logging
 import os
+import threading
 import time
 from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError
 from typing import List, Dict, Any, Optional
@@ -94,11 +95,80 @@ class LangChainBridge(BaseAIProvider):
             cache_dir=self._config.cache_dir,
             enabled=cache_enabled
         )
-
-        # Lazy-initialized chat model
+
+        # Lazy-initialized chat model with thread-safe initialization
+        # Lock prevents race condition where multiple threads try to initialize simultaneously
         self._chat_model: Optional[Any] = None
-        self.
-
+        self._model_init_lock = threading.Lock()
+
+    def warmup(self) -> bool:
+        """Eagerly initialize the chat model.
+
+        Call this after creating the bridge to avoid lazy initialization delays
+        when multiple threads call generate_correction_proposals() simultaneously.
+
+        Returns:
+            True if model was initialized successfully, False otherwise
+        """
+        if self._chat_model is not None:
+            logger.debug(f"🤖 Model {self._model} already initialized")
+            return True
+
+        logger.info(f"🤖 Warming up model {self._model}...")
+        # Trigger initialization by calling the initialization logic directly
+        try:
+            self._ensure_model_initialized()
+            return self._chat_model is not None
+        except Exception as e:
+            logger.error(f"🤖 Warmup failed for {self._model}: {e}")
+            return False
+
+    def _ensure_model_initialized(self) -> None:
+        """Ensure the chat model is initialized (thread-safe).
+
+        This method handles the lazy initialization with proper locking.
+        It's separated out so it can be called from both warmup() and
+        generate_correction_proposals().
+        """
+        if self._chat_model is not None:
+            return
+
+        with self._model_init_lock:
+            # Double-check after acquiring lock
+            if self._chat_model is not None:
+                return
+
+            timeout = self._config.initialization_timeout_seconds
+            logger.info(f"🤖 Initializing model {self._model} with {timeout}s timeout...")
+            init_start = time.time()
+
+            try:
+                # Use ThreadPoolExecutor for cross-platform timeout
+                with ThreadPoolExecutor(max_workers=1) as executor:
+                    future = executor.submit(
+                        self._factory.create_chat_model,
+                        self._model,
+                        self._config
+                    )
+                    try:
+                        self._chat_model = future.result(timeout=timeout)
+                    except FuturesTimeoutError:
+                        raise InitializationTimeoutError(
+                            f"Model initialization timed out after {timeout}s. "
+                            f"This may indicate network issues or service unavailability."
+                        ) from None
+
+                init_elapsed = time.time() - init_start
+                logger.info(f"🤖 Model initialized in {init_elapsed:.2f}s")
+
+            except InitializationTimeoutError:
+                self._circuit_breaker.record_failure(self._model)
+                raise
+            except Exception as e:
+                self._circuit_breaker.record_failure(self._model)
+                logger.error(f"🤖 Failed to initialize chat model: {e}")
+                raise
+
     def name(self) -> str:
         """Return provider name for logging."""
         return f"langchain:{self._model}"
@@ -141,52 +211,28 @@ class LangChainBridge(BaseAIProvider):
                 "until": open_until
             }]
 
-        # Step 2: Get or create chat model with initialization
-        … (old lines 145–165 not preserved in this diff view)
-                init_elapsed = time.time() - init_start
-                logger.info(f"🤖 Model created in {init_elapsed:.2f}s, starting warm-up...")
-
-                # Warm up the model to establish connection before real work
-                self._warm_up_model()
-
-                total_elapsed = time.time() - init_start
-                logger.info(f"🤖 Model initialization complete in {total_elapsed:.2f}s")
-
-            except InitializationTimeoutError as e:
-                self._circuit_breaker.record_failure(self._model)
-                logger.exception("🤖 Model initialization timeout")
-                return [{
-                    "error": INIT_TIMEOUT_ERROR,
-                    "message": str(e),
-                    "timeout_seconds": timeout
-                }]
-            except Exception as e:
-                self._circuit_breaker.record_failure(self._model)
-                logger.error(f"🤖 Failed to initialize chat model: {e}")
-                return [{
-                    "error": MODEL_INIT_ERROR,
-                    "message": str(e)
-                }]
+        # Step 2: Get or create chat model with thread-safe initialization
+        # Use double-checked locking to avoid race condition where multiple threads
+        # all try to initialize the model simultaneously (which caused job 2ccbdf6b
+        # to have 5 concurrent model initializations and 6+ minute delays)
+        #
+        # NOTE: For best performance, call warmup() after creating the bridge to
+        # eagerly initialize the model before parallel processing begins.
+        try:
+            self._ensure_model_initialized()
+        except InitializationTimeoutError as e:
+            logger.exception("🤖 Model initialization timeout")
+            return [{
+                "error": INIT_TIMEOUT_ERROR,
+                "message": str(e),
+                "timeout_seconds": self._config.initialization_timeout_seconds
+            }]
+        except Exception as e:
+            logger.error(f"🤖 Failed to initialize chat model: {e}")
+            return [{
+                "error": MODEL_INIT_ERROR,
+                "message": str(e)
+            }]
 
         # Step 3: Execute with retry logic
         logger.info(
@@ -271,47 +317,4 @@ class LangChainBridge(BaseAIProvider):
 
         return content
 
-    def _warm_up_model(self) -> None:
-        """Send a lightweight request to warm up the model connection.
-
-        This helps establish the REST connection and potentially warm up any
-        server-side resources before processing real correction requests.
-        The warm-up uses a timeout to fail fast if the service is unresponsive.
-        """
-        if self._warmed_up:
-            return
-
-        timeout = self._config.warmup_timeout_seconds
-        # Use print with flush=True for visibility when output is redirected
-        print(f"🔥 Warming up {self._model} connection (timeout: {timeout}s)...", flush=True)
-        logger.info(f"🔥 Warming up {self._model} connection (timeout: {timeout}s)...")
-
-        warmup_start = time.time()
-        try:
-            from langchain_core.messages import HumanMessage
-
-            # Minimal prompt that requires almost no processing
-            warm_up_prompt = 'Respond with exactly: {"status":"ready"}'
-
-            # Use ThreadPoolExecutor for timeout on warm-up call
-            with ThreadPoolExecutor(max_workers=1) as executor:
-                future = executor.submit(
-                    self._chat_model.invoke,
-                    [HumanMessage(content=warm_up_prompt)]
-                )
-                try:
-                    future.result(timeout=timeout)
-                except FuturesTimeoutError:
-                    raise TimeoutError(f"Warm-up timed out after {timeout}s") from None
-
-            elapsed = time.time() - warmup_start
-            self._warmed_up = True
-            print(f"🔥 Warm-up complete for {self._model} in {elapsed:.2f}s", flush=True)
-            logger.info(f"🔥 Warm-up complete for {self._model} in {elapsed:.2f}s")
-        except Exception as e:
-            elapsed = time.time() - warmup_start
-            # Don't fail the actual request if warm-up fails
-            # Just log and continue - the real request might still work
-            print(f"🔥 Warm-up failed for {self._model} after {elapsed:.2f}s: {e} (continuing anyway)", flush=True)
-            logger.warning(f"🔥 Warm-up failed for {self._model} after {elapsed:.2f}s: {e} (continuing anyway)")
 
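The heart of this change is classic double-checked locking around an expensive initialization, with the build pushed onto a worker thread so a timeout can be enforced portably. Below is a minimal standalone sketch of that pattern, not the package's actual class; `LazyHolder` and `build_model` are hypothetical stand-ins:

```python
import threading
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError


class LazyHolder:
    """Double-checked locking around an expensive, timeout-bounded initialization."""

    def __init__(self, build_model, timeout_s: float = 30.0):
        self._build_model = build_model  # zero-arg callable (hypothetical stand-in)
        self._timeout_s = timeout_s
        self._obj = None
        self._lock = threading.Lock()

    def get(self):
        # Fast path: once initialized, no lock is taken at all.
        if self._obj is not None:
            return self._obj
        with self._lock:
            # Re-check under the lock: another thread may have won the race.
            if self._obj is None:
                # Build in a worker thread so the timeout also works off the
                # main thread (signal-based timeouts would not).
                with ThreadPoolExecutor(max_workers=1) as pool:
                    future = pool.submit(self._build_model)
                    try:
                        self._obj = future.result(timeout=self._timeout_s)
                    except FuturesTimeoutError:
                        raise TimeoutError(
                            f"init timed out after {self._timeout_s}s"
                        ) from None
            return self._obj
```

Calling `get()` once from the main thread before fanning out, which is the role `warmup()` plays in the diff, keeps every later caller on the lock-free fast path. One subtlety shared with the diff's version: the executor's context exit waits for its worker, so a truly hung initializer can still block beyond the timeout.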
lyrics_transcriber/correction/agentic/providers/model_factory.py:

@@ -10,6 +10,14 @@ from .config import ProviderConfig
 
 logger = logging.getLogger(__name__)
 
+# Try to import Langfuse preloader (may not exist in standalone library usage)
+try:
+    from backend.services.langfuse_preloader import get_preloaded_langfuse_handler
+
+    _HAS_LANGFUSE_PRELOADER = True
+except ImportError:
+    _HAS_LANGFUSE_PRELOADER = False
+
 # Error message constant for TRY003 compliance
 GOOGLE_API_KEY_MISSING_ERROR = (
     "GOOGLE_API_KEY environment variable is required for Google/Gemini models. "
@@ -87,25 +95,38 @@ class ModelFactory:
 
     def _initialize_langfuse(self, model_spec: str) -> None:
         """Initialize Langfuse callback handler if keys are present.
-
+
+        First tries to use a preloaded handler (to avoid 200+ second init delay
+        on Cloud Run cold starts), then falls back to creating a new one.
+
         Langfuse reads credentials from environment variables automatically:
         - LANGFUSE_PUBLIC_KEY
-        - LANGFUSE_SECRET_KEY
+        - LANGFUSE_SECRET_KEY
         - LANGFUSE_HOST (optional)
-
+
         Args:
             model_spec: Model specification for logging
-
+
         Raises:
             RuntimeError: If Langfuse keys are set but initialization fails
         """
        public_key = os.getenv("LANGFUSE_PUBLIC_KEY")
        secret_key = os.getenv("LANGFUSE_SECRET_KEY")
-
+
        if not (public_key and secret_key):
            logger.debug("🤖 Langfuse keys not found, tracing disabled")
            return
-
+
+        # Try to use preloaded handler first (avoids 200+ second delay on Cloud Run)
+        if _HAS_LANGFUSE_PRELOADER:
+            preloaded = get_preloaded_langfuse_handler()
+            if preloaded is not None:
+                logger.info(f"🤖 Using preloaded Langfuse handler for {model_spec}")
+                self._langfuse_handler = preloaded
+                return
+
+        # Fall back to creating new handler
+        logger.info(f"🤖 Initializing Langfuse handler (not preloaded) for {model_spec}...")
         try:
             from langfuse.langchain import CallbackHandler
 
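This change combines two small patterns: a guarded import so the library still works standalone when the backend package is absent, and a prefer-preloaded-else-build fallback so cold starts pay the initialization cost once, at process startup. A minimal sketch of the shape; `myapp.preload`, `get_preloaded_handler`, and `build_new` are hypothetical names:

```python
# Optional dependency: degrade gracefully if the host application isn't present.
try:
    from myapp.preload import get_preloaded_handler  # hypothetical module
    _HAS_PRELOADER = True
except ImportError:
    _HAS_PRELOADER = False


def resolve_handler(build_new):
    """Prefer a handler preloaded at startup; otherwise build one inline."""
    if _HAS_PRELOADER:
        handler = get_preloaded_handler()
        if handler is not None:
            return handler  # startup already paid the initialization cost
    return build_new()  # fallback: pay the cost now, on the request path
```

The module-level flag means the import is attempted exactly once, and every call site branches on a cheap boolean instead of repeating the try/except.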
lyrics_transcriber/correction/anchor_sequence.py:

@@ -32,19 +32,24 @@ class AnchorSequenceFinder:
         progress_check_interval: int = 50,  # Check progress every N iterations
         logger: Optional[logging.Logger] = None,
     ):
+        init_start = time.time()
         self.min_sequence_length = min_sequence_length
         self.min_sources = min_sources
         self.timeout_seconds = timeout_seconds
         self.max_iterations_per_ngram = max_iterations_per_ngram
         self.progress_check_interval = progress_check_interval
         self.logger = logger or logging.getLogger(__name__)
+
+        self.logger.info("Initializing AnchorSequenceFinder...")
         self.phrase_analyzer = PhraseAnalyzer(logger=self.logger)
         self.used_positions = {}
 
         # Initialize cache directory
         self.cache_dir = Path(cache_dir)
         self.cache_dir.mkdir(parents=True, exist_ok=True)
-
+
+        init_elapsed = time.time() - init_start
+        self.logger.info(f"Initialized AnchorSequenceFinder in {init_elapsed:.2f}s (cache: {self.cache_dir}, timeout: {timeout_seconds}s)")
 
     def _check_timeout(self, start_time: float, operation_name: str = "operation"):
         """Check if timeout has occurred and raise exception if so."""
@@ -245,6 +250,65 @@ class AnchorSequenceFinder:
             self.logger.error(f"Unexpected error loading cache: {type(e).__name__}: {e}")
             return None
 
+    def _process_ngram_length_no_state(
+        self,
+        n: int,
+        trans_words: List[str],
+        all_words: List[Word],
+        ref_texts_clean: Dict[str, List[str]],
+        ref_words: Dict[str, List[Word]],
+        min_sources: int,
+    ) -> List[AnchorSequence]:
+        """Process a single n-gram length without modifying shared state (thread-safe).
+
+        This version doesn't track used positions - overlap filtering happens later
+        in _remove_overlapping_sequences. This allows parallel processing of different
+        n-gram lengths.
+        """
+        candidate_anchors = []
+
+        # Build hash-based index for O(1) lookups
+        ngram_index = self._build_ngram_index(ref_texts_clean, n)
+
+        # Generate n-grams from transcribed text
+        trans_ngrams = self._find_ngrams(trans_words, n)
+
+        # Single pass through all transcription n-grams
+        for ngram, trans_pos in trans_ngrams:
+            # Use indexed lookup (O(1) instead of O(n))
+            ngram_tuple = tuple(ngram)
+            if ngram_tuple not in ngram_index:
+                continue
+
+            # Find matches in all sources (no used_positions check - handled later)
+            matches = {}
+            source_positions = ngram_index[ngram_tuple]
+            for source, positions in source_positions.items():
+                if positions:
+                    matches[source] = positions[0]  # Take first position
+
+            if len(matches) >= min_sources:
+                # Get Word IDs for transcribed words
+                transcribed_word_ids = [w.id for w in all_words[trans_pos : trans_pos + n]]
+
+                # Get Word IDs for reference words
+                reference_word_ids = {
+                    source: [w.id for w in ref_words[source][pos : pos + n]]
+                    for source, pos in matches.items()
+                }
+
+                anchor = AnchorSequence(
+                    id=WordUtils.generate_id(),
+                    transcribed_word_ids=transcribed_word_ids,
+                    transcription_position=trans_pos,
+                    reference_positions=matches,
+                    reference_word_ids=reference_word_ids,
+                    confidence=len(matches) / len(ref_texts_clean),
+                )
+                candidate_anchors.append(anchor)
+
+        return candidate_anchors
+
     def _process_ngram_length(
         self,
         n: int,
@@ -408,45 +472,95 @@ class AnchorSequenceFinder:
             min_sources=self.min_sources,
         )
 
-        # Process n-gram lengths
+        # Process n-gram lengths in parallel for better performance
+        # The overlap filtering at the end handles deduplication, so we don't
+        # need to track used_positions during processing
         candidate_anchors = []
-
+
         # Check timeout before processing
         self._check_timeout(start_time, "n-gram processing start")
-        … (old lines 416–449 not preserved in this diff view)
+
+        # Determine parallelization strategy
+        import os
+        from concurrent.futures import ThreadPoolExecutor, as_completed
+
+        # Use parallel processing by default, can be disabled via env var
+        use_parallel = os.getenv("ANCHOR_SEARCH_SEQUENTIAL", "0").lower() not in {"1", "true", "yes"}
+        max_workers = int(os.getenv("ANCHOR_SEARCH_WORKERS", "4"))
+
+        if use_parallel and len(n_gram_lengths) > 1:
+            self.logger.info(f"🔍 ANCHOR SEARCH: Starting PARALLEL n-gram processing ({len(n_gram_lengths)} lengths, {max_workers} workers)")
+
+            # Process in parallel - each n-gram length is independent
+            # since we don't track used_positions during processing
+            with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                # Submit all tasks
+                future_to_n = {
+                    executor.submit(
+                        self._process_ngram_length_no_state,
+                        n, trans_words, all_words, ref_texts_clean, ref_words, self.min_sources
+                    ): n
+                    for n in n_gram_lengths
+                }
+
+                completed = 0
+                for future in as_completed(future_to_n):
+                    n = future_to_n[future]
+                    completed += 1
+
+                    # Check timeout periodically
+                    if self.timeout_seconds > 0:
+                        elapsed_time = time.time() - start_time
+                        if elapsed_time > self.timeout_seconds:
+                            self.logger.warning(f"🔍 ANCHOR SEARCH: ⏰ Timeout reached, stopping ({completed}/{len(n_gram_lengths)} completed)")
+                            # Cancel remaining futures
+                            for f in future_to_n.keys():
+                                f.cancel()
+                            break
+
+                    try:
+                        anchors = future.result()
+                        candidate_anchors.extend(anchors)
+                        if completed % 20 == 0:
+                            self.logger.debug(f"🔍 ANCHOR SEARCH: Progress {completed}/{len(n_gram_lengths)} lengths processed")
+                    except Exception as e:
+                        self.logger.warning(f"🔍 ANCHOR SEARCH: ⚠️ n-gram length {n} failed: {str(e)}")
+        else:
+            # Sequential fallback
+            self.logger.info(f"🔍 ANCHOR SEARCH: Starting sequential n-gram processing ({len(n_gram_lengths)} lengths)")
+
+            batch_size = 10
+            batch_results = []
+
+            for i, n in enumerate(n_gram_lengths):
+                try:
+                    # Check timeout periodically
+                    if self.timeout_seconds > 0:
+                        elapsed_time = time.time() - start_time
+                        if elapsed_time > self.timeout_seconds:
+                            self.logger.warning(f"🔍 ANCHOR SEARCH: ⏰ Timeout reached at n-gram {n}, stopping")
+                            break
+
+                    anchors = self._process_ngram_length(
+                        n, trans_words, all_words, ref_texts_clean, ref_words, self.min_sources
+                    )
+                    candidate_anchors.extend(anchors)
+
+                    # Batch logging
+                    batch_results.append((n, len(anchors)))
+
+                    # Log progress every batch_size results or on the last result
+                    if (i + 1) % batch_size == 0 or (i + 1) == len(n_gram_lengths):
+                        total_anchors_in_batch = sum(anchor_count for _, anchor_count in batch_results)
+                        n_gram_ranges = [str(ng) for ng, _ in batch_results]
+                        range_str = f"{n_gram_ranges[0]}-{n_gram_ranges[-1]}" if len(n_gram_ranges) > 1 else n_gram_ranges[0]
+                        self.logger.debug(f"🔍 ANCHOR SEARCH: Completed n-gram lengths {range_str} - found {total_anchors_in_batch} anchors")
+                        batch_results = []
+
+                except Exception as e:
+                    self.logger.warning(f"🔍 ANCHOR SEARCH: ⚠️ n-gram length {n} failed: {str(e)}")
+                    batch_results.append((n, 0))
+                    continue
 
         self.logger.info(f"🔍 ANCHOR SEARCH: ✅ Found {len(candidate_anchors)} candidate anchors in {time.time() - start_time:.1f}s")
 
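The parallel path above is a fan-out/collect over independent n-gram lengths with a soft deadline checked as results arrive. A minimal standalone sketch of that shape, assuming a `work(n)` callable that returns a list; the env-var name mirrors the diff, but the function itself is illustrative:

```python
import os
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Callable, Optional, Sequence


def fan_out(lengths: Sequence[int], work: Callable[[int], list],
            timeout_s: float, max_workers: Optional[int] = None) -> list:
    """Run work(n) for each length in parallel, collecting until a soft deadline."""
    max_workers = max_workers or int(os.getenv("ANCHOR_SEARCH_WORKERS", "4"))
    start = time.time()
    results: list = []
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {pool.submit(work, n): n for n in lengths}
        for future in as_completed(futures):
            # Soft deadline: stop collecting, cancel anything not yet started.
            if timeout_s > 0 and time.time() - start > timeout_s:
                for f in futures:
                    f.cancel()  # no effect on tasks that are already running
                break
            try:
                results.extend(future.result())
            except Exception as exc:  # one failed length shouldn't sink the rest
                print(f"length {futures[future]} failed: {exc}")
    return results
```

This is safe to thread precisely because the per-length worker avoids shared mutable state (`_process_ngram_length_no_state` defers deduplication to the overlap filter, as the diff's comments note), and `Future.cancel()` only stops tasks that have not started, so in-flight lengths still run to completion.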