karaoke-gen 0.90.1__py3-none-any.whl → 0.99.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. backend/.coveragerc +20 -0
  2. backend/.gitignore +37 -0
  3. backend/Dockerfile +43 -0
  4. backend/Dockerfile.base +74 -0
  5. backend/README.md +242 -0
  6. backend/__init__.py +0 -0
  7. backend/api/__init__.py +0 -0
  8. backend/api/dependencies.py +457 -0
  9. backend/api/routes/__init__.py +0 -0
  10. backend/api/routes/admin.py +835 -0
  11. backend/api/routes/audio_search.py +913 -0
  12. backend/api/routes/auth.py +348 -0
  13. backend/api/routes/file_upload.py +2112 -0
  14. backend/api/routes/health.py +409 -0
  15. backend/api/routes/internal.py +435 -0
  16. backend/api/routes/jobs.py +1629 -0
  17. backend/api/routes/review.py +652 -0
  18. backend/api/routes/themes.py +162 -0
  19. backend/api/routes/users.py +1513 -0
  20. backend/config.py +172 -0
  21. backend/main.py +157 -0
  22. backend/middleware/__init__.py +5 -0
  23. backend/middleware/audit_logging.py +124 -0
  24. backend/models/__init__.py +0 -0
  25. backend/models/job.py +519 -0
  26. backend/models/requests.py +123 -0
  27. backend/models/theme.py +153 -0
  28. backend/models/user.py +254 -0
  29. backend/models/worker_log.py +164 -0
  30. backend/pyproject.toml +29 -0
  31. backend/quick-check.sh +93 -0
  32. backend/requirements.txt +29 -0
  33. backend/run_tests.sh +60 -0
  34. backend/services/__init__.py +0 -0
  35. backend/services/audio_analysis_service.py +243 -0
  36. backend/services/audio_editing_service.py +278 -0
  37. backend/services/audio_search_service.py +702 -0
  38. backend/services/auth_service.py +630 -0
  39. backend/services/credential_manager.py +792 -0
  40. backend/services/discord_service.py +172 -0
  41. backend/services/dropbox_service.py +301 -0
  42. backend/services/email_service.py +1093 -0
  43. backend/services/encoding_interface.py +454 -0
  44. backend/services/encoding_service.py +502 -0
  45. backend/services/firestore_service.py +512 -0
  46. backend/services/flacfetch_client.py +573 -0
  47. backend/services/gce_encoding/README.md +72 -0
  48. backend/services/gce_encoding/__init__.py +22 -0
  49. backend/services/gce_encoding/main.py +589 -0
  50. backend/services/gce_encoding/requirements.txt +16 -0
  51. backend/services/gdrive_service.py +356 -0
  52. backend/services/job_logging.py +258 -0
  53. backend/services/job_manager.py +853 -0
  54. backend/services/job_notification_service.py +271 -0
  55. backend/services/langfuse_preloader.py +98 -0
  56. backend/services/local_encoding_service.py +590 -0
  57. backend/services/local_preview_encoding_service.py +407 -0
  58. backend/services/lyrics_cache_service.py +216 -0
  59. backend/services/metrics.py +413 -0
  60. backend/services/nltk_preloader.py +122 -0
  61. backend/services/packaging_service.py +287 -0
  62. backend/services/rclone_service.py +106 -0
  63. backend/services/spacy_preloader.py +65 -0
  64. backend/services/storage_service.py +209 -0
  65. backend/services/stripe_service.py +371 -0
  66. backend/services/structured_logging.py +254 -0
  67. backend/services/template_service.py +330 -0
  68. backend/services/theme_service.py +469 -0
  69. backend/services/tracing.py +543 -0
  70. backend/services/user_service.py +721 -0
  71. backend/services/worker_service.py +558 -0
  72. backend/services/youtube_service.py +112 -0
  73. backend/services/youtube_upload_service.py +445 -0
  74. backend/tests/__init__.py +4 -0
  75. backend/tests/conftest.py +224 -0
  76. backend/tests/emulator/__init__.py +7 -0
  77. backend/tests/emulator/conftest.py +109 -0
  78. backend/tests/emulator/test_e2e_cli_backend.py +1053 -0
  79. backend/tests/emulator/test_emulator_integration.py +356 -0
  80. backend/tests/emulator/test_style_loading_direct.py +436 -0
  81. backend/tests/emulator/test_worker_logs_direct.py +229 -0
  82. backend/tests/emulator/test_worker_logs_subcollection.py +443 -0
  83. backend/tests/requirements-test.txt +10 -0
  84. backend/tests/requirements.txt +6 -0
  85. backend/tests/test_admin_email_endpoints.py +411 -0
  86. backend/tests/test_api_integration.py +460 -0
  87. backend/tests/test_api_routes.py +93 -0
  88. backend/tests/test_audio_analysis_service.py +294 -0
  89. backend/tests/test_audio_editing_service.py +386 -0
  90. backend/tests/test_audio_search.py +1398 -0
  91. backend/tests/test_audio_services.py +378 -0
  92. backend/tests/test_auth_firestore.py +231 -0
  93. backend/tests/test_config_extended.py +68 -0
  94. backend/tests/test_credential_manager.py +377 -0
  95. backend/tests/test_dependencies.py +54 -0
  96. backend/tests/test_discord_service.py +244 -0
  97. backend/tests/test_distribution_services.py +820 -0
  98. backend/tests/test_dropbox_service.py +472 -0
  99. backend/tests/test_email_service.py +492 -0
  100. backend/tests/test_emulator_integration.py +322 -0
  101. backend/tests/test_encoding_interface.py +412 -0
  102. backend/tests/test_file_upload.py +1739 -0
  103. backend/tests/test_flacfetch_client.py +632 -0
  104. backend/tests/test_gdrive_service.py +524 -0
  105. backend/tests/test_instrumental_api.py +431 -0
  106. backend/tests/test_internal_api.py +343 -0
  107. backend/tests/test_job_creation_regression.py +583 -0
  108. backend/tests/test_job_manager.py +356 -0
  109. backend/tests/test_job_manager_notifications.py +329 -0
  110. backend/tests/test_job_notification_service.py +443 -0
  111. backend/tests/test_jobs_api.py +283 -0
  112. backend/tests/test_local_encoding_service.py +423 -0
  113. backend/tests/test_local_preview_encoding_service.py +567 -0
  114. backend/tests/test_main.py +87 -0
  115. backend/tests/test_models.py +918 -0
  116. backend/tests/test_packaging_service.py +382 -0
  117. backend/tests/test_requests.py +201 -0
  118. backend/tests/test_routes_jobs.py +282 -0
  119. backend/tests/test_routes_review.py +337 -0
  120. backend/tests/test_services.py +556 -0
  121. backend/tests/test_services_extended.py +112 -0
  122. backend/tests/test_spacy_preloader.py +119 -0
  123. backend/tests/test_storage_service.py +448 -0
  124. backend/tests/test_style_upload.py +261 -0
  125. backend/tests/test_template_service.py +295 -0
  126. backend/tests/test_theme_service.py +516 -0
  127. backend/tests/test_unicode_sanitization.py +522 -0
  128. backend/tests/test_upload_api.py +256 -0
  129. backend/tests/test_validate.py +156 -0
  130. backend/tests/test_video_worker_orchestrator.py +847 -0
  131. backend/tests/test_worker_log_subcollection.py +509 -0
  132. backend/tests/test_worker_logging.py +365 -0
  133. backend/tests/test_workers.py +1116 -0
  134. backend/tests/test_workers_extended.py +178 -0
  135. backend/tests/test_youtube_service.py +247 -0
  136. backend/tests/test_youtube_upload_service.py +568 -0
  137. backend/utils/test_data.py +27 -0
  138. backend/validate.py +173 -0
  139. backend/version.py +27 -0
  140. backend/workers/README.md +597 -0
  141. backend/workers/__init__.py +11 -0
  142. backend/workers/audio_worker.py +618 -0
  143. backend/workers/lyrics_worker.py +683 -0
  144. backend/workers/render_video_worker.py +483 -0
  145. backend/workers/screens_worker.py +535 -0
  146. backend/workers/style_helper.py +198 -0
  147. backend/workers/video_worker.py +1277 -0
  148. backend/workers/video_worker_orchestrator.py +701 -0
  149. backend/workers/worker_logging.py +278 -0
  150. karaoke_gen/instrumental_review/static/index.html +7 -4
  151. karaoke_gen/karaoke_finalise/karaoke_finalise.py +6 -1
  152. karaoke_gen/utils/__init__.py +163 -8
  153. karaoke_gen/video_background_processor.py +9 -4
  154. {karaoke_gen-0.90.1.dist-info → karaoke_gen-0.99.3.dist-info}/METADATA +1 -1
  155. {karaoke_gen-0.90.1.dist-info → karaoke_gen-0.99.3.dist-info}/RECORD +196 -46
  156. lyrics_transcriber/correction/agentic/agent.py +17 -6
  157. lyrics_transcriber/correction/agentic/providers/config.py +9 -5
  158. lyrics_transcriber/correction/agentic/providers/langchain_bridge.py +96 -93
  159. lyrics_transcriber/correction/agentic/providers/model_factory.py +27 -6
  160. lyrics_transcriber/correction/anchor_sequence.py +151 -37
  161. lyrics_transcriber/correction/corrector.py +192 -130
  162. lyrics_transcriber/correction/handlers/syllables_match.py +44 -2
  163. lyrics_transcriber/correction/operations.py +24 -9
  164. lyrics_transcriber/correction/phrase_analyzer.py +18 -0
  165. lyrics_transcriber/frontend/package-lock.json +2 -2
  166. lyrics_transcriber/frontend/package.json +1 -1
  167. lyrics_transcriber/frontend/src/components/AIFeedbackModal.tsx +1 -1
  168. lyrics_transcriber/frontend/src/components/CorrectedWordWithActions.tsx +11 -7
  169. lyrics_transcriber/frontend/src/components/EditActionBar.tsx +31 -5
  170. lyrics_transcriber/frontend/src/components/EditModal.tsx +28 -10
  171. lyrics_transcriber/frontend/src/components/EditTimelineSection.tsx +123 -27
  172. lyrics_transcriber/frontend/src/components/EditWordList.tsx +112 -60
  173. lyrics_transcriber/frontend/src/components/Header.tsx +90 -76
  174. lyrics_transcriber/frontend/src/components/LyricsAnalyzer.tsx +53 -31
  175. lyrics_transcriber/frontend/src/components/LyricsSynchronizer/SyncControls.tsx +44 -13
  176. lyrics_transcriber/frontend/src/components/LyricsSynchronizer/TimelineCanvas.tsx +66 -50
  177. lyrics_transcriber/frontend/src/components/LyricsSynchronizer/index.tsx +124 -30
  178. lyrics_transcriber/frontend/src/components/ReferenceView.tsx +1 -1
  179. lyrics_transcriber/frontend/src/components/TimelineEditor.tsx +12 -5
  180. lyrics_transcriber/frontend/src/components/TimingOffsetModal.tsx +3 -3
  181. lyrics_transcriber/frontend/src/components/TranscriptionView.tsx +1 -1
  182. lyrics_transcriber/frontend/src/components/WordDivider.tsx +11 -7
  183. lyrics_transcriber/frontend/src/components/shared/components/Word.tsx +4 -2
  184. lyrics_transcriber/frontend/src/hooks/useManualSync.ts +103 -1
  185. lyrics_transcriber/frontend/src/theme.ts +42 -15
  186. lyrics_transcriber/frontend/tsconfig.tsbuildinfo +1 -1
  187. lyrics_transcriber/frontend/vite.config.js +5 -0
  188. lyrics_transcriber/frontend/web_assets/assets/{index-BECn1o8Q.js → index-BSMgOq4Z.js} +6959 -5782
  189. lyrics_transcriber/frontend/web_assets/assets/index-BSMgOq4Z.js.map +1 -0
  190. lyrics_transcriber/frontend/web_assets/index.html +6 -2
  191. lyrics_transcriber/frontend/web_assets/nomad-karaoke-logo.svg +5 -0
  192. lyrics_transcriber/output/generator.py +17 -3
  193. lyrics_transcriber/output/video.py +60 -95
  194. lyrics_transcriber/frontend/web_assets/assets/index-BECn1o8Q.js.map +0 -1
  195. {karaoke_gen-0.90.1.dist-info → karaoke_gen-0.99.3.dist-info}/WHEEL +0 -0
  196. {karaoke_gen-0.90.1.dist-info → karaoke_gen-0.99.3.dist-info}/entry_points.txt +0 -0
  197. {karaoke_gen-0.90.1.dist-info → karaoke_gen-0.99.3.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,413 @@
1
+ """
2
+ Custom metrics for Cloud Monitoring.
3
+
4
+ This module provides application-level metrics for tracking job processing,
5
+ worker performance, and external API usage. Metrics can be viewed in:
6
+ 1. Cloud Logging (via structured log entries)
7
+ 2. Cloud Monitoring (via log-based metrics or OpenTelemetry)
8
+
9
+ The metrics service uses a pragmatic approach:
10
+ - Always emits metrics as structured log entries (works immediately)
11
+ - Uses the same JSON format as Cloud Logging
12
+ - Can be enhanced with OpenTelemetry metrics exporters when available
13
+
14
+ Usage:
15
+ from backend.services.metrics import metrics
16
+
17
+ # Record a job completion
18
+ metrics.record_job_completed("abc123", source="upload")
19
+
20
+ # Record worker duration
21
+ metrics.record_worker_duration("audio", 45.2, success=True)
22
+
23
+ # Record external API call
24
+ with metrics.time_external_api("modal"):
25
+ response = await modal_client.separate_audio(...)
26
+ """
27
+ import logging
28
+ import time
29
+ from contextlib import contextmanager
30
+ from dataclasses import dataclass, field
31
+ from typing import Any, Dict, Optional
32
+
33
+ from backend.services.tracing import get_current_trace_id, get_current_span_id
34
+
35
+
36
+ logger = logging.getLogger("metrics")
37
+
38
+
39
@dataclass
class MetricLabels:
    """Bag of optional label values that can be attached to a metric.

    Every field defaults to ``None``; unset fields are dropped by
    :meth:`to_dict` so only populated labels are emitted.
    """

    job_id: Optional[str] = None
    worker: Optional[str] = None
    status: Optional[str] = None
    source: Optional[str] = None
    api: Optional[str] = None
    operation: Optional[str] = None
    bucket: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return the instance attributes as a dict, skipping ``None`` values."""
        populated: Dict[str, Any] = {}
        for name, value in vars(self).items():
            if value is None:
                continue
            populated[name] = value
        return populated
53
+
54
+
55
class MetricsService:
    """
    Application metrics service.

    Emits metrics as structured log entries that can be:
    1. Queried directly in Cloud Logging
    2. Converted to Cloud Monitoring metrics via log-based metrics
    3. Exported via OpenTelemetry (when configured)
    """

    def __init__(self):
        """Initialize the metrics service."""
        self._logger = logging.getLogger("metrics")
        # Ensure metrics logger outputs at INFO level
        self._logger.setLevel(logging.INFO)

    def _emit_metric(self, metric_name: str, metric_type: str, value: float, labels: Dict[str, Any]) -> None:
        """
        Emit a metric as a structured log entry.

        The log format is designed to be easily parsed by Cloud Logging
        and converted to log-based metrics.

        Args:
            metric_name: Name of the metric (e.g., "jobs_total")
            metric_type: Type of metric (counter, histogram, gauge)
            value: Metric value
            labels: Metric labels/dimensions
        """
        # Add trace context if available
        trace_id = get_current_trace_id()
        span_id = get_current_span_id()

        # Build metric entry. Labels are merged last, so a label named like
        # one of the three metric_* keys would override it.
        metric_entry = {
            "metric_name": metric_name,
            "metric_type": metric_type,
            "metric_value": value,
            **labels,
        }

        if trace_id:
            metric_entry["trace_id"] = trace_id
        if span_id:
            metric_entry["span_id"] = span_id

        # Emit as structured log entry at INFO level so metrics always show up.
        # NOTE: keys passed via ``extra`` become LogRecord attributes; label
        # names must not collide with built-in LogRecord attribute names
        # (e.g. "message", "name"), otherwise logging raises KeyError.
        self._logger.info(
            f"METRIC {metric_name}={value}",
            extra=metric_entry
        )

    # =========================================
    # Job Metrics
    # =========================================

    def record_job_created(self, job_id: str, source: str = "unknown") -> None:
        """
        Record a new job creation.

        Args:
            job_id: Job ID
            source: Job source (upload, url, search)
        """
        self._emit_metric(
            metric_name="jobs_total",
            metric_type="counter",
            value=1,
            labels={"job_id": job_id, "status": "created", "source": source}
        )

    def record_job_completed(self, job_id: str, source: str = "unknown") -> None:
        """
        Record a job completion.

        Args:
            job_id: Job ID
            source: Job source (upload, url, search)
        """
        self._emit_metric(
            metric_name="jobs_total",
            metric_type="counter",
            value=1,
            labels={"job_id": job_id, "status": "completed", "source": source}
        )

    def record_job_failed(self, job_id: str, source: str = "unknown", error: Optional[str] = None) -> None:
        """
        Record a job failure.

        Args:
            job_id: Job ID
            source: Job source (upload, url, search)
            error: Optional error message (truncated to 200 characters)
        """
        labels = {"job_id": job_id, "status": "failed", "source": source}
        if error:
            labels["error"] = error[:200]  # Truncate long errors
        self._emit_metric(
            metric_name="jobs_total",
            metric_type="counter",
            value=1,
            labels=labels
        )

    def record_job_duration(self, job_id: str, duration_seconds: float, source: str = "unknown") -> None:
        """
        Record total job processing duration.

        Args:
            job_id: Job ID
            duration_seconds: Total processing time in seconds
            source: Job source (upload, url, search)
        """
        self._emit_metric(
            metric_name="job_duration_seconds",
            metric_type="histogram",
            value=duration_seconds,
            labels={"job_id": job_id, "source": source}
        )

    # =========================================
    # Worker Metrics
    # =========================================

    def record_worker_started(self, worker: str, job_id: str) -> None:
        """
        Record a worker invocation start.

        Args:
            worker: Worker name (audio, lyrics, screens, video, render_video)
            job_id: Job ID
        """
        self._emit_metric(
            metric_name="worker_invocations_total",
            metric_type="counter",
            value=1,
            labels={"worker": worker, "job_id": job_id, "status": "started"}
        )

    def record_worker_duration(self, worker: str, duration_seconds: float, success: bool, job_id: Optional[str] = None) -> None:
        """
        Record worker processing duration.

        Emits two entries: a ``job_stage_duration_seconds`` histogram and a
        ``worker_invocations_total`` counter carrying the success flag.

        Args:
            worker: Worker name
            duration_seconds: Processing time in seconds
            success: Whether worker completed successfully
            job_id: Optional job ID
        """
        success_label = str(success).lower()

        # The histogram omits job_id when it is not provided...
        labels = {
            "worker": worker,
            "success": success_label,
        }
        if job_id:
            labels["job_id"] = job_id

        self._emit_metric(
            metric_name="job_stage_duration_seconds",
            metric_type="histogram",
            value=duration_seconds,
            labels=labels
        )

        # ...whereas the success/failure counter always carries a job_id
        # label, falling back to "unknown".
        self._emit_metric(
            metric_name="worker_invocations_total",
            metric_type="counter",
            value=1,
            labels={"worker": worker, "success": success_label, "job_id": job_id or "unknown"}
        )

    # =========================================
    # GCS Metrics
    # =========================================

    def record_gcs_operation(
        self,
        operation: str,
        bucket: str,
        success: bool,
        size_bytes: Optional[int] = None,
        duration_seconds: Optional[float] = None,
        job_id: Optional[str] = None,
    ) -> None:
        """
        Record a GCS operation.

        Always emits a ``gcs_operations_total`` counter; the size and duration
        histograms are emitted only when the corresponding value is given.

        Args:
            operation: Operation type (upload, download, delete)
            bucket: GCS bucket name
            success: Whether operation succeeded
            size_bytes: Optional file size in bytes
            duration_seconds: Optional operation duration
            job_id: Optional job ID
        """
        labels = {
            "operation": operation,
            "bucket": bucket,
            "success": str(success).lower(),
        }
        if job_id:
            labels["job_id"] = job_id

        self._emit_metric(
            metric_name="gcs_operations_total",
            metric_type="counter",
            value=1,
            labels=labels
        )

        # ``is not None`` (rather than truthiness) so that a legitimate
        # zero-byte object or 0.0s duration is still recorded.
        if size_bytes is not None:
            self._emit_metric(
                metric_name="gcs_operation_bytes",
                metric_type="histogram",
                value=size_bytes,
                labels=dict(labels)
            )

        if duration_seconds is not None:
            self._emit_metric(
                metric_name="gcs_operation_duration_seconds",
                metric_type="histogram",
                value=duration_seconds,
                labels=dict(labels)
            )

    # =========================================
    # External API Metrics
    # =========================================

    def record_external_api_call(
        self,
        api: str,
        success: bool,
        duration_seconds: float,
        job_id: Optional[str] = None,
        error: Optional[str] = None,
    ) -> None:
        """
        Record an external API call.

        Emits an ``external_api_calls_total`` counter (with job_id/error labels
        when provided) and an ``external_api_duration_seconds`` histogram
        (labelled by api and success only).

        Args:
            api: API name (modal, audioshake, genius, spotify)
            success: Whether call succeeded
            duration_seconds: API call duration
            job_id: Optional job ID
            error: Optional error message (truncated to 100 characters)
        """
        success_label = str(success).lower()

        labels = {
            "api": api,
            "success": success_label,
        }
        if job_id:
            labels["job_id"] = job_id
        if error:
            labels["error"] = error[:100]  # Truncate

        self._emit_metric(
            metric_name="external_api_calls_total",
            metric_type="counter",
            value=1,
            labels=labels
        )

        self._emit_metric(
            metric_name="external_api_duration_seconds",
            metric_type="histogram",
            value=duration_seconds,
            labels={"api": api, "success": success_label}
        )

    @contextmanager
    def time_external_api(self, api: str, job_id: Optional[str] = None):
        """
        Context manager to time an external API call.

        The call is assumed successful unless the body raises or calls
        ``timer.set_success(False)``. The metric is recorded on exit in all
        cases, and any exception from the body is re-raised unchanged.

        Usage:
            with metrics.time_external_api("modal", job_id) as timer:
                response = await client.call_api()
                timer.set_success(True)

        Args:
            api: API name
            job_id: Optional job ID

        Yields:
            Timer object with set_success() method
        """
        timer = _ApiTimer()
        # perf_counter is monotonic, so the measured duration cannot be
        # skewed (or go negative) if the system clock is adjusted mid-call.
        start_time = time.perf_counter()

        try:
            yield timer
        except BaseException as exc:
            # BaseException (not just Exception) so that cancellation
            # (asyncio.CancelledError) and interrupts are not recorded as
            # successful calls.
            timer.set_success(False)
            # Fall back to the exception type when the message is empty so
            # the failure still carries an error label.
            timer.error = str(exc) or type(exc).__name__
            raise
        finally:
            duration = time.perf_counter() - start_time
            self.record_external_api_call(
                api=api,
                success=timer.success,
                duration_seconds=duration,
                job_id=job_id,
                error=timer.error,
            )

    @contextmanager
    def time_worker(self, worker: str, job_id: str):
        """
        Context manager to time a worker execution.

        Records a "started" invocation on entry and the duration plus
        success/failure on exit. The worker is assumed successful unless the
        body raises or calls ``timer.set_success(False)``; exceptions from the
        body are re-raised unchanged.

        Usage:
            with metrics.time_worker("audio", job_id) as timer:
                await process_audio()
                timer.set_success(True)

        Args:
            worker: Worker name
            job_id: Job ID

        Yields:
            Timer object with set_success() method
        """
        timer = _ApiTimer()
        self.record_worker_started(worker, job_id)
        # Monotonic clock: immune to wall-clock adjustments during the run.
        start_time = time.perf_counter()

        try:
            yield timer
        except BaseException:
            # Includes cancellation/interrupts, which must count as failures.
            timer.set_success(False)
            raise
        finally:
            duration = time.perf_counter() - start_time
            self.record_worker_duration(
                worker=worker,
                duration_seconds=duration,
                success=timer.success,
                job_id=job_id,
            )
398
+
399
+
400
+ class _ApiTimer:
401
+ """Helper class for tracking API call success state."""
402
+
403
+ def __init__(self):
404
+ self.success = True # Assume success unless set otherwise
405
+ self.error: Optional[str] = None
406
+
407
+ def set_success(self, success: bool) -> None:
408
+ self.success = success
409
+
410
+
411
+ # Global metrics instance
412
+ metrics = MetricsService()
413
+
@@ -0,0 +1,122 @@
1
+ """NLTK resource preloader for container startup.
2
+
3
+ Loads NLTK data at container startup to avoid slow downloads during request processing.
4
+ Cloud Run's ephemeral filesystem means NLTK data must be re-downloaded on each cold start,
5
+ which can take 30-100+ seconds for cmudict.
6
+
7
+ See docs/archive/2026-01-08-performance-investigation.md for background.
8
+ """
9
+
10
+ import logging
11
+ import time
12
+ from typing import Optional, Dict, Any
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+ # Singleton storage for preloaded resources
17
+ _preloaded_resources: Dict[str, Any] = {}
18
+
19
+
20
def preload_nltk_cmudict() -> None:
    """Preload NLTK's CMU Pronouncing Dictionary at startup.

    The cmudict is used by SyllablesMatchHandler for syllable counting.
    Without preloading, each SyllablesMatchHandler init downloads ~30MB,
    which took 50-100+ seconds in Cloud Run.

    The loaded dictionary is stored in the module-level
    ``_preloaded_resources`` under the key ``"cmudict"``. Calling this again
    once it is loaded is a no-op.

    Raises:
        Exception: Any failure (nltk missing, download failure, corpus still
            unavailable after download) is logged and re-raised.
    """
    if "cmudict" in _preloaded_resources:
        logger.info("NLTK cmudict already preloaded")
        return

    logger.info("Preloading NLTK cmudict...")
    # Monotonic clock so the reported elapsed time is not affected by
    # wall-clock adjustments.
    start_time = time.perf_counter()

    try:
        import nltk

        # Load the dictionary once and keep the result. Accessing the corpus
        # raises LookupError when the data has not been downloaded yet; in
        # that case download it and load again. (Probing with a throwaway
        # cmudict.dict() call and then calling it a second time would build
        # the whole mapping twice.)
        try:
            from nltk.corpus import cmudict

            cmu_dict = cmudict.dict()
        except LookupError:
            logger.info("Downloading NLTK cmudict data...")
            nltk.download("cmudict", quiet=True)
            from nltk.corpus import cmudict

            cmu_dict = cmudict.dict()

        _preloaded_resources["cmudict"] = cmu_dict

        elapsed = time.perf_counter() - start_time
        logger.info(f"NLTK cmudict preloaded in {elapsed:.2f}s ({len(cmu_dict)} entries)")

    except Exception as e:
        logger.error(f"Failed to preload NLTK cmudict: {e}")
        raise
60
+
61
+
62
def get_preloaded_cmudict() -> Optional[Dict]:
    """Return the cached CMU dictionary, if one has been stored.

    Returns:
        The object stored under ``"cmudict"`` in the preloaded-resources
        cache, or None when nothing is stored under that key.
    """
    try:
        return _preloaded_resources["cmudict"]
    except KeyError:
        return None
69
+
70
+
71
def is_cmudict_preloaded() -> bool:
    """Report whether a ``"cmudict"`` entry exists in the preloaded-resources cache."""
    already_loaded = "cmudict" in _preloaded_resources
    return already_loaded
74
+
75
+
76
def preload_nltk_punkt() -> None:
    """Preload NLTK's punkt tokenizer (optional, used for sentence tokenization).

    On success, ``_preloaded_resources["punkt"]`` is set to True. Failures are
    treated as non-critical: they are logged as a warning and swallowed, and
    punkt is left unmarked so that a later call can retry.
    """
    if "punkt" in _preloaded_resources:
        logger.info("NLTK punkt already preloaded")
        return

    logger.info("Preloading NLTK punkt tokenizer...")
    # Monotonic clock so the reported elapsed time is not affected by
    # wall-clock adjustments.
    start_time = time.perf_counter()

    try:
        import nltk

        try:
            from nltk.tokenize import word_tokenize

            # Test it works - raises LookupError if the data is missing
            word_tokenize("test")
        except LookupError:
            logger.info("Downloading NLTK punkt data...")
            nltk.download("punkt", quiet=True)
            nltk.download("punkt_tab", quiet=True)

            # A quiet download can fail without raising, so verify the
            # tokenizer actually works before marking punkt as preloaded.
            # A LookupError here falls through to the warning handler below.
            from nltk.tokenize import word_tokenize

            word_tokenize("test")

        _preloaded_resources["punkt"] = True

        elapsed = time.perf_counter() - start_time
        logger.info(f"NLTK punkt preloaded in {elapsed:.2f}s")

    except Exception as e:
        logger.warning(f"Failed to preload NLTK punkt (non-critical): {e}")
107
+
108
+
109
def preload_all_nltk_resources() -> None:
    """Preload all NLTK resources used by the application.

    cmudict is loaded first and its failures propagate to the caller. punkt is
    optional: any exception it raises is logged and suppressed.

    Raises:
        Exception: Whatever ``preload_nltk_cmudict`` raises.
    """
    preload_nltk_cmudict()
    # punkt is optional and less critical
    try:
        preload_nltk_punkt()
    except Exception:
        # Still best-effort, but leave a trace instead of failing silently so
        # an unexpected error is visible in the startup logs.
        logger.warning("Unexpected error while preloading NLTK punkt (non-critical)", exc_info=True)
117
+
118
+
119
def clear_preloaded_resources() -> None:
    """Empty the preloaded-resources cache in place. Useful for testing."""
    # The dict object itself is kept (only its contents are removed), so no
    # rebinding of the module-level name is involved.
    _preloaded_resources.clear()