expops-0.1.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. expops-0.1.3.dist-info/METADATA +826 -0
  2. expops-0.1.3.dist-info/RECORD +86 -0
  3. expops-0.1.3.dist-info/WHEEL +5 -0
  4. expops-0.1.3.dist-info/entry_points.txt +3 -0
  5. expops-0.1.3.dist-info/licenses/LICENSE +674 -0
  6. expops-0.1.3.dist-info/top_level.txt +1 -0
  7. mlops/__init__.py +0 -0
  8. mlops/__main__.py +11 -0
  9. mlops/_version.py +34 -0
  10. mlops/adapters/__init__.py +12 -0
  11. mlops/adapters/base.py +86 -0
  12. mlops/adapters/config_schema.py +89 -0
  13. mlops/adapters/custom/__init__.py +3 -0
  14. mlops/adapters/custom/custom_adapter.py +447 -0
  15. mlops/adapters/plugin_manager.py +113 -0
  16. mlops/adapters/sklearn/__init__.py +3 -0
  17. mlops/adapters/sklearn/adapter.py +94 -0
  18. mlops/cluster/__init__.py +3 -0
  19. mlops/cluster/controller.py +496 -0
  20. mlops/cluster/process_runner.py +91 -0
  21. mlops/cluster/providers.py +258 -0
  22. mlops/core/__init__.py +95 -0
  23. mlops/core/custom_model_base.py +38 -0
  24. mlops/core/dask_networkx_executor.py +1265 -0
  25. mlops/core/executor_worker.py +1239 -0
  26. mlops/core/experiment_tracker.py +81 -0
  27. mlops/core/graph_types.py +64 -0
  28. mlops/core/networkx_parser.py +135 -0
  29. mlops/core/payload_spill.py +278 -0
  30. mlops/core/pipeline_utils.py +162 -0
  31. mlops/core/process_hashing.py +216 -0
  32. mlops/core/step_state_manager.py +1298 -0
  33. mlops/core/step_system.py +956 -0
  34. mlops/core/workspace.py +99 -0
  35. mlops/environment/__init__.py +10 -0
  36. mlops/environment/base.py +43 -0
  37. mlops/environment/conda_manager.py +307 -0
  38. mlops/environment/factory.py +70 -0
  39. mlops/environment/pyenv_manager.py +146 -0
  40. mlops/environment/setup_env.py +31 -0
  41. mlops/environment/system_manager.py +66 -0
  42. mlops/environment/utils.py +105 -0
  43. mlops/environment/venv_manager.py +134 -0
  44. mlops/main.py +527 -0
  45. mlops/managers/project_manager.py +400 -0
  46. mlops/managers/reproducibility_manager.py +575 -0
  47. mlops/platform.py +996 -0
  48. mlops/reporting/__init__.py +16 -0
  49. mlops/reporting/context.py +187 -0
  50. mlops/reporting/entrypoint.py +292 -0
  51. mlops/reporting/kv_utils.py +77 -0
  52. mlops/reporting/registry.py +50 -0
  53. mlops/runtime/__init__.py +9 -0
  54. mlops/runtime/context.py +34 -0
  55. mlops/runtime/env_export.py +113 -0
  56. mlops/storage/__init__.py +12 -0
  57. mlops/storage/adapters/__init__.py +9 -0
  58. mlops/storage/adapters/gcp_kv_store.py +778 -0
  59. mlops/storage/adapters/gcs_object_store.py +96 -0
  60. mlops/storage/adapters/memory_store.py +240 -0
  61. mlops/storage/adapters/redis_store.py +438 -0
  62. mlops/storage/factory.py +199 -0
  63. mlops/storage/interfaces/__init__.py +6 -0
  64. mlops/storage/interfaces/kv_store.py +118 -0
  65. mlops/storage/path_utils.py +38 -0
  66. mlops/templates/premier-league/charts/plot_metrics.js +70 -0
  67. mlops/templates/premier-league/charts/plot_metrics.py +145 -0
  68. mlops/templates/premier-league/charts/requirements.txt +6 -0
  69. mlops/templates/premier-league/configs/cluster_config.yaml +13 -0
  70. mlops/templates/premier-league/configs/project_config.yaml +207 -0
  71. mlops/templates/premier-league/data/England CSV.csv +12154 -0
  72. mlops/templates/premier-league/models/premier_league_model.py +638 -0
  73. mlops/templates/premier-league/requirements.txt +8 -0
  74. mlops/templates/sklearn-basic/README.md +22 -0
  75. mlops/templates/sklearn-basic/charts/plot_metrics.py +85 -0
  76. mlops/templates/sklearn-basic/charts/requirements.txt +3 -0
  77. mlops/templates/sklearn-basic/configs/project_config.yaml +64 -0
  78. mlops/templates/sklearn-basic/data/train.csv +14 -0
  79. mlops/templates/sklearn-basic/models/model.py +62 -0
  80. mlops/templates/sklearn-basic/requirements.txt +10 -0
  81. mlops/web/__init__.py +3 -0
  82. mlops/web/server.py +585 -0
  83. mlops/web/ui/index.html +52 -0
  84. mlops/web/ui/mlops-charts.js +357 -0
  85. mlops/web/ui/script.js +1244 -0
  86. mlops/web/ui/styles.css +248 -0
mlops/storage/adapters/gcp_kv_store.py
@@ -0,0 +1,778 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Dict, Optional, List, Tuple
4
+ import json
5
+ import logging
6
+ import os
7
+ import time
8
+ import threading
9
+ from contextlib import contextmanager
10
+
11
+ from ..interfaces.kv_store import KeyValueEventStore
12
+ from ..path_utils import encode_probe_path
13
+
14
+ try:
15
+ from google.cloud import firestore_v1 as firestore # type: ignore
16
+ from google.cloud import pubsub_v1 as pubsub # type: ignore
17
+ from google.api_core import exceptions as gax_exceptions # type: ignore
18
+ except Exception: # pragma: no cover - optional dependency
19
+ firestore = None # type: ignore
20
+ pubsub = None # type: ignore
21
+ gax_exceptions = None # type: ignore
22
+
23
+
24
+ class GCPStore(KeyValueEventStore):
25
+ """GCP Firestore + Pub/Sub implementation of KeyValueEventStore.
26
+
27
+ Layout in Firestore (collections/documents):
28
+ - mlops_projects (collection)
29
+ - {project_id} (document)
30
+ - step_indices (collection)
31
+ - {process}:{step}:{ih}:{ch}:{fh} (document)
32
+ - process_indices (collection)
33
+ - {process}:{ih}:{ch}:{fh} (document)
34
+ - runs (collection)
35
+ - {run_id} (document) fields: status, timestamps (start,end), metrics (map), stats (map)
36
+ - steps (collection)
37
+ - {process}.{step} (document) record
38
+
39
+ Events are published to Pub/Sub on topic: {topic_name}
40
+ - default topic_name: mlops-projects-{project_id}-events
41
+ """
42
+
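A minimal usage sketch of the layout above, assuming google-cloud-firestore and google-cloud-pubsub are installed (or FIRESTORE_EMULATOR_HOST points at a local emulator); the argument values are illustrative:

    from mlops.storage.adapters.gcp_kv_store import GCPStore

    store = GCPStore(
        project_id="demo",               # MLOps project namespace
        gcp_project="my-gcp-project",    # falls back to GOOGLE_CLOUD_PROJECT if omitted
        emulator_host="localhost:8080",  # optional: local Firestore emulator
    )
    # Documents land under mlops_projects/demo/..., and events are published to
    # the Pub/Sub topic "mlops-projects-demo-events" unless topic_name overrides it.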
43
+ def __init__(
44
+ self,
45
+ project_id: str,
46
+ gcp_project: Optional[str] = None,
47
+ topic_name: Optional[str] = None,
48
+ emulator_host: Optional[str] = None,
49
+ ) -> None:
50
+ if firestore is None or pubsub is None:
51
+ raise RuntimeError("google-cloud-firestore/pubsub not installed. Add google-cloud-firestore and google-cloud-pubsub to dependencies.")
52
+
53
+ # Record emulator host and support emulator for Firestore if provided
54
+ self._emulator_host = emulator_host
55
+ if self._emulator_host:
56
+ os.environ.setdefault("FIRESTORE_EMULATOR_HOST", self._emulator_host)
57
+
58
+ self.project_id = project_id
59
+ self.gcp_project = gcp_project or os.getenv("GOOGLE_CLOUD_PROJECT")
60
+ if not self.gcp_project:
61
+ # Allow running without real GCP project; attempts will fail but caller can catch
62
+ self.gcp_project = project_id
63
+
64
+ self._fs = None
65
+ self._publisher = None
66
+ self._topic_path = None
67
+ self.topic_name = topic_name or f"mlops-projects-{self.project_id}-events"
68
+ # Initialize clients now (driver) so immediate use works; they will be rebuilt lazily on unpickle
69
+ self._init_clients()
70
+
71
+ # Batch writing support to reduce network overhead
72
+ self._batch_writes: List[Tuple[str, Any, Any]] = [] # (operation, args, kwargs)
73
+ self._batch_mode = False
74
+
75
+ # Batch PubSub events to reduce thread creation
76
+ self._batch_events: List[Dict[str, Any]] = []
77
+ self._event_batch_size = 50 # Publish events in batches of 50
78
+ self._max_pending_events = 200 # Max events to queue before forcing flush
79
+ self._events_lock = threading.Lock() # Thread lock for batch_events access
80
+
81
+ # Logger for batch operations
82
+ self.logger = logging.getLogger(__name__)
83
+
84
+ @staticmethod
85
+ def required_env(config: Optional[Dict[str, Any]] = None) -> Dict[str, str]:
86
+ """Return SDK-required env vars based on config (for workers).
87
+
88
+ - GOOGLE_APPLICATION_CREDENTIALS if provided by user/global env
89
+ - GOOGLE_CLOUD_PROJECT if set in config
90
+ - FIRESTORE_EMULATOR_HOST if set in config
91
+ """
92
+ envs: Dict[str, str] = {}
93
+ try:
94
+ cfg = dict(config or {})
95
+ creds = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
96
+ if creds:
97
+ envs["GOOGLE_APPLICATION_CREDENTIALS"] = creds
98
+ gproj = cfg.get("gcp_project") or os.environ.get("GOOGLE_CLOUD_PROJECT")
99
+ if gproj:
100
+ envs["GOOGLE_CLOUD_PROJECT"] = str(gproj)
101
+ emu = cfg.get("emulator_host") or os.environ.get("FIRESTORE_EMULATOR_HOST")
102
+ if emu:
103
+ envs["FIRESTORE_EMULATOR_HOST"] = str(emu)
104
+ except Exception:
105
+ pass
106
+ return envs
107
+
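An illustrative call showing what required_env collects when GOOGLE_APPLICATION_CREDENTIALS is not set; the values below are placeholders:

    env = GCPStore.required_env({
        "gcp_project": "my-gcp-project",
        "emulator_host": "localhost:8080",
    })
    # -> {"GOOGLE_CLOUD_PROJECT": "my-gcp-project",
    #     "FIRESTORE_EMULATOR_HOST": "localhost:8080"}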
108
+ def _init_clients(self) -> None:
109
+ """(Re)initialize Firestore and Pub/Sub clients and derived handles."""
110
+ if firestore is None or pubsub is None:
111
+ raise RuntimeError("google-cloud-firestore/pubsub not installed. Add google-cloud-firestore and google-cloud-pubsub to dependencies.")
112
+ # Respect emulator setting if present
113
+ if self._emulator_host:
114
+ os.environ.setdefault("FIRESTORE_EMULATOR_HOST", self._emulator_host)
115
+ self._fs = firestore.Client(project=self.gcp_project)
116
+ self._root = self._fs.collection("mlops_projects").document(self.project_id)
117
+ # Configure PublisherClient with batch settings to reduce thread creation
118
+ # batch_settings controls how many threads are created for batch operations
119
+ batch_settings = pubsub.types.BatchSettings(
120
+ max_bytes=1 * 1024 * 1024, # 1 MB max batch size
121
+ max_latency=1.0, # 1 second max latency before flushing
122
+ max_messages=100, # Max messages per batch
123
+ )
124
+ # publisher_options can limit thread creation
125
+ publisher_options = pubsub.types.PublisherOptions(
126
+ flow_control=pubsub.types.PublishFlowControl(
127
+ message_limit=500, # Limit pending messages
128
+ byte_limit=5 * 1024 * 1024, # 5 MB limit
129
+ limit_exceeded_behavior=pubsub.types.LimitExceededBehavior.BLOCK, # Block instead of creating more threads
130
+ )
131
+ )
132
+ self._publisher = pubsub.PublisherClient(
133
+ batch_settings=batch_settings,
134
+ publisher_options=publisher_options,
135
+ )
136
+ self._topic_path = self._publisher.topic_path(self.gcp_project, self.topic_name)
137
+ try:
138
+ self._publisher.get_topic(request={"topic": self._topic_path})
139
+ except Exception as e: # pragma: no cover - environment-specific
140
+ try:
141
+ if gax_exceptions and isinstance(e, getattr(gax_exceptions, "NotFound", Exception)):
142
+ try:
143
+ self._publisher.create_topic(request={"name": self._topic_path})
144
+ except Exception:
145
+ pass
146
+ except Exception:
147
+ pass
148
+
149
+ def __getstate__(self) -> Dict[str, Any]:
150
+ """Make the store picklable by excluding live client objects.
151
+
152
+ Only persist lightweight configuration; clients are re-created on unpickle.
153
+ """
154
+ return {
155
+ "project_id": self.project_id,
156
+ "gcp_project": self.gcp_project,
157
+ "topic_name": self.topic_name,
158
+ "_emulator_host": getattr(self, "_emulator_host", None),
159
+ }
160
+
161
+ def __setstate__(self, state: Dict[str, Any]) -> None:
162
+ self.project_id = state.get("project_id")
163
+ self.gcp_project = state.get("gcp_project")
164
+ self.topic_name = state.get("topic_name")
165
+ self._emulator_host = state.get("_emulator_host")
166
+ # Recreate clients on the unpickling side
167
+ self._fs = None
168
+ self._publisher = None
169
+ self._topic_path = None
170
+ # Reinitialize batch collections
171
+ self._batch_writes = []
172
+ self._batch_events = []
173
+ self._batch_mode = False
174
+ self._event_batch_size = 50
175
+ self._max_pending_events = 200
176
+ self._events_lock = threading.Lock()
177
+ self.logger = logging.getLogger(__name__)
178
+ self._init_clients()
179
+
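A quick sketch of the pickling round trip: only the lightweight configuration is serialized, and fresh clients are rebuilt on the receiving side (e.g. a worker process). `store` here refers to the illustrative instance constructed earlier:

    import pickle

    restored = pickle.loads(pickle.dumps(store))
    assert restored.topic_name == store.topic_name  # config survives the round trip
    # restored._fs and restored._publisher are newly created clients,
    # re-initialized by __setstate__ via _init_clients().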
180
+ # -------------------- Batch Writing Support --------------------
181
+ @contextmanager
182
+ def batch_write_context(self):
183
+ """Context manager for batching multiple write operations."""
184
+ self._batch_mode = True
185
+ self._batch_writes.clear()
186
+ try:
187
+ yield self
188
+ finally:
189
+ self._flush_batch_writes()
190
+ self._flush_events() # Flush PubSub events when batch completes
191
+ self._batch_mode = False
192
+
193
+ def _flush_batch_writes(self) -> None:
194
+ """Flush all batched write operations to Firestore."""
195
+ if not self._batch_writes:
196
+ return
197
+
198
+ try:
199
+ batch = self._fs.batch()
200
+ for operation, args, kwargs in self._batch_writes:
201
+ if operation == "set_step_cache_record":
202
+ process_name, step_name, ih, ch, fh, record = args
203
+ doc_id = self._step_idx_doc_id(process_name, step_name, ih, ch, fh)
204
+ doc_ref = self._root.collection("step_indices").document(doc_id)
205
+ batch.set(doc_ref, record, merge=True)
206
+ elif operation == "set_process_cache_record":
207
+ process_name, ih, ch, fh, record = args
208
+ doc_id = self._proc_idx_doc_id(process_name, ih, ch, fh)
209
+ doc_ref = self._root.collection("process_indices").document(doc_id)
210
+ batch.set(doc_ref, record, merge=True)
211
+ elif operation == "record_run_step":
212
+ run_id, process_name, step_name, record = args
213
+ doc_id = f"{process_name}.{step_name}"
214
+ run_ref = self._root.collection("runs").document(run_id)
215
+ step_ref = run_ref.collection("steps").document(doc_id)
216
+ batch.set(step_ref, record, merge=True)
217
+ # Touch run document for ordering
218
+ batch.set(run_ref, {"last_updated": time.time()}, merge=True)
219
+
220
+ batch.commit()
221
+ self.logger.debug(f"Flushed {len(self._batch_writes)} batched write operations")
222
+ except Exception as e:
223
+ self.logger.warning(f"Batch write flush failed: {e}")
224
+ finally:
225
+ self._batch_writes.clear()
226
+
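A sketch of the batching context in use; the run/step names, hashes, and record payloads below are placeholders:

    with store.batch_write_context():
        store.set_step_cache_record(
            "train", "fit", "ih1", "ch1", "fh1",
            {"status": "completed", "cache_path": "gs://bucket/cache/fit"},
        )
        store.record_run_step("run-001", "train", "fit", {"status": "completed"})
    # On exit, the queued writes are committed in a single Firestore batch and
    # any pending Pub/Sub events are flushed.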
227
+ # -------------------- Helpers --------------------
228
+ def _step_idx_doc_id(self, process_name: str, step_name: str, ih: str, ch: str, fh: Optional[str]) -> str:
229
+ return f"{process_name}:{step_name}:{ih}:{ch}:{fh or 'none'}"
230
+
231
+ def _proc_idx_doc_id(self, process_name: str, ih: str, ch: str, fh: Optional[str]) -> str:
232
+ return f"{process_name}:{ih}:{ch}:{fh or 'none'}"
233
+
234
+ def _charts_index_doc(self, run_id: str):
235
+ # Compact charts index per run for UI
236
+ return self._root.collection("runs").document(run_id).collection("charts_index").document("index")
237
+
238
+
239
+
240
+ # -------------------- Cache indices --------------------
241
+ def set_step_cache_record(
242
+ self,
243
+ process_name: str,
244
+ step_name: str,
245
+ input_hash: str,
246
+ config_hash: str,
247
+ function_hash: Optional[str],
248
+ record: Dict[str, Any],
249
+ ttl_seconds: Optional[int] = None,
250
+ ) -> None:
251
+ doc_id = self._step_idx_doc_id(process_name, step_name, input_hash, config_hash, function_hash)
252
+ payload = dict(record)
253
+ if ttl_seconds:
254
+ try:
255
+ payload["expires_at"] = time.time() + int(ttl_seconds)
256
+ except Exception:
257
+ pass
258
+
259
+ # Use batching if in batch mode
260
+ if self._batch_mode:
261
+ self._batch_writes.append(("set_step_cache_record", (process_name, step_name, input_hash, config_hash, function_hash, payload), {}))
262
+ else:
263
+ self._root.collection("step_indices").document(doc_id).set(payload, merge=True)
264
+
265
+ # Batched variant to coalesce writes for step completion
266
+ def set_step_cache_record_batched(
267
+ self,
268
+ run_id: str,
269
+ process_name: str,
270
+ step_name: str,
271
+ input_hash: str,
272
+ config_hash: str,
273
+ function_hash: Optional[str],
274
+ record: Dict[str, Any],
275
+ ttl_seconds: Optional[int] = None,
276
+ ) -> None:
277
+ batch = self._fs.batch()
278
+ # Step index write
279
+ doc_id = self._step_idx_doc_id(process_name, step_name, input_hash, config_hash, function_hash)
280
+ payload = dict(record)
281
+ if ttl_seconds:
282
+ try:
283
+ payload["expires_at"] = time.time() + int(ttl_seconds)
284
+ except Exception:
285
+ pass
286
+ batch.set(self._root.collection("step_indices").document(doc_id), payload, merge=True)
287
+ batch.commit()
288
+
289
+ def get_step_cache_path(
290
+ self,
291
+ process_name: str,
292
+ step_name: str,
293
+ input_hash: Optional[str],
294
+ config_hash: Optional[str],
295
+ function_hash: Optional[str],
296
+ ) -> Optional[str]:
297
+ if not input_hash or not config_hash:
298
+ return None
299
+ doc_id = self._step_idx_doc_id(process_name, step_name, input_hash, config_hash, function_hash)
300
+ snap = self._root.collection("step_indices").document(doc_id).get()
301
+ if not snap.exists:
302
+ return None
303
+ data = snap.to_dict() or {}
304
+ if data.get("status") in ("completed", "cached") and data.get("cache_path"):
305
+ return data["cache_path"]
306
+ return None
307
+
308
+ def get_step_cache_record(
309
+ self,
310
+ process_name: str,
311
+ step_name: str,
312
+ input_hash: Optional[str],
313
+ config_hash: Optional[str],
314
+ function_hash: Optional[str],
315
+ ) -> Optional[Dict[str, Any]]:
316
+ if not input_hash or not config_hash:
317
+ return None
318
+ doc_id = self._step_idx_doc_id(process_name, step_name, input_hash, config_hash, function_hash)
319
+ snap = self._root.collection("step_indices").document(doc_id).get()
320
+ return snap.to_dict() if snap.exists else None
321
+
322
+ def set_process_cache_record(
323
+ self,
324
+ process_name: str,
325
+ input_hash: str,
326
+ config_hash: str,
327
+ function_hash: Optional[str],
328
+ record: Dict[str, Any],
329
+ ttl_seconds: Optional[int] = None,
330
+ ) -> None:
331
+ if not input_hash or not config_hash:
332
+ return
333
+
334
+ doc_id = self._proc_idx_doc_id(process_name, input_hash, config_hash, function_hash)
335
+ payload = dict(record)
336
+ if ttl_seconds:
337
+ try:
338
+ payload["expires_at"] = time.time() + int(ttl_seconds)
339
+ except Exception:
340
+ pass
341
+
342
+ # Use batching if in batch mode
343
+ if self._batch_mode:
344
+ self._batch_writes.append(("set_process_cache_record", (process_name, input_hash, config_hash, function_hash, payload), {}))
345
+ else:
346
+ self._root.collection("process_indices").document(doc_id).set(payload, merge=True)
347
+
348
+ # Batched variant for process completion
349
+ def set_process_cache_record_batched(
350
+ self,
351
+ run_id: str,
352
+ process_name: str,
353
+ input_hash: str,
354
+ config_hash: str,
355
+ function_hash: Optional[str],
356
+ record: Dict[str, Any],
357
+ ttl_seconds: Optional[int] = None,
358
+ ) -> None:
359
+ if not input_hash or not config_hash:
360
+ return
361
+
362
+ batch = self._fs.batch()
363
+ # Process index write
364
+ doc_id = self._proc_idx_doc_id(process_name, input_hash, config_hash, function_hash)
365
+ payload = dict(record)
366
+ if ttl_seconds:
367
+ try:
368
+ payload["expires_at"] = time.time() + int(ttl_seconds)
369
+ except Exception:
370
+ pass
371
+ batch.set(self._root.collection("process_indices").document(doc_id), payload, merge=True)
372
+ # Optional: include a lightweight run summary touch (last_updated)
373
+ batch.set(
374
+ self._root.collection("runs").document(run_id),
375
+ {"last_updated": time.time()},
376
+ merge=True,
377
+ )
378
+ batch.commit()
379
+
380
+ def get_process_cache_path(
381
+ self,
382
+ process_name: str,
383
+ input_hash: Optional[str],
384
+ config_hash: Optional[str],
385
+ function_hash: Optional[str],
386
+ ) -> Optional[str]:
387
+ if not input_hash or not config_hash:
388
+ return None
389
+
390
+ doc_id = self._proc_idx_doc_id(process_name, input_hash, config_hash, function_hash)
391
+
392
+ snap = self._root.collection("process_indices").document(doc_id).get()
393
+ if not snap.exists:
394
+ return None
395
+
396
+ data = snap.to_dict() or {}
397
+
398
+ # Only accept a hit when status is terminal AND cache_path is present and valid
399
+ # Failed or stale entries without cache_path should be treated as cache misses
400
+ status = data.get("status")
401
+ cache_path = data.get("cache_path")
402
+
403
+ if status in ("completed", "cached") and cache_path and isinstance(cache_path, str):
404
+ return cache_path
405
+ return None
406
+
407
+ def get_process_cache_record(
408
+ self,
409
+ process_name: str,
410
+ input_hash: Optional[str],
411
+ config_hash: Optional[str],
412
+ function_hash: Optional[str],
413
+ ) -> Optional[Dict[str, Any]]:
414
+ if not input_hash or not config_hash:
415
+ return None
416
+ doc_id = self._proc_idx_doc_id(process_name, input_hash, config_hash, function_hash)
417
+ snap = self._root.collection("process_indices").document(doc_id).get()
418
+ return snap.to_dict() if snap.exists else None
419
+
420
+ def get_process_cache_paths_batch(
421
+ self,
422
+ lookups: list[tuple[str, Optional[str], Optional[str], Optional[str]]],
423
+ ) -> dict[str, Optional[str]]:
424
+ """Batch get process cache paths via Firestore get_all for fewer RPCs.
425
+
426
+ Returns mapping from composite key "process_name|ih|ch|fh" to cache_path (or None).
427
+ """
428
+ # Build document references and composite keys in order
429
+ refs = []
430
+ composite: list[str] = []
431
+ for process_name, ih, ch, fh in lookups or []:
432
+ if not ih or not ch:
433
+ composite.append(f"{process_name}|{ih}|{ch}|{fh or 'none'}")
434
+ refs.append(None)
435
+ continue
436
+ doc_id = self._proc_idx_doc_id(process_name, ih, ch, fh)
437
+ refs.append(self._root.collection("process_indices").document(doc_id))
438
+ composite.append(f"{process_name}|{ih}|{ch}|{fh or 'none'}")
439
+
440
+ out: dict[str, Optional[str]] = {}
441
+ # Early return
442
+ if not refs:
443
+ return out
444
+ # get_all() does not guarantee that snapshots come back in the order the
+ # references were requested, so index snapshots by document path and re-map
+ # them onto the original lookup order.
+ snaps_by_path = {
+ snap.reference.path: snap
+ for snap in self._fs.get_all([r for r in refs if r is not None])
+ }
+ for comp_key, ref in zip(composite, refs):
+ if ref is None:
+ out[comp_key] = None
+ continue
+ snap = snaps_by_path.get(ref.path)
+ if snap is None or not getattr(snap, 'exists', False):
+ out[comp_key] = None
+ continue
+ data = snap.to_dict() or {}
+ if data.get("status") in ("completed", "cached") and data.get("cache_path"):
+ out[comp_key] = data.get("cache_path")
+ else:
+ out[comp_key] = None
464
+ return out
465
+
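A sketch of the batch lookup; the hash strings are placeholders and keys follow the "process|ih|ch|fh" composite format:

    paths = store.get_process_cache_paths_batch([
        ("train", "ih1", "ch1", "fh1"),
        ("evaluate", "ih2", "ch2", None),
    ])
    cache_path = paths.get("train|ih1|ch1|fh1")  # None on a cache miss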
466
+ # -------------------- Run lifecycle + metrics --------------------
467
+ def mark_pipeline_started(self, run_id: str) -> None:
468
+ run_ref = self._root.collection("runs").document(run_id)
469
+ run_ref.set({
470
+ "status": "running",
471
+ "timestamps": {"start": time.time(), "end": None},
472
+ }, merge=True)
473
+ self.publish_event({"type": "pipeline.started", "run_id": run_id, "status": "running"})
474
+
475
+ def mark_pipeline_completed(self, run_id: str, success: bool) -> None:
476
+ run_ref = self._root.collection("runs").document(run_id)
477
+ run_ref.set({
478
+ "status": "completed" if success else "failed",
479
+ "timestamps": {"end": time.time()},
480
+ }, merge=True)
481
+ self.publish_event({"type": "pipeline.completed", "run_id": run_id, "status": "completed" if success else "failed"})
482
+ # Flush events immediately on pipeline completion to ensure they're sent
483
+ self._flush_events()
484
+
485
+
486
+ def get_run_status(self, run_id: str) -> Optional[str]:
487
+ try:
488
+ snap = self._root.collection("runs").document(run_id).get()
489
+ if not snap.exists:
490
+ return None
491
+ data = snap.to_dict() or {}
492
+ status = data.get("status")
493
+ return str(status).lower() if status is not None else None
494
+ except Exception:
495
+ return None
496
+
497
+
498
+ # -------------------- Events --------------------
499
+ def publish_event(self, event: Dict[str, Any]) -> None:
500
+ """Queue event for batch publishing to reduce thread creation.
501
+
502
+ Events are batched and published when:
503
+ - Batch size reaches _event_batch_size (50)
504
+ - Pending events exceed _max_pending_events (200)
505
+ - flush_events() is called explicitly
506
+ """
507
+ try:
508
+ with self._events_lock:
509
+ self._batch_events.append(event)
510
+ batch_size = len(self._batch_events)
511
+ should_flush = batch_size >= self._max_pending_events or batch_size >= self._event_batch_size
512
+
513
+ # Flush outside lock to avoid holding lock during I/O
514
+ if should_flush:
515
+ self._flush_events()
516
+ except Exception:
517
+ # If batching fails, try immediate publish as fallback
518
+ try:
519
+ data = json.dumps(event, default=str).encode("utf-8")
520
+ self._publisher.publish(self._topic_path, data=data)
521
+ except Exception:
522
+ pass
523
+
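A sketch of the event batching behaviour; the event payload is a placeholder:

    store.publish_event({"type": "step.completed", "run_id": "run-001"})  # queued, not yet sent
    # A flush happens automatically once the batch threshold (~50 events) is hit,
    # and is forced at pipeline completion; _flush_events() can also be called
    # directly when immediate delivery matters.
    store._flush_events()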
524
+ def _flush_events(self) -> None:
525
+ """Flush batched PubSub events (thread-safe)."""
526
+ # Extract events to publish while holding lock
527
+ events_to_publish = []
528
+ try:
529
+ with self._events_lock:
530
+ if not self._batch_events or not self._publisher:
531
+ return
532
+ events_to_publish = list(self._batch_events)
533
+ self._batch_events.clear()
534
+ except Exception:
535
+ return
536
+
537
+ # Publish events outside lock to avoid holding lock during I/O
538
+ if not events_to_publish:
539
+ return
540
+
541
+ try:
542
+ # Publish all events in batch
543
+ # Add small delay between publishes to avoid overwhelming gRPC's batch manager
544
+ futures = []
545
+ for idx, event in enumerate(events_to_publish):
546
+ try:
547
+ data = json.dumps(event, default=str).encode("utf-8")
548
+ # Retry publish on KeyError (gRPC batch operation error)
549
+ max_retries = 2
550
+ for attempt in range(max_retries + 1):
551
+ try:
552
+ future = self._publisher.publish(self._topic_path, data=data)
553
+ futures.append(future)
554
+ break # Success, exit retry loop
555
+ except (KeyError, RuntimeError) as e:
556
+ if attempt < max_retries:
557
+ # KeyError can happen if gRPC's batch state is corrupted
558
+ # Retry with a small delay
559
+ time.sleep(0.01 * (attempt + 1)) # Exponential backoff
560
+ continue
561
+ else:
562
+ # Log but don't fail - individual event loss is acceptable
563
+ if isinstance(e, KeyError):
564
+ self.logger.debug(f"PubSub batch operation error after {max_retries} retries (may be transient): {e}")
565
+ raise
566
+ # Small delay between publishes to avoid overwhelming gRPC batch manager
567
+ # This reduces the chance of KeyError in gRPC's internal threads
568
+ if idx < len(events_to_publish) - 1: # Don't delay after last event
569
+ time.sleep(0.001) # 1ms delay between publishes
570
+ except Exception as e:
571
+ # Catch any other exceptions and continue
572
+ self.logger.debug(f"PubSub publish error (event skipped): {e}")
573
+ pass
574
+
575
+ # Publishes are submitted but deliberately not awaited: we skip blocking on
576
+ # the returned futures and rely on the batch settings configured above to
577
+ # keep background thread usage bounded.
578
+ self.logger.debug(f"Flushed {len(events_to_publish)} PubSub events")
579
+ except Exception as e:
580
+ # Handle KeyError from gRPC batch operations gracefully
581
+ if isinstance(e, KeyError):
582
+ self.logger.debug(f"PubSub batch flush error (may be transient): {e}")
583
+ else:
584
+ self.logger.warning(f"PubSub event flush failed: {e}")
585
+
586
+ # -------------------- Per-run step bookkeeping --------------------
587
+ def record_run_step(self, run_id: str, process_name: str, step_name: str, record: Dict[str, Any]) -> None:
588
+ # Persist per-run step record under runs/{run_id}/steps/{process}.{step}
589
+ try:
590
+ # Use batching if in batch mode
591
+ if self._batch_mode:
592
+ self._batch_writes.append(("record_run_step", (run_id, process_name, step_name, dict(record)), {}))
593
+ else:
594
+ run_ref = self._root.collection("runs").document(run_id)
595
+ doc_id = f"{process_name}.{step_name}"
596
+ run_ref.collection("steps").document(doc_id).set(dict(record), merge=True)
597
+ # Touch run document for ordering
598
+ try:
599
+ run_ref.set({"last_updated": time.time()}, merge=True)
600
+ except Exception:
601
+ pass
602
+ except Exception:
603
+ # Best-effort; ignore errors
604
+ return None
605
+
606
+ def list_run_steps(self, run_id: str) -> Dict[str, Dict[str, Any]]:
607
+ # Read directly from steps subcollection
608
+ results: Dict[str, Dict[str, Any]] = {}
609
+ steps_ref = self._root.collection("runs").document(run_id).collection("steps")
610
+ for doc in steps_ref.stream():
611
+ results[doc.id] = doc.to_dict() or {}
612
+ return results
613
+
614
+ def increment_stat(self, run_id: str, name: str, amount: int = 1) -> None:
615
+ from google.cloud.firestore_v1 import Increment # type: ignore
616
+ self._root.collection("runs").document(run_id).set({"stats": {name: Increment(amount)}}, merge=True)
617
+
618
+ def get_pipeline_stats(self, run_id: str) -> Dict[str, Any]:
619
+ snap = self._root.collection("runs").document(run_id).get()
620
+ if not snap.exists:
621
+ return {}
622
+ data = snap.to_dict() or {}
623
+ return data.get("stats", {})
624
+
625
+ # -------------------- Charts index --------------------
626
+ def record_run_chart_artifacts(self, run_id: str, chart_name: str, artifacts: list[dict[str, Any]]) -> None:
627
+ """Record chart artifacts into a compact charts_index document for the run.
628
+
629
+ Structure:
630
+ runs/{run_id}/charts_index/index -> {
631
+ charts: {
632
+ <chart_name>: {
633
+ type: "static"|"dynamic",
634
+ items: [ { title, object_path, cache_path, mime_type, size_bytes, created_at } ]
635
+ }
636
+ },
637
+ last_updated: <ts>
638
+ }
639
+ """
640
+ try:
641
+ idx_ref = self._charts_index_doc(run_id)
642
+ # Load existing charts map to avoid overwriting other entries
643
+ existing: Dict[str, Any] = {}
644
+ try:
645
+ snap = idx_ref.get()
646
+ if getattr(snap, 'exists', False):
647
+ data = snap.to_dict() or {}
648
+ if isinstance(data.get('charts'), dict):
649
+ existing = dict(data.get('charts'))
650
+ except Exception:
651
+ existing = {}
652
+ # Determine chart type if present on first artifact
653
+ chart_type = None
654
+ try:
655
+ if artifacts and isinstance(artifacts[0], dict):
656
+ ctype = artifacts[0].get("chart_type")
657
+ if isinstance(ctype, str) and ctype.strip():
658
+ chart_type = ctype.strip().lower()
659
+ except Exception:
660
+ chart_type = None
661
+ existing[chart_name] = {"type": (chart_type or "static"), "items": artifacts}
662
+ payload = {
663
+ "charts": existing,
664
+ "last_updated": time.time(),
665
+ }
666
+ idx_ref.set(payload, merge=True)
667
+ except Exception:
668
+ self.logger.debug(
669
+ "Failed to record chart artifacts (run_id=%s, chart_name=%s)",
670
+ run_id,
671
+ chart_name,
672
+ exc_info=True,
673
+ )
674
+ return None
675
+
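A sketch of recording one chart's artifacts; the item fields mirror the charts_index structure documented above, and the values are placeholders:

    store.record_run_chart_artifacts("run-001", "loss_curve", [{
        "title": "Training loss",
        "object_path": "charts/run-001/loss_curve.png",
        "cache_path": "gs://bucket/charts/run-001/loss_curve.png",
        "mime_type": "image/png",
        "size_bytes": 12345,
        "created_at": 1700000000.0,
        "chart_type": "static",
    }])
    charts = store.list_run_charts("run-001")
    # -> {"loss_curve": {"type": "static", "items": [...]}}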
676
+ def list_run_charts(self, run_id: str) -> Dict[str, Any]:
677
+ idx_ref = self._charts_index_doc(run_id)
678
+ snap = idx_ref.get()
679
+ if not snap.exists:
680
+ return {}
681
+ data = snap.to_dict() or {}
682
+ charts = data.get("charts", {})
683
+ # Ensure each entry has {type, items}
684
+ out: Dict[str, Any] = {}
685
+ if isinstance(charts, dict):
686
+ for name, val in charts.items():
687
+ if isinstance(val, dict):
688
+ ctype = val.get("type") or None
689
+ items = val.get("items") or []
690
+ out[name] = {"type": (str(ctype).lower() if isinstance(ctype, str) else "static"), "items": items}
691
+ return out
692
+
693
+ def copy_run_chart_artifacts(self, from_run_id: str, to_run_id: str, chart_name: str) -> bool:
694
+ try:
695
+ # Read chart artifacts from source run
696
+ from_idx_ref = self._charts_index_doc(from_run_id)
697
+ from_snap = from_idx_ref.get()
698
+ if not from_snap.exists:
699
+ return False
700
+
701
+ from_data = from_snap.to_dict() or {}
702
+ from_charts = from_data.get("charts", {})
703
+
704
+ # Check if the specific chart exists in source
705
+ if chart_name not in from_charts:
706
+ return False
707
+
708
+ chart_data = from_charts[chart_name]
709
+ if not isinstance(chart_data, dict):
710
+ return False
711
+
712
+ # Read existing charts from destination run
713
+ to_idx_ref = self._charts_index_doc(to_run_id)
714
+ to_snap = to_idx_ref.get()
715
+ to_data = to_snap.to_dict() if to_snap.exists else {}
716
+ to_charts = to_data.get("charts", {})
717
+
718
+ # Copy the chart data to destination
719
+ to_charts[chart_name] = chart_data
720
+
721
+ # Write back to destination run
722
+ to_idx_ref.set({
723
+ "charts": to_charts,
724
+ "last_updated": time.time()
725
+ }, merge=True)
726
+ return True
727
+
728
+ except Exception:
729
+ self.logger.debug("copy_run_chart_artifacts failed", exc_info=True)
730
+ return False
731
+
732
+ # -------------------- Probe metrics --------------------
733
+ def save_probe_metrics_by_path(self, run_id: str, probe_path: str, metrics: Dict[str, Any]) -> None:
734
+ """Store metrics under metric/{run_id}/probes_by_path/{encoded_path}."""
735
+ encoded = encode_probe_path(probe_path)
736
+ metric_ref = (
737
+ self._root.collection("metric").document(run_id).collection("probes_by_path").document(encoded)
738
+ )
739
+ payload = dict(metrics)
740
+ metric_ref.set(payload, merge=True)
741
+ try:
742
+ self.publish_event({
743
+ "type": "probe_metrics.updated",
744
+ "run_id": run_id,
745
+ "probe_path": probe_path,
746
+ "metrics": metrics,
747
+ })
748
+ except Exception:
749
+ pass
750
+
751
+ def get_probe_metrics_by_path(self, run_id: str, probe_path: str) -> Dict[str, Any]:
752
+ encoded = encode_probe_path(probe_path)
753
+ ref = self._root.collection("metric").document(run_id).collection("probes_by_path").document(encoded)
754
+ snap = ref.get()
755
+ if not snap.exists:
756
+ return {}
757
+ data = snap.to_dict() or {}
758
+ data.pop("updated_at", None)
759
+ return data
760
+
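A sketch of the probe-metrics round trip; the probe path and metric names are placeholders:

    store.save_probe_metrics_by_path(
        "run-001", "train/fit/accuracy_probe", {"accuracy": 0.93, "n_samples": 1200}
    )
    metrics = store.get_probe_metrics_by_path("run-001", "train/fit/accuracy_probe")
    # -> {"accuracy": 0.93, "n_samples": 1200}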
761
+
762
+ def list_runs(self, limit: int = 100) -> list[str]:
763
+ """List recent run IDs from Firestore for this project namespace.
764
+
765
+ Tries to order by 'last_updated' desc if present; otherwise returns up to `limit` docs.
766
+ """
767
+ try:
768
+ runs_col = self._root.collection("runs")
769
+ try:
770
+ # Prefer ordering by last_updated if available
771
+ docs = list(runs_col.order_by("last_updated", direction=firestore.Query.DESCENDING).limit(limit).stream()) # type: ignore[attr-defined]
772
+ except Exception:
773
+ docs = list(runs_col.limit(limit).stream())
774
+ return [d.id for d in docs]
775
+ except Exception:
776
+ return []
777
+
778
+