expops-0.1.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. expops-0.1.3.dist-info/METADATA +826 -0
  2. expops-0.1.3.dist-info/RECORD +86 -0
  3. expops-0.1.3.dist-info/WHEEL +5 -0
  4. expops-0.1.3.dist-info/entry_points.txt +3 -0
  5. expops-0.1.3.dist-info/licenses/LICENSE +674 -0
  6. expops-0.1.3.dist-info/top_level.txt +1 -0
  7. mlops/__init__.py +0 -0
  8. mlops/__main__.py +11 -0
  9. mlops/_version.py +34 -0
  10. mlops/adapters/__init__.py +12 -0
  11. mlops/adapters/base.py +86 -0
  12. mlops/adapters/config_schema.py +89 -0
  13. mlops/adapters/custom/__init__.py +3 -0
  14. mlops/adapters/custom/custom_adapter.py +447 -0
  15. mlops/adapters/plugin_manager.py +113 -0
  16. mlops/adapters/sklearn/__init__.py +3 -0
  17. mlops/adapters/sklearn/adapter.py +94 -0
  18. mlops/cluster/__init__.py +3 -0
  19. mlops/cluster/controller.py +496 -0
  20. mlops/cluster/process_runner.py +91 -0
  21. mlops/cluster/providers.py +258 -0
  22. mlops/core/__init__.py +95 -0
  23. mlops/core/custom_model_base.py +38 -0
  24. mlops/core/dask_networkx_executor.py +1265 -0
  25. mlops/core/executor_worker.py +1239 -0
  26. mlops/core/experiment_tracker.py +81 -0
  27. mlops/core/graph_types.py +64 -0
  28. mlops/core/networkx_parser.py +135 -0
  29. mlops/core/payload_spill.py +278 -0
  30. mlops/core/pipeline_utils.py +162 -0
  31. mlops/core/process_hashing.py +216 -0
  32. mlops/core/step_state_manager.py +1298 -0
  33. mlops/core/step_system.py +956 -0
  34. mlops/core/workspace.py +99 -0
  35. mlops/environment/__init__.py +10 -0
  36. mlops/environment/base.py +43 -0
  37. mlops/environment/conda_manager.py +307 -0
  38. mlops/environment/factory.py +70 -0
  39. mlops/environment/pyenv_manager.py +146 -0
  40. mlops/environment/setup_env.py +31 -0
  41. mlops/environment/system_manager.py +66 -0
  42. mlops/environment/utils.py +105 -0
  43. mlops/environment/venv_manager.py +134 -0
  44. mlops/main.py +527 -0
  45. mlops/managers/project_manager.py +400 -0
  46. mlops/managers/reproducibility_manager.py +575 -0
  47. mlops/platform.py +996 -0
  48. mlops/reporting/__init__.py +16 -0
  49. mlops/reporting/context.py +187 -0
  50. mlops/reporting/entrypoint.py +292 -0
  51. mlops/reporting/kv_utils.py +77 -0
  52. mlops/reporting/registry.py +50 -0
  53. mlops/runtime/__init__.py +9 -0
  54. mlops/runtime/context.py +34 -0
  55. mlops/runtime/env_export.py +113 -0
  56. mlops/storage/__init__.py +12 -0
  57. mlops/storage/adapters/__init__.py +9 -0
  58. mlops/storage/adapters/gcp_kv_store.py +778 -0
  59. mlops/storage/adapters/gcs_object_store.py +96 -0
  60. mlops/storage/adapters/memory_store.py +240 -0
  61. mlops/storage/adapters/redis_store.py +438 -0
  62. mlops/storage/factory.py +199 -0
  63. mlops/storage/interfaces/__init__.py +6 -0
  64. mlops/storage/interfaces/kv_store.py +118 -0
  65. mlops/storage/path_utils.py +38 -0
  66. mlops/templates/premier-league/charts/plot_metrics.js +70 -0
  67. mlops/templates/premier-league/charts/plot_metrics.py +145 -0
  68. mlops/templates/premier-league/charts/requirements.txt +6 -0
  69. mlops/templates/premier-league/configs/cluster_config.yaml +13 -0
  70. mlops/templates/premier-league/configs/project_config.yaml +207 -0
  71. mlops/templates/premier-league/data/England CSV.csv +12154 -0
  72. mlops/templates/premier-league/models/premier_league_model.py +638 -0
  73. mlops/templates/premier-league/requirements.txt +8 -0
  74. mlops/templates/sklearn-basic/README.md +22 -0
  75. mlops/templates/sklearn-basic/charts/plot_metrics.py +85 -0
  76. mlops/templates/sklearn-basic/charts/requirements.txt +3 -0
  77. mlops/templates/sklearn-basic/configs/project_config.yaml +64 -0
  78. mlops/templates/sklearn-basic/data/train.csv +14 -0
  79. mlops/templates/sklearn-basic/models/model.py +62 -0
  80. mlops/templates/sklearn-basic/requirements.txt +10 -0
  81. mlops/web/__init__.py +3 -0
  82. mlops/web/server.py +585 -0
  83. mlops/web/ui/index.html +52 -0
  84. mlops/web/ui/mlops-charts.js +357 -0
  85. mlops/web/ui/script.js +1244 -0
  86. mlops/web/ui/styles.css +248 -0
mlops/core/step_state_manager.py
@@ -0,0 +1,1298 @@
from typing import Dict, List, Optional, Any
import json
import hashlib
import io
from pathlib import Path
from datetime import datetime
import logging
from dataclasses import dataclass
import numpy as np
import joblib
import time
import inspect
import ast

from mlops.storage.interfaces.kv_store import KeyValueEventStore, ObjectStore
from mlops.storage.path_utils import decode_probe_path


@dataclass
class StepExecutionResult:
    """Result of executing a single step."""
    step_name: str
    success: bool
    result: Optional[Dict[str, Any]] = None
    error: Optional[str] = None
    execution_time: float = 0.0
    timestamp: str = ""


@dataclass
class ProcessExecutionResult:
    """Result of executing a single process."""
    process_name: str
    success: bool
    result: Optional[Dict[str, Any]] = None
    error: Optional[str] = None
    execution_time: float = 0.0
    timestamp: str = ""


class StepStateManager:
    """State manager for step-based pipeline execution with caching."""

    def __init__(self, cache_dir: Path, kv_store: KeyValueEventStore, logger: Optional[logging.Logger] = None,
                 cache_ttl_hours: Optional[int] = None, object_store: Optional[ObjectStore] = None,
                 object_prefix: Optional[str] = None):
        self.logger = logger or logging.getLogger(__name__)
        self.cache_dir = cache_dir

        self.kv_store = kv_store
        self.redis_ttl_seconds = int((cache_ttl_hours or 24) * 3600)
        self.object_store = object_store
        self.object_prefix = object_prefix.strip("/") if isinstance(object_prefix, str) else None

    def _safe_proc(self, name: Optional[str]) -> str:
        """Return a filesystem-safe process identifier."""
        return (name or 'no_process').replace('/', '_')

    def _stable_step_filename(
        self,
        process_name: Optional[str],
        step_name: str,
        input_hash: Optional[str],
        config_hash: Optional[str],
        function_hash: Optional[str],
    ) -> Optional[str]:
        if not input_hash or not config_hash:
            return None
        return f"stable_{self._safe_proc(process_name)}_{step_name}_{input_hash}_{config_hash}_{(function_hash or 'none')}.pkl"

    def _stable_process_filename(
        self,
        process_name: str,
        input_hash: Optional[str],
        config_hash: Optional[str],
        function_hash: Optional[str],
    ) -> Optional[str]:
        if not input_hash or not config_hash:
            return None
        return f"stable_process__{self._safe_proc(process_name)}_{input_hash}_{config_hash}_{(function_hash or 'none')}.pkl"

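    # Illustrative note (not part of the original module): the stable filenames
    # above key a cache entry by process, step, and the three hashes, so a re-run
    # with identical inputs/config/code maps to the same artifact. With
    # hypothetical hash values "abc", "def", "123":
    #   mgr._stable_step_filename("train", "fit", "abc", "def", "123")
    #     -> "stable_train_fit_abc_def_123.pkl"
    #   mgr._stable_process_filename("train", "abc", "def", None)
    #     -> "stable_process__train_abc_def_none.pkl"
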
    def _build_object_uri(self, filename: str) -> str:
        """Build an object store URI honoring the optional prefix."""
        return self.object_store.build_uri(*filter(None, [self.object_prefix, filename]))

    def _format_probe_path(
        self,
        process_name: Optional[str],
        step_name: Optional[str],
        input_hash: Optional[str] = None,
        config_hash: Optional[str] = None,
        function_hash: Optional[str] = None,
    ) -> str:
        """Clean, human-readable path string for charts.

        Returns simple process or process/step paths without hash suffixes.
        Since metrics are now cached directly, we don't need hash disambiguation.
        """
        if step_name is None:
            # Process-level path
            return str(process_name or "no_process")
        else:
            # Step-level path
            return f"{process_name or 'no_process'}/{step_name}"

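    # Illustrative note (not part of the original module): probe paths are plain
    # strings that chart configurations can match on, e.g.
    #   mgr._format_probe_path("train", None)        -> "train"
    #   mgr._format_probe_path("train", "evaluate")  -> "train/evaluate"
    #   mgr._format_probe_path(None, "load")         -> "no_process/load"
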
    def _append_probe_metrics(self, run_id: str, probe_id: str, new_metrics: Dict[str, Any], path_key: str, step: int = 0) -> None:
        """Append numeric metrics as step-indexed dictionaries under metric/{run_id}/probes/{probe_id}.

        New behavior (MLflow-style):
        - Numeric values -> stored as {step_number: value} dictionaries
        - Non-numeric values -> last snapshot stored directly under the metric name
        - step=0 is reserved for auto-logged metrics (from process/step returns)
        - step>=1 for manual log_metric() calls
        """
        try:
            try:
                self.logger.info(f"[Metrics] Append begin -> run_id={run_id}, path_key={path_key}, step={step}, keys={list((new_metrics or {}).keys())}")
            except Exception:
                pass
            # Attempt to read existing metrics for this probe and append
            existing = {}
            try:
                existing = self.kv_store.get_probe_metrics_by_path(run_id, path_key) or {}
            except Exception:
                existing = {}
            updated: Dict[str, Any] = dict(existing) if isinstance(existing, dict) else {}

            def _to_firestore_safe(obj: Any) -> Any:
                """Convert values to Firestore-safe types.

                - Dict keys must be strings
                - Convert numpy scalars to native Python
                - Recurse through lists/tuples/dicts
                """
                try:
                    import numpy as _np  # type: ignore
                except Exception:
                    _np = None  # type: ignore
                # Primitive JSON-safe types
                if obj is None or isinstance(obj, (bool, int, float, str)):
                    return obj
                # Numpy scalar types -> Python native
                if _np is not None and isinstance(obj, (_np.integer, _np.floating)):
                    try:
                        return float(obj) if isinstance(obj, _np.floating) else int(obj)
                    except Exception:
                        return obj.item()  # type: ignore[attr-defined]
                # Lists/tuples -> list of safe values
                if isinstance(obj, (list, tuple)):
                    return [_to_firestore_safe(x) for x in obj]
                # Dicts -> string keys and safe values
                if isinstance(obj, dict):
                    out = {}
                    for k, v in obj.items():
                        try:
                            out[str(k)] = _to_firestore_safe(v)
                        except Exception:
                            # Best-effort: stringify both key and value
                            out[str(k)] = str(v)
                    return out
                # Fallback: stringify
                try:
                    return str(obj)
                except Exception:
                    return obj

            for mname, mval in (new_metrics or {}).items():
                try:
                    if isinstance(mval, (int, float)):
                        # Get the existing metric dict (or create a new one)
                        metric_dict = updated.get(mname) or {}
                        # Handle the legacy list format - convert it to a dict
                        if isinstance(metric_dict, list):
                            # Convert the old list format to a dict (use indices as steps).
                            # IMPORTANT: keys must be strings for Firestore compatibility.
                            metric_dict = {str(i): v for i, v in enumerate(metric_dict)}
                        elif not isinstance(metric_dict, dict):
                            metric_dict = {}
                        # Add the new value at the specified step.
                        # IMPORTANT: convert the step to a string for Firestore compatibility.
                        metric_dict[str(step)] = float(mval)
                        updated[mname] = metric_dict
                    else:
                        # Store the non-numeric snapshot directly under the metric name.
                        # Ensure the payload is Firestore-safe (string keys, JSON-serializable).
                        safe_val = _to_firestore_safe(mval)
                        updated[mname] = safe_val
                except Exception:
                    continue
            try:
                self.logger.info(f"[Metrics] Saving metrics -> run_id={run_id}, path_key={path_key}, keys={list(updated.keys())}")
            except Exception:
                pass
            self.kv_store.save_probe_metrics_by_path(run_id, path_key, updated)
        except Exception as e:
            self.logger.warning(f"Failed to append probe metrics for {probe_id}: {e}")

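    # Illustrative sketch (not part of the original module; run id, path, and
    # metric names are hypothetical): two appends to the same path key produce
    # an MLflow-style step-indexed document:
    #   mgr._append_probe_metrics("run-1", "train/fit", {"loss": 0.9}, "train/fit", step=0)
    #   mgr._append_probe_metrics("run-1", "train/fit", {"loss": 0.7, "note": "warm start"}, "train/fit", step=1)
    #   # kv_store now holds: {"loss": {"0": 0.9, "1": 0.7}, "note": "warm start"}
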
    def log_metric(self, run_id: str, process_name: Optional[str], step_name: Optional[str],
                   metric_name: str, value: Any, step: Optional[int] = None) -> None:
        """Manually log a metric with a step number (MLflow-style).

        Args:
            run_id: Current run ID
            process_name: Process name (None maps to "no_process")
            step_name: Step name (None for process-level metrics)
            metric_name: Name of the metric
            value: Metric value
            step: Step number (if None, auto-increments from the largest existing step)
        """
        try:
            # Compute the path for this process/step
            path_key = self._format_probe_path(process_name, step_name)

            # Get existing metrics to determine the next step if needed
            if step is None:
                existing = self.kv_store.get_probe_metrics_by_path(run_id, path_key) or {}
                metric_dict = existing.get(metric_name)
                if isinstance(metric_dict, dict) and metric_dict:
                    try:
                        max_step = max(int(k) for k in metric_dict.keys())
                        step = max_step + 1
                    except (ValueError, TypeError):
                        step = 1
                else:
                    # No existing data, start at 1
                    step = 1

            # Log the metric
            self._append_probe_metrics(run_id, path_key, {metric_name: value}, path_key, step=step)
        except Exception as e:
            self.logger.warning(f"Failed to log metric {metric_name}: {e}")

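    # Illustrative usage (not part of the original module; names are hypothetical):
    #   mgr.log_metric("run-1", "train", "fit", "accuracy", 0.91)   # stored at step 1
    #   mgr.log_metric("run-1", "train", "fit", "accuracy", 0.93)   # auto-increments to step 2
    #   mgr.log_metric("run-1", "train", None, "rows", 12154, step=5)  # explicit step, process-level
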
    def _get_cache_path(self, run_id: str, step_name: str, process_name: Optional[str] = None) -> Path:
        # Include process_name to avoid collisions across processes
        safe_proc = (process_name or "no_process").replace("/", "_")
        return self.cache_dir / f"{run_id}_{safe_proc}_{step_name}.pkl"

    def _get_process_cache_path(self, run_id: str, process_name: str) -> Path:
        safe_proc = (process_name or "no_process").replace("/", "_")
        return self.cache_dir / f"{run_id}__process__{safe_proc}.pkl"

    def _get_stable_step_cache_path(
        self,
        step_name: str,
        process_name: Optional[str],
        input_hash: Optional[str],
        config_hash: Optional[str],
        function_hash: Optional[str],
    ) -> Optional[Path]:
        """Deterministic, cross-run cache file path for a step based on hashes.

        Returns None if required hashes are missing.
        """
        if not input_hash or not config_hash:
            return None
        safe_proc = (process_name or "no_process").replace("/", "_")
        fhash = function_hash or "none"
        return self.cache_dir / f"stable_{safe_proc}_{step_name}_{input_hash}_{config_hash}_{fhash}.pkl"

    def _get_stable_process_cache_path(
        self,
        process_name: str,
        input_hash: Optional[str],
        config_hash: Optional[str],
        function_hash: Optional[str],
    ) -> Optional[Path]:
        """Deterministic, cross-run cache file path for a process based on hashes.

        Returns None if required hashes are missing.
        """
        if not input_hash or not config_hash:
            return None
        safe_proc = (process_name or "no_process").replace("/", "_")
        fhash = function_hash or "none"
        return self.cache_dir / f"stable_process__{safe_proc}_{input_hash}_{config_hash}_{fhash}.pkl"

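    # Illustrative note (not part of the original module): per-run paths embed
    # the run id and are unique to one run, while "stable" paths depend only on
    # the hashes and are shared across runs, which is what makes cross-run cache
    # hits possible. With hypothetical values:
    #   mgr._get_cache_path("run-1", "fit", "train")
    #     -> cache_dir / "run-1_train_fit.pkl"
    #   mgr._get_stable_step_cache_path("fit", "train", "abc", "def", None)
    #     -> cache_dir / "stable_train_fit_abc_def_none.pkl"
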
    def _compute_hash(self, obj: Any) -> str:
        """Compute cryptographically secure SHA-256 hash of any object.

        Uses canonical JSON serialization with custom handling for common non-JSON
        types to ensure determinism and minimize collision risk.
        """
        try:
            def _to_canonical(o: Any) -> Any:
                # Primitive JSON types pass through
                if o is None or isinstance(o, (str, int, float, bool)):
                    return o
                # Paths -> string
                if isinstance(o, Path):
                    return str(o)
                # Datetime -> ISO 8601
                if isinstance(o, datetime):
                    return {"__datetime__": True, "iso": o.isoformat()}
                # Bytes-like -> SHA-256 digest to avoid bloating payloads
                if isinstance(o, (bytes, bytearray, memoryview)):
                    bh = hashlib.sha256()
                    bh.update(bytes(o))
                    return {"__bytes__": True, "sha256": bh.hexdigest()}
                # NumPy arrays -> digest over shape|dtype|data
                if isinstance(o, np.ndarray):
                    import os as _os
                    _prev_omp = _os.environ.get('OMP_NUM_THREADS')
                    try:
                        _os.environ['OMP_NUM_THREADS'] = '1'
                        ah = hashlib.sha256()
                        ah.update(b"ndarray|")
                        ah.update(str(o.shape).encode("utf-8"))
                        ah.update(b"|")
                        ah.update(str(o.dtype).encode("utf-8"))
                        ah.update(b"|")
                        ah.update(o.tobytes())
                        return {"__ndarray__": True, "sha256": ah.hexdigest()}
                    finally:
                        if _prev_omp is not None:
                            _os.environ['OMP_NUM_THREADS'] = _prev_omp
                        else:
                            _os.environ.pop('OMP_NUM_THREADS', None)
                # Mappings -> dict with stringified keys, recursively canonicalized, sorted by key
                if isinstance(o, dict):
                    return {str(k): _to_canonical(v) for k, v in sorted(o.items(), key=lambda kv: str(kv[0]))}
                # Sequences -> list of canonicalized items
                if isinstance(o, (list, tuple)):
                    return [_to_canonical(x) for x in o]
                # Sets -> sorted list to make order deterministic
                if isinstance(o, (set, frozenset)):
                    return sorted([_to_canonical(x) for x in o], key=lambda x: json.dumps(x, sort_keys=True, separators=(",", ":")))
                # Fallback: use repr for a stable textual form
                return {"__repr__": True, "type": type(o).__name__, "value": repr(o)}

            canonical = _to_canonical(obj)
            payload = json.dumps(canonical, sort_keys=True, separators=(",", ":"), ensure_ascii=False).encode("utf-8")
            return hashlib.sha256(payload).hexdigest()
        except Exception as e:
            self.logger.warning(f"Failed to compute hash for {type(obj)}: {e}")
            return hashlib.sha256(f"{type(obj).__name__}:{datetime.now()}".encode()).hexdigest()

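    # Illustrative property (not part of the original module): canonicalization
    # sorts dict keys before hashing, so logically equal configs collide on
    # purpose:
    #   mgr._compute_hash({"lr": 0.1, "epochs": 5}) == mgr._compute_hash({"epochs": 5, "lr": 0.1})
    # while any change to a value yields a different SHA-256 hex digest.
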
    def _compute_function_hash(self, func: callable) -> str:
        """Compute a hash of a function's source code and signature only."""
        try:
            try:
                source = inspect.getsource(func)
            except (OSError, TypeError):
                source = f"{func.__module__}.{func.__qualname__}" if hasattr(func, '__qualname__') else str(func)

            try:
                sig = str(inspect.signature(func))
            except (ValueError, TypeError):
                sig = ""

            try:
                tree = ast.parse(source)
                normalized = ast.dump(tree)
            except Exception:
                normalized = ' '.join(source.split())

            return hashlib.sha256(f"{normalized}|{sig}".encode()).hexdigest()

        except Exception as e:
            self.logger.warning(f"Failed to compute function hash for {func}: {e}")
            # Fallback hash based on the function's string form
            return hashlib.sha256(str(func).encode()).hexdigest()

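    # Illustrative property (not part of the original module): for top-level
    # functions, the source is normalized via ast.parse/ast.dump, so
    # formatting-only edits (extra spaces, comments) leave the hash unchanged,
    # while renames or logic changes alter the AST and therefore the hash.
    # E.g. "def f(x): return x+1" and "def f(x):\n    return x + 1" hash
    # identically; "def f(x): return x+2" does not.
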
    def start_pipeline_execution(self, run_id: str, config: Dict[str, Any], cache_enabled: bool = True) -> None:
        """Record pipeline start."""
        try:
            self.kv_store.mark_pipeline_started(run_id)
        except Exception as e:
            self.logger.warning(f"kv_store pipeline start failed: {e}")

    def record_step_started(
        self,
        run_id: str,
        process_name: Optional[str],
        step_name: str,
    ) -> None:
        """Record that a step has started running for the given run.

        Writes a per-run step record so the web UI can show the start time and
        live elapsed time.
        """
        try:
            record = {
                "status": "running",
                "started_at": time.time(),
                "execution_time": 0.0,  # initialized to 0, updated on completion
                "step_name": step_name,
                "process_name": process_name or "no_process",
            }
            self.kv_store.record_run_step(run_id, process_name or "no_process", step_name, record)
            self.kv_store.publish_event({
                "type": "step.started",
                "process": process_name or "no_process",
                "step": step_name,
                "status": "running",
            })
        except Exception as e:
            self.logger.warning(f"kv_store step started record failed: {e}")

    def record_process_started(
        self,
        run_id: str,
        process_name: str,
        input_hash: Optional[str] = None,
        config_hash: Optional[str] = None,
        function_hash: Optional[str] = None,
        started_at: Optional[float] = None,
        enable_logging: bool = True,
    ) -> None:
        """Record that a process has started running for the given run.

        Writes a per-run process record under the special step name "__process__"
        so the web UI can display the start time and live elapsed time. If strict
        hashes are available, best-effort mark the process index as running as well.
        """
        try:
            # Use the provided started_at (captured at timing start) if available
            _started = float(started_at) if isinstance(started_at, (int, float)) else time.time()
            record = {
                "status": "running",
                "started_at": _started,
                "execution_time": 0.0,  # initialized to 0, updated on completion
                "process_name": process_name,
            }
            # Per-run process record for the UI
            try:
                self.kv_store.record_run_step(run_id, process_name, "__process__", dict(record))
            except Exception:
                pass
            # Best-effort: reflect the running state in the process index when hashes are available
            if input_hash and config_hash:
                try:
                    # Avoid overriding an existing terminal cache record that already has a cache_path
                    existing = self.kv_store.get_process_cache_record(
                        process_name,
                        input_hash or "",
                        config_hash or "",
                        function_hash or None,
                    )
                    should_write_running = not (isinstance(existing, dict) and existing.get("status") in ("completed", "cached") and existing.get("cache_path"))
                    if should_write_running:
                        if hasattr(self.kv_store, "set_process_cache_record_batched") and callable(getattr(self.kv_store, "set_process_cache_record_batched")):
                            getattr(self.kv_store, "set_process_cache_record_batched")(  # type: ignore
                                run_id,
                                process_name,
                                input_hash or "",
                                config_hash or "",
                                function_hash or None,
                                record,
                                ttl_seconds=self.redis_ttl_seconds,
                            )
                        else:
                            self.kv_store.set_process_cache_record(
                                process_name,
                                input_hash or "",
                                config_hash or "",
                                function_hash or None,
                                record,
                                ttl_seconds=self.redis_ttl_seconds,
                            )
                except Exception:
                    pass
            self.kv_store.publish_event({
                "type": "process.started",
                "process": process_name,
                "status": "running",
            })
        except Exception as e:
            self.logger.warning(f"kv_store process started record failed: {e}")

    def record_step_completion(
        self,
        run_id: str,
        step_result: StepExecutionResult,
        input_hash: Optional[str] = None,
        config_hash: Optional[str] = None,
        function_name: Optional[str] = None,
        function_hash: Optional[str] = None,
        was_cached: bool = False,
        process_name: Optional[str] = None,
        enable_logging: bool = True,
        cached_run_id: Optional[str] = None,
        cached_started_at: Optional[float] = None,
        cached_ended_at: Optional[float] = None,
        cached_execution_time: Optional[float] = None,
    ) -> None:
        """Record step completion with hash-based caching, keyed by the input,
        config, and function hashes plus the process name.

        Args:
            enable_logging: If True, create probes for metrics. If False, skip probe creation.
        """
        step_name = step_result.step_name
        cache_path = None

        if step_result.success and step_result.result and not was_cached:
            try:
                # Prefer the object store when configured; otherwise cache locally (absolute path)
                if self.object_store:
                    import tempfile, os as _os
                    # Write to a temporary file to avoid large in-memory buffers
                    with tempfile.NamedTemporaryFile(suffix=".pkl", delete=False) as _tmpf:
                        tmp_path = _tmpf.name
                    try:
                        joblib.dump(step_result.result, tmp_path)
                        fname = self._stable_step_filename(process_name, step_name, input_hash, config_hash, function_hash) \
                            if (input_hash and config_hash) else f"{run_id}_{self._safe_proc(process_name)}_{step_name}.pkl"
                        cache_uri = self._build_object_uri(fname)
                        if hasattr(self.object_store, 'put_file'):
                            self.object_store.put_file(cache_uri, tmp_path, content_type="application/octet-stream")
                        else:
                            with open(tmp_path, 'rb') as _f:
                                self.object_store.put_bytes(cache_uri, _f.read(), content_type="application/octet-stream")
                        cache_path = cache_uri
                    finally:
                        try:
                            _os.remove(tmp_path)
                        except Exception:
                            pass
                else:
                    # Local filesystem fallback with the absolute path stored in KV
                    if input_hash and config_hash:
                        local_path = self._get_stable_step_cache_path(step_name, process_name, input_hash, config_hash, function_hash)
                    else:
                        local_path = self._get_cache_path(run_id, step_name, process_name)
                    if local_path is not None:
                        local_path.parent.mkdir(parents=True, exist_ok=True)
                        joblib.dump(step_result.result, local_path)
                        cache_path = str(local_path.resolve())
            except Exception as e:
                self.logger.warning(f"Failed to cache step {step_name}: {e}")

        status = "cached" if was_cached else ("completed" if step_result.success else "failed")
        try:
            # For cached steps, use the original timing from the cache
            if was_cached and cached_started_at is not None and cached_ended_at is not None and cached_execution_time is not None:
                record = {
                    "status": status,
                    "execution_time": cached_execution_time,
                    "ended_at": cached_ended_at,
                    "cache_path": cache_path,
                    "step_name": step_name,
                    "process_name": process_name or "no_process",
                    "run_id": run_id,
                    "started_at": cached_started_at,
                    "cached_run_id": cached_run_id,
                }
            else:
                # For non-cached steps, use the current run timing.
                # Convert ISO timestamps to Unix timestamps for consistency with record_step_started.
                timestamp = step_result.timestamp
                if isinstance(timestamp, str):
                    try:
                        from datetime import datetime
                        timestamp = datetime.fromisoformat(timestamp).timestamp()
                    except Exception:
                        timestamp = time.time()
                elif not isinstance(timestamp, (int, float)):
                    timestamp = time.time()
                # Preserve started_at if a prior running record exists so the UI can compute duration
                started_at_existing = None
                try:
                    prev = self.kv_store.list_run_steps(run_id)
                    if isinstance(prev, dict):
                        key = f"{process_name or 'no_process'}.{step_name}"
                        started_at_existing = (prev.get(key) or {}).get("started_at")
                except Exception:
                    started_at_existing = None

                record = {
                    "status": status,
                    "execution_time": step_result.execution_time,
                    "ended_at": timestamp,
                    "cache_path": cache_path,
                    "step_name": step_name,
                    "process_name": process_name or "no_process",
                    "run_id": run_id,
                }
                # If we have an existing started_at, include it; otherwise derive it from ended_at - execution_time
                try:
                    if started_at_existing is not None:
                        record["started_at"] = started_at_existing
                    elif isinstance(step_result.execution_time, (int, float)) and step_result.execution_time >= 0:
                        record["started_at"] = float(record["ended_at"]) - float(step_result.execution_time)
                except Exception:
                    pass
            # Note: metrics are no longer auto-logged from step results.
            # Users must explicitly call log_metric() to log metrics.
            # No probe_id bookkeeping is needed.
            # Prefer a batched write when the backend supports it (e.g., Firestore)
            if hasattr(self.kv_store, "set_step_cache_record_batched") and callable(getattr(self.kv_store, "set_step_cache_record_batched")):
                try:
                    getattr(self.kv_store, "set_step_cache_record_batched")(  # type: ignore
                        run_id,
                        process_name or "no_process",
                        step_name,
                        input_hash or "",
                        config_hash or "",
                        function_hash or None,
                        record,
                        ttl_seconds=self.redis_ttl_seconds,
                    )
                except Exception:
                    # Fall back to the non-batched write on error
                    self.kv_store.set_step_cache_record(
                        process_name or "no_process",
                        step_name,
                        input_hash or "",
                        config_hash or "",
                        function_hash or None,
                        record,
                        ttl_seconds=self.redis_ttl_seconds,
                    )
            else:
                self.kv_store.set_step_cache_record(
                    process_name or "no_process",
                    step_name,
                    input_hash or "",
                    config_hash or "",
                    function_hash or None,
                    record,
                    ttl_seconds=self.redis_ttl_seconds,
                )
            # Stats for cache hit/miss
            if status == "cached":
                self.kv_store.increment_stat(run_id, "cache_hit_count", 1)
            # Per-run step record for the UI
            try:
                self.kv_store.record_run_step(run_id, process_name or "no_process", step_name, dict(record))
            except Exception as record_err:
                self.logger.warning(f"❌ Failed to record step completion for {step_name} in process {process_name} run {run_id}: {record_err}")
                import traceback
                self.logger.debug(f"Traceback: {traceback.format_exc()}")
            self.kv_store.publish_event({
                "type": "step.completed",
                "process": process_name or "no_process",
                "step": step_name,
                "status": status,
            })
        except Exception as e:
            self.logger.warning(f"kv_store step index/event failed: {e}")

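    # Illustrative note (not part of the original module): when no prior
    # "running" record exists, the start time is derived rather than dropped.
    # E.g. a step recorded with ended_at=1700000010.0 and execution_time=4.5
    # gets started_at=1700000005.5, so the UI can still render a duration.
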
    def record_process_completion(
        self,
        run_id: str,
        process_result: ProcessExecutionResult,
        input_hash: Optional[str] = None,
        config_hash: Optional[str] = None,
        function_hash: Optional[str] = None,
        was_cached: bool = False,
        enable_logging: bool = True,
        cached_run_id: Optional[str] = None,
        cached_started_at: Optional[float] = None,
        cached_ended_at: Optional[float] = None,
        cached_execution_time: Optional[float] = None,
    ) -> None:
        """Record process completion and cache the combined process result.

        Args:
            enable_logging: If True, create probes for metrics. If False, skip probe creation.
            cached_run_id: Run ID of the original cached execution (for was_cached=True)
            cached_started_at: Start timestamp from the original cached execution
            cached_ended_at: End timestamp from the original cached execution
            cached_execution_time: Execution time from the original cached execution
        """
        process_name = process_result.process_name
        cache_path = None

        # When we have a fresh result (not from cache), persist the artifact.
        if process_result.success and process_result.result and not was_cached:
            try:
                if self.object_store:
                    import tempfile, os as _os
                    # Write to a temporary file to avoid large in-memory buffers
                    with tempfile.NamedTemporaryFile(suffix=".pkl", delete=False) as _tmpf:
                        tmp_path = _tmpf.name
                    try:
                        joblib.dump(process_result.result, tmp_path)
                        fname = self._stable_process_filename(process_name, input_hash, config_hash, function_hash) \
                            if (input_hash and config_hash) else f"{run_id}__process__{self._safe_proc(process_name)}.pkl"
                        cache_uri = self._build_object_uri(fname)
                        if hasattr(self.object_store, 'put_file'):
                            self.object_store.put_file(cache_uri, tmp_path, content_type="application/octet-stream")
                        else:
                            with open(tmp_path, 'rb') as _f:
                                self.object_store.put_bytes(cache_uri, _f.read(), content_type="application/octet-stream")
                        cache_path = cache_uri
                    finally:
                        try:
                            _os.remove(tmp_path)
                        except Exception:
                            pass
                else:
                    # Local filesystem fallback with the absolute path stored in KV
                    if input_hash and config_hash:
                        local_path = self._get_stable_process_cache_path(process_name, input_hash, config_hash, function_hash)
                    else:
                        local_path = self._get_process_cache_path(run_id, process_name)
                    if local_path is not None:
                        local_path.parent.mkdir(parents=True, exist_ok=True)
                        joblib.dump(process_result.result, local_path)
                        cache_path = str(local_path.resolve())
            except Exception as e:
                self.logger.warning(f"Failed to cache process {process_name}: {e}")

        # If we are marking a cached completion, try to preserve the original cache_path
        # so future lookups still return a path; do not overwrite an existing cache_path
        # with null. Also preserve cache_path when the result is None (e.g., MemoryError
        # during deserialization) but the process succeeded - the worker already cached it.
        if (was_cached or (process_result.success and not process_result.result)) and (cache_path is None):
            try:
                if hasattr(self, "kv_store") and input_hash and config_hash:
                    # Prefer the full record to recover cache_path even if the status is currently running
                    existing_rec = self.kv_store.get_process_cache_record(process_name, input_hash, config_hash, function_hash)
                    if isinstance(existing_rec, dict) and existing_rec.get("cache_path"):
                        cache_path = existing_rec.get("cache_path")
                    else:
                        cache_path = self.kv_store.get_process_cache_path(process_name, input_hash, config_hash, function_hash)
            except Exception:
                cache_path = None

        status = "cached" if was_cached else ("completed" if process_result.success else "failed")
        try:
            # For cached processes, use the original timing from the cached metadata
            if was_cached and cached_started_at is not None and cached_ended_at is not None and cached_execution_time is not None:
                record = {
                    "status": status,
                    "execution_time": cached_execution_time,
                    "ended_at": cached_ended_at,
                    "process_name": process_name,
                    "started_at": cached_started_at,
                    "cached_run_id": cached_run_id,
                    "cached_started_at": cached_started_at,
                    "cached_ended_at": cached_ended_at,
                    "cached_execution_time": cached_execution_time,
                    "run_id": run_id,
                }
                # Only include cache_path when we actually have one, to avoid nulling existing values
                if cache_path:
                    record["cache_path"] = cache_path
            else:
                # For non-cached processes, use the current run timing.
                # Convert ISO timestamps to Unix timestamps for consistency with record_process_started.
                timestamp = process_result.timestamp
                if isinstance(timestamp, str):
                    try:
                        from datetime import datetime
                        timestamp = datetime.fromisoformat(timestamp).timestamp()
                    except Exception:
                        timestamp = time.time()
                elif not isinstance(timestamp, (int, float)):
                    timestamp = time.time()
                # Preserve started_at from a prior running record, and prefer the earliest ended_at
                started_at_existing = None
                ended_at_existing = None
                try:
                    prev = self.kv_store.list_run_steps(run_id)
                    if isinstance(prev, dict):
                        key = f"{process_name}.__process__"
                        _prev_rec = (prev.get(key) or {})
                        started_at_existing = _prev_rec.get("started_at")
                        ended_at_existing = _prev_rec.get("ended_at")
                except Exception:
                    started_at_existing = None
                    ended_at_existing = None

                record = {
                    "status": status,
                    "execution_time": process_result.execution_time,
                    "ended_at": timestamp,
                    "process_name": process_name,
                    # Persist this run id on the process index for future cache lookups and provenance
                    "run_id": run_id,
                }
                # Only include cache_path when we actually have one, to avoid nulling existing values
                if cache_path:
                    record["cache_path"] = cache_path
                try:
                    if started_at_existing is not None:
                        record["started_at"] = started_at_existing
                    elif isinstance(process_result.execution_time, (int, float)) and process_result.execution_time >= 0:
                        record["started_at"] = float(record["ended_at"]) - float(process_result.execution_time)
                    # Preserve the earliest ended_at if a prior value exists, to avoid late overwrites
                    if ended_at_existing is not None:
                        try:
                            record["ended_at"] = min(float(record["ended_at"]), float(ended_at_existing))
                        except Exception:
                            pass
                except Exception:
                    pass
            try:
                if enable_logging:
                    # If this was cached, try to copy ALL metrics from the source run:
                    # find another run with the same process hash and copy all of its probe metrics.
                    if was_cached and input_hash and config_hash:
                        self.logger.info(f"🔍 [METRICS COPY] Attempting to copy cached metrics for {process_name} (was_cached={was_cached})")
                        try:
                            from ..storage.adapters.gcp_kv_store import GCPStore
                            if isinstance(self.kv_store, GCPStore):
                                self.logger.info("🔍 [METRICS COPY] GCP store detected, proceeding with metrics copy (path-based)")
                                # Scan recent runs and copy metrics from probes_by_path
                                try:
                                    runs_col = self.kv_store._root.collection('runs')
                                    metric_col = self.kv_store._root.collection('metric')
                                    run_docs = list(runs_col.limit(10).stream())
                                    self.logger.info(f"🔍 [METRICS COPY] Query returned {len(run_docs)} runs")
                                except Exception as query_err:
                                    self.logger.warning(f"❌ [METRICS COPY] Failed to query runs: {query_err}")
                                    run_docs = []
                                found_metrics = False
                                for run_doc in run_docs:
                                    try:
                                        source_run_id = run_doc.id
                                        if source_run_id == run_id:
                                            continue
                                        try:
                                            docs = list(metric_col.document(source_run_id).collection('probes_by_path').limit(50).stream())
                                        except Exception:
                                            docs = []
                                        for d in docs:
                                            try:
                                                source_metrics = d.to_dict() or {}
                                                if source_metrics:
                                                    enc_id = getattr(d, 'id', '')
                                                    try:
                                                        path = decode_probe_path(enc_id)
                                                    except Exception:
                                                        continue
                                                    if not (isinstance(path, str) and (path == process_name or path.startswith(f"{process_name}/"))):
                                                        continue
                                                    self.kv_store.save_probe_metrics_by_path(run_id, path, source_metrics)
                                                    found_metrics = True
                                            except Exception:
                                                continue
                                        if found_metrics:
                                            break
                                    except Exception:
                                        continue
                                if not found_metrics:
                                    self.logger.warning(f"⚠️ [METRICS COPY] No source metrics found for {process_name}")
                            else:
                                self.logger.info(f"🔍 [METRICS COPY] Not using GCP store (type={type(self.kv_store).__name__}), skipping metrics copy")
                        except Exception as copy_err:
                            self.logger.warning(f"❌ [METRICS COPY] Failed to copy metrics for {process_name}: {copy_err}")
                            import traceback
                            self.logger.warning(f"Traceback: {traceback.format_exc()}")
                    # Note: step-level probes are now created directly by individual steps
                    # with clean paths that match chart configurations exactly.
            except Exception as e:
                self.logger.warning(f"❌ Exception in process metrics handling: {e}")
            # Prefer a batched write when the backend supports it (e.g., Firestore)
            try:
                # Debug: surface the exact key parts used for the process_indices completion write
                self.logger.debug(
                    f"process_indices[complete] key parts -> process={process_name}, ih={input_hash}, ch={config_hash}, fh={function_hash}"
                )
            except Exception:
                pass
            _should_write_index = (not was_cached) or bool(cache_path)
            if _should_write_index:
                if hasattr(self.kv_store, "set_process_cache_record_batched") and callable(getattr(self.kv_store, "set_process_cache_record_batched")):
                    try:
                        getattr(self.kv_store, "set_process_cache_record_batched")(  # type: ignore
                            run_id,
                            process_name,
                            input_hash or "",
                            config_hash or "",
                            function_hash or None,
                            record,
                            ttl_seconds=self.redis_ttl_seconds,
                        )
                    except Exception:
                        self.kv_store.set_process_cache_record(
                            process_name,
                            input_hash or "",
                            config_hash or "",
                            function_hash or None,
                            record,
                            ttl_seconds=self.redis_ttl_seconds,
                        )
                else:
                    self.kv_store.set_process_cache_record(
                        process_name,
                        input_hash or "",
                        config_hash or "",
                        function_hash or None,
                        record,
                        ttl_seconds=self.redis_ttl_seconds,
                    )
            # Per-run process summary record for the UI
            try:
                self.kv_store.record_run_step(run_id, process_name, "__process__", dict(record))
            except Exception as record_err:
                self.logger.warning(f"❌ Failed to record __process__ completion for {process_name} in run {run_id}: {record_err}")
                import traceback
                self.logger.debug(f"Traceback: {traceback.format_exc()}")
            self.kv_store.publish_event({
                "type": "process.completed",
                "process": process_name,
                "status": status,
            })
        except Exception as e:
            self.logger.warning(f"kv_store process index/event failed: {e}")

    def can_skip_step(
        self,
        run_id: str,
        step_name: str,
        input_hash: str,
        config_hash: str,
        function_hash: Optional[str] = None,
        process_name: Optional[str] = None,
    ) -> bool:
        """Check whether a step can be skipped, validating the input, config, and
        function hashes together with the process name."""
        if not input_hash or not config_hash:
            return False
        # Fast path via the kv_store exact index only
        try:
            path = self.kv_store.get_step_cache_path(
                process_name or "no_process",
                step_name,
                input_hash,
                config_hash,
                function_hash,
            )
            if path:
                # Support object store URIs and local absolute paths
                if isinstance(path, str) and path.startswith('gs://') and self.object_store:
                    try:
                        return self.object_store.exists(path)
                    except Exception:
                        pass
                else:
                    try:
                        return Path(path).exists()
                    except Exception:
                        pass
        except Exception:
            pass
        return False

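    # Illustrative usage (not part of the original module; hashes are hypothetical):
    #   if mgr.can_skip_step("run-2", "fit", input_hash="abc", config_hash="def",
    #                        function_hash="123", process_name="train"):
    #       result = mgr.get_cached_step_result("run-2", "fit", "train", "abc", "def", "123")
    # Note that a hit requires both the KV index entry and the backing artifact
    # (local file or gs:// object) to still exist.
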
    def get_expired_cache_entries(self, step_name: Optional[str] = None) -> List[Dict[str, Any]]:
        """Get expired cache entries for potential recovery or debugging (removed)."""
        return []

    def restore_expired_cache_entry(self, run_id: str, step_name: str) -> bool:
        """Restore an expired cache entry back to 'completed' status (removed)."""
        return False

    def _cleanup_stale_cache_entry(self, step_name: str, cache_path: str) -> None:
        """Deprecated no-op (removed)."""
        return None

    def get_cached_step_result(
        self,
        run_id: str,
        step_name: str,
        process_name: Optional[str] = None,
        input_hash: Optional[str] = None,
        config_hash: Optional[str] = None,
        function_hash: Optional[str] = None,
    ) -> Optional[Dict[str, Any]]:
        """Get a cached step result via a strict-hash kv_store lookup.

        Only returns a result if the KV index has a valid cache_path entry.
        There is no fallback to deterministic file paths - the cache must be
        explicitly indexed.
        """
        cache_path = self.kv_store.get_step_cache_path(
            process_name or "no_process",
            step_name,
            input_hash,
            config_hash,
            function_hash,
        )

        loaded = None
        # If the index returned a path, load from the appropriate backend
        try:
            if cache_path and isinstance(cache_path, str):
                if cache_path.startswith('gs://') and self.object_store:
                    data = self.object_store.get_bytes(cache_path)
                    loaded = joblib.load(io.BytesIO(data))
                else:
                    p = Path(cache_path)
                    if p.exists():
                        loaded = joblib.load(p)
        except Exception as e:
            self.logger.warning(f"Failed to load cached step result from {cache_path}: {e}")

        proc = process_name or "no_process"
        if loaded is not None:
            try:
                self.logger.info(f"[Cache] step hit: {proc}/{step_name}")
            except Exception:
                pass
            return loaded
        else:
            try:
                self.logger.info(f"[Cache] step miss: {proc}/{step_name}")
            except Exception:
                pass
            return None

    def get_cached_step_result_with_metadata(
        self,
        run_id: str,
        step_name: str,
        process_name: Optional[str] = None,
        input_hash: Optional[str] = None,
        config_hash: Optional[str] = None,
        function_hash: Optional[str] = None,
    ) -> Optional[tuple[Dict[str, Any], str, Dict[str, Any]]]:
        """Get a cached step result with metadata, including the cached run id and timing.

        Returns: (result, cached_run_id, cached_metadata), or None if not cached.
        """
        # First check whether a cache entry exists
        cache_path = self.kv_store.get_step_cache_path(
            process_name or "no_process",
            step_name,
            input_hash,
            config_hash,
            function_hash,
        )

        if not cache_path:
            try:
                self.logger.info(f"[Cache] step miss: {(process_name or 'no_process')}/{step_name}")
            except Exception:
                pass
            return None

        # Get the full cache record to extract metadata
        cache_record = self.kv_store.get_step_cache_record(
            process_name or "no_process",
            step_name,
            input_hash,
            config_hash,
            function_hash,
        )

        if not cache_record:
            return None

        # Load the actual cached result
        try:
            if cache_path and isinstance(cache_path, str):
                if cache_path.startswith('gs://') and self.object_store:
                    data = self.object_store.get_bytes(cache_path)
                    result = joblib.load(io.BytesIO(data))
                else:
                    p = Path(cache_path)
                    if p.exists():
                        result = joblib.load(p)
                    else:
                        result = None
            else:
                result = None
        except Exception as e:
            self.logger.warning(f"Failed to load cached step result from {cache_path}: {e}")
            result = None

        proc = process_name or "no_process"
        if result is None:
            try:
                self.logger.info(f"[Cache] step miss: {proc}/{step_name}")
            except Exception:
                pass
            return None

        # Extract metadata.
        # Prefer cached_run_id if it exists (it points to the original run that executed
        # the step); otherwise use run_id (this record is from the original execution).
        cached_run_id = cache_record.get("cached_run_id") or cache_record.get("run_id", "unknown")
        cached_metadata = {
            "started_at": cache_record.get("cached_started_at") or cache_record.get("started_at"),
            "ended_at": cache_record.get("cached_ended_at") or cache_record.get("ended_at"),
            "execution_time": cache_record.get("cached_execution_time") or cache_record.get("execution_time"),
            "run_id": cached_run_id,
        }

        return (result, cached_run_id, cached_metadata)

    def get_cached_process_result(
        self,
        process_name: str,
        input_hash: Optional[str] = None,
        config_hash: Optional[str] = None,
        function_hash: Optional[str] = None,
        run_id: Optional[str] = None,
    ) -> Optional[Dict[str, Any]]:
        """Get a cached process result via a strict-hash kv_store lookup.

        Only returns a result if the KV index has a valid cache_path entry.
        There is no fallback to deterministic file paths - the cache must be
        explicitly indexed.
        """
        cache_path = self.kv_store.get_process_cache_path(
            process_name,
            input_hash,
            config_hash,
            function_hash,
        )

        loaded = None
        load_error = None
        # If the index returned a path, load from the appropriate backend
        try:
            if cache_path and isinstance(cache_path, str):
                if cache_path.startswith('gs://') and self.object_store:
                    data = self.object_store.get_bytes(cache_path)
                    loaded = joblib.load(io.BytesIO(data))
                else:
                    p = Path(cache_path)
                    if p.exists():
                        loaded = joblib.load(p)
                    else:
                        load_error = f"Cache file not found at {cache_path}"
            else:
                load_error = "Invalid cache path format"
        except Exception as e:
            load_error = str(e)
            self.logger.warning(f"Failed to load cached result for {process_name}: {e}")

        if loaded is not None:
            try:
                self.logger.info(f"[Cache] process hit: {process_name}")
            except Exception:
                pass
            return loaded
        else:
            # A cache entry exists but the file couldn't be loaded - treat it as a miss
            if load_error:
                self.logger.warning(f"⚠️ [CACHE] Stale cache entry for {process_name}: cache index exists but file load failed ({load_error}). Treating as cache miss.")
            try:
                self.logger.info(f"[Cache] process miss: {process_name}")
            except Exception:
                pass
            return None

    def get_cached_process_result_with_metadata(
        self,
        process_name: str,
        input_hash: Optional[str] = None,
        config_hash: Optional[str] = None,
        function_hash: Optional[str] = None,
        run_id: Optional[str] = None,
    ) -> Optional[tuple[Dict[str, Any], str, Dict[str, Any]]]:
        """Get a cached process result with metadata, including the cached run id and timing.

        Returns: (result, cached_run_id, cached_metadata), or None if not cached.

        If a cache path exists but the file cannot be loaded, logs a warning and
        returns None to allow a fallback to execution. The stale cache entry
        remains in the index but won't be used until it is overwritten by a
        successful execution.
        """
        # First check whether a cache entry exists
        cache_path = self.kv_store.get_process_cache_path(
            process_name,
            input_hash,
            config_hash,
            function_hash,
        )

        if not cache_path:
            try:
                self.logger.info(f"[Cache] process miss: {process_name}")
            except Exception:
                pass
            return None

        # Get the full cache record to extract metadata
        cache_record = self.kv_store.get_process_cache_record(
            process_name,
            input_hash,
            config_hash,
            function_hash,
        )

        if not cache_record:
            return None

        # Load the actual cached result
        load_error = None
        try:
            if cache_path and isinstance(cache_path, str):
                if cache_path.startswith('gs://') and self.object_store:
                    data = self.object_store.get_bytes(cache_path)
                    result = joblib.load(io.BytesIO(data))
                else:
                    p = Path(cache_path)
                    if p.exists():
                        result = joblib.load(p)
                    else:
                        result = None
                        load_error = f"Cache file not found at {cache_path}"
            else:
                result = None
                load_error = "Invalid cache path format"
        except Exception as e:
            load_error = str(e)
            result = None

        # Extract metadata.
        # Prefer cached_run_id if it exists (it points to the original run that executed
        # the process); otherwise use run_id (this record is from the original execution).
        cached_run_id = cache_record.get("cached_run_id") or cache_record.get("run_id", "unknown")
        cached_metadata = {
            "started_at": cache_record.get("cached_started_at") or cache_record.get("started_at"),
            "ended_at": cache_record.get("cached_ended_at") or cache_record.get("ended_at"),
            "execution_time": cache_record.get("cached_execution_time") or cache_record.get("execution_time"),
            "run_id": cached_run_id,
        }

        if result is not None:
            try:
                self.logger.info(f"[Cache] process hit: {process_name}")
            except Exception:
                pass
            return (result, cached_run_id, cached_metadata)
        else:
            # A cache entry exists but the file couldn't be loaded - treat it as a miss.
            # This can happen if cache files were deleted or corrupted.
            try:
                if load_error:
                    self.logger.warning(f"⚠️ [CACHE] Stale cache entry for {process_name}: cache index exists but file load failed ({load_error}). Treating as cache miss and will re-execute.")
                else:
                    self.logger.info(f"[Cache] process miss: {process_name}")
            except Exception:
                pass
            return None

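    # Illustrative usage (not part of the original module; hashes are hypothetical):
    #   hit = mgr.get_cached_process_result_with_metadata("train", "abc", "def", "123")
    #   if hit is not None:
    #       result, source_run_id, meta = hit
    #       # meta carries the original timing, e.g.
    #       # {"started_at": ..., "ended_at": ..., "execution_time": ..., "run_id": source_run_id}
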
    def load_process_result_from_path(self, cache_path: str) -> Optional[Dict[str, Any]]:
        """Load a process result dictionary directly from a known cache path or object URI.

        Best-effort; returns None on any failure.
        """
        try:
            if not cache_path or not isinstance(cache_path, str):
                return None
            if cache_path.startswith('gs://') and self.object_store:
                data = self.object_store.get_bytes(cache_path)
                return joblib.load(io.BytesIO(data))
            p = Path(cache_path)
            if p.exists():
                return joblib.load(p)
        except Exception:
            return None

    def get_cache_statistics(self) -> Dict[str, Any]:
        """Get cache statistics."""
        cache_files = list(self.cache_dir.glob("*.pkl"))
        total_size = sum(f.stat().st_size for f in cache_files if f.exists())
        return {
            'total_cache_files': len(cache_files),
            'total_cache_size_mb': total_size / (1024**2),
            'cache_directory': str(self.cache_dir)
        }

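    # Illustrative shape (not part of the original module): for a cache_dir
    # holding two pickles totalling about 3 MiB, get_cache_statistics() returns
    # something like:
    #   {"total_cache_files": 2, "total_cache_size_mb": 3.1, "cache_directory": "/path/to/cache"}
    # The size is computed as bytes / 1024**2, i.e. mebibytes.
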
    def complete_pipeline_execution(self, run_id: str, success: bool) -> None:
        """Record pipeline completion."""
        status = "completed" if success else "failed"
        try:
            self.kv_store.mark_pipeline_completed(run_id, success)
        except Exception as e:
            self.logger.warning(f"kv_store pipeline completion failed: {e}")

    def get_step_results(self, run_id: str) -> Dict[str, Dict[str, Any]]:
        """Get all step results for a run."""
        results = {}
        try:
            step_records = self.kv_store.list_run_steps(run_id)
            for key, rec in step_records.items():
                cache_path = rec.get("cache_path")
                if cache_path:
                    try:
                        if isinstance(cache_path, str) and cache_path.startswith('gs://') and self.object_store:
                            data = self.object_store.get_bytes(cache_path)
                            result = joblib.load(io.BytesIO(data))
                        else:
                            result = joblib.load(Path(cache_path))
                        step_name = key.split(".", 1)[-1]
                        results[step_name] = result
                    except Exception as e:
                        self.logger.warning(f"Failed to load result for {key}: {e}")
        except Exception as e:
            self.logger.warning(f"kv_store list_run_steps failed: {e}")
        return results

    def can_resume_from_step(self, run_id: str, step_name: str, config_hash: str) -> bool:
        """Check whether the pipeline can be resumed (not supported without persistent status)."""
        return False

    def get_pipeline_stats(self, run_id: str) -> Dict[str, Any]:
        """Get pipeline statistics."""
        try:
            return self.kv_store.get_pipeline_stats(run_id)
        except Exception:
            return {}

    def _compute_config_hash(self, config: Dict[str, Any]) -> str:
        """Compute a hash of the step graph configuration for change detection."""
        return self._compute_hash(config)