expops-0.1.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- expops-0.1.3.dist-info/METADATA +826 -0
- expops-0.1.3.dist-info/RECORD +86 -0
- expops-0.1.3.dist-info/WHEEL +5 -0
- expops-0.1.3.dist-info/entry_points.txt +3 -0
- expops-0.1.3.dist-info/licenses/LICENSE +674 -0
- expops-0.1.3.dist-info/top_level.txt +1 -0
- mlops/__init__.py +0 -0
- mlops/__main__.py +11 -0
- mlops/_version.py +34 -0
- mlops/adapters/__init__.py +12 -0
- mlops/adapters/base.py +86 -0
- mlops/adapters/config_schema.py +89 -0
- mlops/adapters/custom/__init__.py +3 -0
- mlops/adapters/custom/custom_adapter.py +447 -0
- mlops/adapters/plugin_manager.py +113 -0
- mlops/adapters/sklearn/__init__.py +3 -0
- mlops/adapters/sklearn/adapter.py +94 -0
- mlops/cluster/__init__.py +3 -0
- mlops/cluster/controller.py +496 -0
- mlops/cluster/process_runner.py +91 -0
- mlops/cluster/providers.py +258 -0
- mlops/core/__init__.py +95 -0
- mlops/core/custom_model_base.py +38 -0
- mlops/core/dask_networkx_executor.py +1265 -0
- mlops/core/executor_worker.py +1239 -0
- mlops/core/experiment_tracker.py +81 -0
- mlops/core/graph_types.py +64 -0
- mlops/core/networkx_parser.py +135 -0
- mlops/core/payload_spill.py +278 -0
- mlops/core/pipeline_utils.py +162 -0
- mlops/core/process_hashing.py +216 -0
- mlops/core/step_state_manager.py +1298 -0
- mlops/core/step_system.py +956 -0
- mlops/core/workspace.py +99 -0
- mlops/environment/__init__.py +10 -0
- mlops/environment/base.py +43 -0
- mlops/environment/conda_manager.py +307 -0
- mlops/environment/factory.py +70 -0
- mlops/environment/pyenv_manager.py +146 -0
- mlops/environment/setup_env.py +31 -0
- mlops/environment/system_manager.py +66 -0
- mlops/environment/utils.py +105 -0
- mlops/environment/venv_manager.py +134 -0
- mlops/main.py +527 -0
- mlops/managers/project_manager.py +400 -0
- mlops/managers/reproducibility_manager.py +575 -0
- mlops/platform.py +996 -0
- mlops/reporting/__init__.py +16 -0
- mlops/reporting/context.py +187 -0
- mlops/reporting/entrypoint.py +292 -0
- mlops/reporting/kv_utils.py +77 -0
- mlops/reporting/registry.py +50 -0
- mlops/runtime/__init__.py +9 -0
- mlops/runtime/context.py +34 -0
- mlops/runtime/env_export.py +113 -0
- mlops/storage/__init__.py +12 -0
- mlops/storage/adapters/__init__.py +9 -0
- mlops/storage/adapters/gcp_kv_store.py +778 -0
- mlops/storage/adapters/gcs_object_store.py +96 -0
- mlops/storage/adapters/memory_store.py +240 -0
- mlops/storage/adapters/redis_store.py +438 -0
- mlops/storage/factory.py +199 -0
- mlops/storage/interfaces/__init__.py +6 -0
- mlops/storage/interfaces/kv_store.py +118 -0
- mlops/storage/path_utils.py +38 -0
- mlops/templates/premier-league/charts/plot_metrics.js +70 -0
- mlops/templates/premier-league/charts/plot_metrics.py +145 -0
- mlops/templates/premier-league/charts/requirements.txt +6 -0
- mlops/templates/premier-league/configs/cluster_config.yaml +13 -0
- mlops/templates/premier-league/configs/project_config.yaml +207 -0
- mlops/templates/premier-league/data/England CSV.csv +12154 -0
- mlops/templates/premier-league/models/premier_league_model.py +638 -0
- mlops/templates/premier-league/requirements.txt +8 -0
- mlops/templates/sklearn-basic/README.md +22 -0
- mlops/templates/sklearn-basic/charts/plot_metrics.py +85 -0
- mlops/templates/sklearn-basic/charts/requirements.txt +3 -0
- mlops/templates/sklearn-basic/configs/project_config.yaml +64 -0
- mlops/templates/sklearn-basic/data/train.csv +14 -0
- mlops/templates/sklearn-basic/models/model.py +62 -0
- mlops/templates/sklearn-basic/requirements.txt +10 -0
- mlops/web/__init__.py +3 -0
- mlops/web/server.py +585 -0
- mlops/web/ui/index.html +52 -0
- mlops/web/ui/mlops-charts.js +357 -0
- mlops/web/ui/script.js +1244 -0
- mlops/web/ui/styles.css +248 -0
@@ -0,0 +1,1298 @@
from typing import Dict, List, Optional, Any
import json
import hashlib
import io
from pathlib import Path
from datetime import datetime
import logging
from dataclasses import dataclass
import numpy as np
import joblib
import time
import inspect
import ast

from mlops.storage.interfaces.kv_store import KeyValueEventStore, ObjectStore
from mlops.storage.path_utils import decode_probe_path


@dataclass
class StepExecutionResult:
    """Result of executing a single step."""
    step_name: str
    success: bool
    result: Optional[Dict[str, Any]] = None
    error: Optional[str] = None
    execution_time: float = 0.0
    timestamp: str = ""

@dataclass
class ProcessExecutionResult:
    """Result of executing a single process."""
    process_name: str
    success: bool
    result: Optional[Dict[str, Any]] = None
    error: Optional[str] = None
    execution_time: float = 0.0
    timestamp: str = ""


class StepStateManager:
    """State manager for step-based pipeline execution with caching."""

    def __init__(self, cache_dir: Path, kv_store: KeyValueEventStore, logger: Optional[logging.Logger] = None,
                 cache_ttl_hours: Optional[int] = None, object_store: Optional[ObjectStore] = None,
                 object_prefix: Optional[str] = None):
        self.logger = logger or logging.getLogger(__name__)
        self.cache_dir = cache_dir

        self.kv_store = kv_store
        self.redis_ttl_seconds = int((cache_ttl_hours or 24) * 3600)
        self.object_store = object_store
        self.object_prefix = object_prefix.strip("/") if isinstance(object_prefix, str) else None

    def _safe_proc(self, name: Optional[str]) -> str:
        """Return a filesystem-safe process identifier."""
        return (name or 'no_process').replace('/', '_')

    def _stable_step_filename(
        self,
        process_name: Optional[str],
        step_name: str,
        input_hash: Optional[str],
        config_hash: Optional[str],
        function_hash: Optional[str],
    ) -> Optional[str]:
        if not input_hash or not config_hash:
            return None
        return f"stable_{self._safe_proc(process_name)}_{step_name}_{input_hash}_{config_hash}_{(function_hash or 'none')}.pkl"

    def _stable_process_filename(
        self,
        process_name: str,
        input_hash: Optional[str],
        config_hash: Optional[str],
        function_hash: Optional[str],
    ) -> Optional[str]:
        if not input_hash or not config_hash:
            return None
        return f"stable_process__{self._safe_proc(process_name)}_{input_hash}_{config_hash}_{(function_hash or 'none')}.pkl"

    def _build_object_uri(self, filename: str) -> str:
        """Build an object store URI honoring the optional prefix."""
        return self.object_store.build_uri(*(filter(None, [self.object_prefix, filename])))

    def _format_probe_path(
        self,
        process_name: Optional[str],
        step_name: Optional[str],
        input_hash: Optional[str] = None,
        config_hash: Optional[str] = None,
        function_hash: Optional[str] = None,
    ) -> str:
        """Clean, human-readable path string for charts.

        Returns simple process or process/step paths without hash suffixes.
        Since metrics are now cached directly, we don't need hash disambiguation.
        """
        if step_name is None:
            # Process-level path
            return str(process_name or "no_process")
        else:
            # Step-level path
            return f"{process_name or 'no_process'}/{step_name}"

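    # --- Editor's illustration (not part of the packaged module) ------------
    # Shapes produced by the helpers above, with hypothetical hash values:
    #   _stable_step_filename("train", "fit", "abc123", "def456", None)
    #       -> "stable_train_fit_abc123_def456_none.pkl"
    #   _format_probe_path("train", None)   -> "train"
    #   _format_probe_path("train", "fit")  -> "train/fit"
    # -------------------------------------------------------------------------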
    def _append_probe_metrics(self, run_id: str, probe_id: str, new_metrics: Dict[str, Any], path_key: str, step: int = 0) -> None:
        """Append numeric metrics as step-indexed dictionaries under metric/{run_id}/probes/{probe_id}.

        New behavior (MLflow-style):
        - Numeric values -> stored as {step_number: value} dictionaries
        - Non-numeric values -> store last snapshot under a separate map
        - step=0 is reserved for auto-logged metrics (from process/step returns)
        - step>=1 for manual log_metric() calls
        """
        try:
            try:
                self.logger.info(f"[Metrics] Append begin -> run_id={run_id}, path_key={path_key}, step={step}, keys={list((new_metrics or {}).keys())}")
            except Exception:
                pass
            # Attempt to read existing metrics for this probe and append
            existing = {}
            try:
                existing = self.kv_store.get_probe_metrics_by_path(run_id, path_key) or {}
            except Exception:
                existing = {}
            updated: Dict[str, Any] = dict(existing) if isinstance(existing, dict) else {}
            def _to_firestore_safe(obj: Any) -> Any:
                """Convert values to Firestore-safe types.
                - Dict keys must be strings
                - Convert numpy scalars to native Python
                - Recurse through lists/tuples/dicts
                """
                try:
                    import numpy as _np  # type: ignore
                except Exception:
                    _np = None  # type: ignore
                # Primitive JSON-safe types
                if obj is None or isinstance(obj, (bool, int, float, str)):
                    return obj
                # Numpy scalar types -> Python native
                if _np is not None and isinstance(obj, (_np.integer, _np.floating)):
                    try:
                        return float(obj) if isinstance(obj, _np.floating) else int(obj)
                    except Exception:
                        return obj.item()  # type: ignore[attr-defined]
                # Lists/Tuples -> list of safe
                if isinstance(obj, (list, tuple)):
                    return [_to_firestore_safe(x) for x in obj]
                # Dicts -> string keys and safe values
                if isinstance(obj, dict):
                    out = {}
                    for k, v in obj.items():
                        try:
                            out[str(k)] = _to_firestore_safe(v)
                        except Exception:
                            # Best-effort: stringify both key and value
                            out[str(k)] = str(v)
                    return out
                # Fallback: stringify
                try:
                    return str(obj)
                except Exception:
                    return obj

            for mname, mval in (new_metrics or {}).items():
                try:
                    if isinstance(mval, (int, float)):
                        # Get existing metric dict (or create new one)
                        metric_dict = updated.get(mname) or {}
                        # Handle legacy list format - convert to dict
                        if isinstance(metric_dict, list):
                            # Convert old list format to dict (use indices as steps)
                            # IMPORTANT: Keys must be strings for Firestore compatibility
                            metric_dict = {str(i): v for i, v in enumerate(metric_dict)}
                        elif not isinstance(metric_dict, dict):
                            metric_dict = {}
                        # Add new value at specified step
                        # IMPORTANT: Convert step to string for Firestore compatibility
                        metric_dict[str(step)] = float(mval)
                        updated[mname] = metric_dict
                    else:
                        # Store non-numeric snapshot directly under the metric name
                        # Ensure payload is Firestore-safe (string keys, JSON-serializable)
                        safe_val = _to_firestore_safe(mval)
                        updated[mname] = safe_val
                except Exception:
                    continue
            try:
                self.logger.info(f"[Metrics] Saving metrics -> run_id={run_id}, path_key={path_key}, keys={list(updated.keys())}")
            except Exception:
                pass
            self.kv_store.save_probe_metrics_by_path(run_id, path_key, updated)
        except Exception as e:
            self.logger.warning(f"Failed to append probe metrics for {probe_id}: {e}")

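    # --- Editor's illustration (not part of the packaged module) ------------
    # How _append_probe_metrics accumulates a numeric metric, assuming an
    # empty store to begin with (values are hypothetical):
    #   append step=0 {"accuracy": 0.81}  -> {"accuracy": {"0": 0.81}}
    #   append step=1 {"accuracy": 0.84}  -> {"accuracy": {"0": 0.81, "1": 0.84}}
    #   append step=2 {"report": {"f1": 0.8}} -> non-numeric, stored as a snapshot
    # Keys are stringified because Firestore map keys must be strings.
    # -------------------------------------------------------------------------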
    def log_metric(self, run_id: str, process_name: Optional[str], step_name: Optional[str],
                   metric_name: str, value: Any, step: Optional[int] = None) -> None:
        """Manually log a metric with a step number (MLflow-style).

        Args:
            run_id: Current run ID
            process_name: Process name (None for process-level metrics)
            step_name: Step name (None for process-level metrics)
            metric_name: Name of the metric
            value: Metric value
            step: Step number (if None, auto-increments from the largest existing step)
        """
        try:
            # Compute path for this process/step
            path_key = self._format_probe_path(process_name, step_name)

            # Get existing metrics to determine next step if needed
            if step is None:
                existing = self.kv_store.get_probe_metrics_by_path(run_id, path_key) or {}
                metric_dict = existing.get(metric_name)
                if isinstance(metric_dict, dict) and metric_dict:
                    try:
                        max_step = max(int(k) for k in metric_dict.keys())
                        step = max_step + 1
                    except (ValueError, TypeError):
                        step = 1
                else:
                    # No existing data, start at 1
                    step = 1

            # Log the metric
            self._append_probe_metrics(run_id, path_key, {metric_name: value}, path_key, step=step)
        except Exception as e:
            self.logger.warning(f"Failed to log metric {metric_name}: {e}")

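    # --- Editor's illustration (not part of the packaged module) ------------
    # Manual logging with auto-incremented steps, assuming `ssm` is a
    # configured StepStateManager and the names below are hypothetical:
    #   ssm.log_metric("run-1", "train", "fit", "loss", 0.93)          # stored at step 1
    #   ssm.log_metric("run-1", "train", "fit", "loss", 0.71)          # auto-increments to 2
    #   ssm.log_metric("run-1", "train", "fit", "loss", 0.52, step=10) # explicit step
    # -------------------------------------------------------------------------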
    def _get_cache_path(self, run_id: str, step_name: str, process_name: Optional[str] = None) -> Path:
        # Include process_name to avoid collisions across processes
        safe_proc = (process_name or "no_process").replace("/", "_")
        return self.cache_dir / f"{run_id}_{safe_proc}_{step_name}.pkl"

    def _get_process_cache_path(self, run_id: str, process_name: str) -> Path:
        safe_proc = (process_name or "no_process").replace("/", "_")
        return self.cache_dir / f"{run_id}__process__{safe_proc}.pkl"

    def _get_stable_step_cache_path(
        self,
        step_name: str,
        process_name: Optional[str],
        input_hash: Optional[str],
        config_hash: Optional[str],
        function_hash: Optional[str],
    ) -> Optional[Path]:
        """Deterministic, cross-run cache file path for a step based on hashes.

        Returns None if required hashes are missing.
        """
        if not input_hash or not config_hash:
            return None
        safe_proc = (process_name or "no_process").replace("/", "_")
        fhash = function_hash or "none"
        return self.cache_dir / f"stable_{safe_proc}_{step_name}_{input_hash}_{config_hash}_{fhash}.pkl"

    def _get_stable_process_cache_path(
        self,
        process_name: str,
        input_hash: Optional[str],
        config_hash: Optional[str],
        function_hash: Optional[str],
    ) -> Optional[Path]:
        """Deterministic, cross-run cache file path for a process based on hashes.

        Returns None if required hashes are missing.
        """
        if not input_hash or not config_hash:
            return None
        safe_proc = (process_name or "no_process").replace("/", "_")
        fhash = function_hash or "none"
        return self.cache_dir / f"stable_process__{safe_proc}_{input_hash}_{config_hash}_{fhash}.pkl"

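    # --- Editor's illustration (not part of the packaged module) ------------
    # Per-run vs. stable (cross-run) cache locations, with hypothetical hashes:
    #   _get_cache_path("run-1", "fit", "train")
    #       -> <cache_dir>/run-1_train_fit.pkl                        (tied to one run id)
    #   _get_stable_step_cache_path("fit", "train", "abc123", "def456", "f00d")
    #       -> <cache_dir>/stable_train_fit_abc123_def456_f00d.pkl    (reusable across runs)
    # -------------------------------------------------------------------------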
    def _compute_hash(self, obj: Any) -> str:
        """Compute cryptographically secure SHA-256 hash of any object.

        Uses canonical JSON serialization with custom handling for common non-JSON
        types to ensure determinism and minimize collision risk.
        """
        try:
            def _to_canonical(o: Any) -> Any:
                # Primitive JSON types pass through
                if o is None or isinstance(o, (str, int, float, bool)):
                    return o
                # Paths -> string
                if isinstance(o, Path):
                    return str(o)
                # Datetime -> ISO8601
                if isinstance(o, datetime):
                    return {"__datetime__": True, "iso": o.isoformat()}
                # Bytes-like -> SHA-256 digest to avoid bloating payloads
                if isinstance(o, (bytes, bytearray, memoryview)):
                    bh = hashlib.sha256()
                    bh.update(bytes(o))
                    return {"__bytes__": True, "sha256": bh.hexdigest()}
                # NumPy arrays -> digest over shape|dtype|data
                if isinstance(o, np.ndarray):
                    import os as _os
                    _prev_omp = _os.environ.get('OMP_NUM_THREADS')
                    try:
                        _os.environ['OMP_NUM_THREADS'] = '1'
                        ah = hashlib.sha256()
                        ah.update(b"ndarray|")
                        ah.update(str(o.shape).encode("utf-8"))
                        ah.update(b"|")
                        ah.update(str(o.dtype).encode("utf-8"))
                        ah.update(b"|")
                        ah.update(o.tobytes())
                        return {"__ndarray__": True, "sha256": ah.hexdigest()}
                    finally:
                        if _prev_omp is not None:
                            _os.environ['OMP_NUM_THREADS'] = _prev_omp
                        else:
                            _os.environ.pop('OMP_NUM_THREADS', None)
                # Mappings -> dict with stringified keys, recursively canonicalized, sorted by key
                if isinstance(o, dict):
                    return {str(k): _to_canonical(v) for k, v in sorted(o.items(), key=lambda kv: str(kv[0]))}
                # Sequences -> list of canonicalized items
                if isinstance(o, (list, tuple)):
                    return [_to_canonical(x) for x in o]
                # Sets -> sorted list to make order deterministic
                if isinstance(o, (set, frozenset)):
                    return sorted([_to_canonical(x) for x in o], key=lambda x: json.dumps(x, sort_keys=True, separators=(",", ":")))
                # Fallback: use repr for a stable textual form
                return {"__repr__": True, "type": type(o).__name__, "value": repr(o)}

            canonical = _to_canonical(obj)
            payload = json.dumps(canonical, sort_keys=True, separators=(",", ":"), ensure_ascii=False).encode("utf-8")
            return hashlib.sha256(payload).hexdigest()
        except Exception as e:
            self.logger.warning(f"Failed to compute hash for {type(obj)}: {e}")
            return hashlib.sha256(f"{type(obj).__name__}:{datetime.now()}".encode()).hexdigest()

    def _compute_function_hash(self, func: callable) -> str:
        """Compute hash of a function's source code and signature only.
        """
        try:
            try:
                source = inspect.getsource(func)
            except (OSError, TypeError):
                source = f"{func.__module__}.{func.__qualname__}" if hasattr(func, '__qualname__') else str(func)

            try:
                sig = str(inspect.signature(func))
            except (ValueError, TypeError):
                sig = ""

            try:
                tree = ast.parse(source)
                normalized = ast.dump(tree)
            except:
                normalized = ' '.join(source.split())

            return hashlib.sha256(f"{normalized}|{sig}".encode()).hexdigest()

        except Exception as e:
            self.logger.warning(f"Failed to compute function hash for {func}: {e}")
            # Fallback hash based on function name
            return hashlib.sha256(str(func).encode()).hexdigest()

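    # --- Editor's illustration (not part of the packaged module) ------------
    # The canonical-JSON hashing above is insensitive to dict key order, so
    # the following two calls would be expected to return the same digest:
    #   ssm._compute_hash({"lr": 0.01, "epochs": 5})
    #   ssm._compute_hash({"epochs": 5, "lr": 0.01})
    # while changing any value (or an ndarray's bytes/shape/dtype) changes it.
    # -------------------------------------------------------------------------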
    def start_pipeline_execution(self, run_id: str, config: Dict[str, Any], cache_enabled: bool = True) -> None:
        """Record pipeline start."""
        try:
            self.kv_store.mark_pipeline_started(run_id)
        except Exception as e:
            self.logger.warning(f"kv_store pipeline start failed: {e}")

    def record_step_started(
        self,
        run_id: str,
        process_name: Optional[str],
        step_name: str,
    ) -> None:
        """Record that a step has started running for the given run.

        Writes a per-run step record so the web UI can show start time and live elapsed.
        """
        try:
            record = {
                "status": "running",
                "started_at": time.time(),
                "execution_time": 0.0,  # Initialize to 0, will be updated on completion
                "step_name": step_name,
                "process_name": process_name or "no_process",
            }
            self.kv_store.record_run_step(run_id, process_name or "no_process", step_name, record)
            self.kv_store.publish_event({
                "type": "step.started",
                "process": process_name or "no_process",
                "step": step_name,
                "status": "running",
            })
        except Exception as e:
            self.logger.warning(f"kv_store step started record failed: {e}")

    def record_process_started(
        self,
        run_id: str,
        process_name: str,
        input_hash: Optional[str] = None,
        config_hash: Optional[str] = None,
        function_hash: Optional[str] = None,
        started_at: Optional[float] = None,
        enable_logging: bool = True,
    ) -> None:
        """Record that a process has started running for the given run.

        Writes a per-run process record under the special step name "__process__" to
        allow the web UI to display start time and live elapsed. If strict hashes are
        available, best-effort mark the process index as running as well.
        """
        try:
            # Use provided started_at (captured at timing start) if available
            _started = float(started_at) if isinstance(started_at, (int, float)) else time.time()
            record = {
                "status": "running",
                "started_at": _started,
                "execution_time": 0.0,  # Initialize to 0, will be updated on completion
                "process_name": process_name,
            }
            # Per-run process record for UI
            try:
                self.kv_store.record_run_step(run_id, process_name, "__process__", dict(record))
            except Exception:
                pass
            # Best-effort: reflect running state in process index when hashes available
            if input_hash and config_hash:
                try:
                    # Avoid overriding an existing terminal cache record that already has a cache_path
                    existing = self.kv_store.get_process_cache_record(
                        process_name,
                        input_hash or "",
                        config_hash or "",
                        function_hash or None,
                    )
                    should_write_running = not (isinstance(existing, dict) and existing.get("status") in ("completed", "cached") and existing.get("cache_path"))
                    if should_write_running:
                        if hasattr(self.kv_store, "set_process_cache_record_batched") and callable(getattr(self.kv_store, "set_process_cache_record_batched")):
                            getattr(self.kv_store, "set_process_cache_record_batched")(  # type: ignore
                                run_id,
                                process_name,
                                input_hash or "",
                                config_hash or "",
                                function_hash or None,
                                record,
                                ttl_seconds=self.redis_ttl_seconds,
                            )
                        else:
                            self.kv_store.set_process_cache_record(
                                process_name,
                                input_hash or "",
                                config_hash or "",
                                function_hash or None,
                                record,
                                ttl_seconds=self.redis_ttl_seconds,
                            )
                except Exception:
                    pass
            self.kv_store.publish_event({
                "type": "process.started",
                "process": process_name,
                "status": "running",
            })
        except Exception as e:
            self.logger.warning(f"kv_store process started record failed: {e}")

    def record_step_completion(
        self,
        run_id: str,
        step_result: StepExecutionResult,
        input_hash: Optional[str] = None,
        config_hash: Optional[str] = None,
        function_name: Optional[str] = None,
        function_hash: Optional[str] = None,
        was_cached: bool = False,
        process_name: Optional[str] = None,
        enable_logging: bool = True,
        cached_run_id: Optional[str] = None,
        cached_started_at: Optional[float] = None,
        cached_ended_at: Optional[float] = None,
        cached_execution_time: Optional[float] = None,
    ) -> None:
        """Record step completion with hash-based caching including function hash and process_name.

        Args:
            enable_logging: If True, create probes for metrics. If False, skip probe creation.
        """
        step_name = step_result.step_name
        cache_path = None

        if step_result.success and step_result.result and not was_cached:
            try:
                # Prefer object store when configured; otherwise cache locally (absolute path)
                if self.object_store:
                    import tempfile, os as _os
                    # Write to a temporary file to avoid large in-memory buffers
                    with tempfile.NamedTemporaryFile(suffix=".pkl", delete=False) as _tmpf:
                        tmp_path = _tmpf.name
                    try:
                        joblib.dump(step_result.result, tmp_path)
                        fname = self._stable_step_filename(process_name, step_name, input_hash, config_hash, function_hash) \
                            if (input_hash and config_hash) else f"{run_id}_{self._safe_proc(process_name)}_{step_name}.pkl"
                        cache_uri = self._build_object_uri(fname)
                        if hasattr(self.object_store, 'put_file'):
                            self.object_store.put_file(cache_uri, tmp_path, content_type="application/octet-stream")
                        else:
                            with open(tmp_path, 'rb') as _f:
                                self.object_store.put_bytes(cache_uri, _f.read(), content_type="application/octet-stream")
                        cache_path = cache_uri
                    finally:
                        try:
                            _os.remove(tmp_path)
                        except Exception:
                            pass
                else:
                    # Local filesystem fallback with absolute path stored in KV
                    if input_hash and config_hash:
                        local_path = self._get_stable_step_cache_path(step_name, process_name, input_hash, config_hash, function_hash)
                    else:
                        local_path = self._get_cache_path(run_id, step_name, process_name)
                    if local_path is not None:
                        local_path.parent.mkdir(parents=True, exist_ok=True)
                        joblib.dump(step_result.result, local_path)
                        cache_path = str(local_path.resolve())
            except Exception as e:
                self.logger.warning(f"Failed to cache step {step_name}: {e}")

        status = "cached" if was_cached else ("completed" if step_result.success else "failed")
        try:
            # For cached steps, use the original timing from cache
            if was_cached and cached_started_at is not None and cached_ended_at is not None and cached_execution_time is not None:
                record = {
                    "status": status,
                    "execution_time": cached_execution_time,
                    "ended_at": cached_ended_at,
                    "cache_path": cache_path,
                    "step_name": step_name,
                    "process_name": process_name or "no_process",
                    "run_id": run_id,
                    "started_at": cached_started_at,
                    "cached_run_id": cached_run_id,
                }
            else:
                # For non-cached steps, use current run timing
                # Convert ISO timestamp to Unix timestamp for consistency with record_step_started
                timestamp = step_result.timestamp
                if isinstance(timestamp, str):
                    try:
                        from datetime import datetime
                        timestamp = datetime.fromisoformat(timestamp).timestamp()
                    except Exception:
                        timestamp = time.time()
                elif not isinstance(timestamp, (int, float)):
                    timestamp = time.time()
                # Preserve started_at if a prior running record exists so UI can compute duration
                started_at_existing = None
                try:
                    prev = self.kv_store.list_run_steps(run_id)
                    if isinstance(prev, dict):
                        key = f"{process_name or 'no_process'}.{step_name}"
                        started_at_existing = (prev.get(key) or {}).get("started_at")
                except Exception:
                    started_at_existing = None

                record = {
                    "status": status,
                    "execution_time": step_result.execution_time,
                    "ended_at": timestamp,
                    "cache_path": cache_path,
                    "step_name": step_name,
                    "process_name": process_name or "no_process",
                    "run_id": run_id,
                }
                # If we have an existing started_at, include it; otherwise derive from ended_at - execution_time
                try:
                    if started_at_existing is not None:
                        record["started_at"] = started_at_existing
                    elif isinstance(step_result.execution_time, (int, float)) and step_result.execution_time >= 0:
                        record["started_at"] = float(record["ended_at"]) - float(step_result.execution_time)
                except Exception:
                    pass
            # Note: Metrics are no longer auto-logged from step results.
            # Users must explicitly call log_metric() to log metrics.
            # No probe_id bookkeeping needed.
            # Prefer batched write when backend supports it (e.g., Firestore)
            if hasattr(self.kv_store, "set_step_cache_record_batched") and callable(getattr(self.kv_store, "set_step_cache_record_batched")):
                try:
                    getattr(self.kv_store, "set_step_cache_record_batched")(  # type: ignore
                        run_id,
                        process_name or "no_process",
                        step_name,
                        input_hash or "",
                        config_hash or "",
                        function_hash or None,
                        record,
                        ttl_seconds=self.redis_ttl_seconds,
                    )
                except Exception:
                    # Fallback to non-batched on error
                    self.kv_store.set_step_cache_record(
                        process_name or "no_process",
                        step_name,
                        input_hash or "",
                        config_hash or "",
                        function_hash or None,
                        record,
                        ttl_seconds=self.redis_ttl_seconds,
                    )
            else:
                self.kv_store.set_step_cache_record(
                    process_name or "no_process",
                    step_name,
                    input_hash or "",
                    config_hash or "",
                    function_hash or None,
                    record,
                    ttl_seconds=self.redis_ttl_seconds,
                )
            # Stats for cache hit/miss
            if status == "cached":
                self.kv_store.increment_stat(run_id, "cache_hit_count", 1)
            # Per-run step record for UI
            try:
                self.kv_store.record_run_step(run_id, process_name or "no_process", step_name, dict(record))
            except Exception as record_err:
                self.logger.warning(f"❌ Failed to record step completion for {step_name} in process {process_name} run {run_id}: {record_err}")
                import traceback
                self.logger.debug(f"Traceback: {traceback.format_exc()}")
            self.kv_store.publish_event({
                "type": "step.completed",
                "process": process_name or "no_process",
                "step": step_name,
                "status": status,
            })
        except Exception as e:
            self.logger.warning(f"kv_store step index/event failed: {e}")

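    # --- Editor's illustration (not part of the packaged module) ------------
    # Shape of the per-step record written above for a fresh (non-cached) run,
    # with hypothetical values:
    #   {"status": "completed", "execution_time": 12.4, "ended_at": 1714000012.4,
    #    "started_at": 1714000000.0, "cache_path": ".../stable_train_fit_....pkl",
    #    "step_name": "fit", "process_name": "train", "run_id": "run-1"}
    # ISO-8601 timestamps from StepExecutionResult are converted to Unix epoch
    # seconds so they line up with the started_at written by record_step_started.
    # -------------------------------------------------------------------------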
    def record_process_completion(
        self,
        run_id: str,
        process_result: ProcessExecutionResult,
        input_hash: Optional[str] = None,
        config_hash: Optional[str] = None,
        function_hash: Optional[str] = None,
        was_cached: bool = False,
        enable_logging: bool = True,
        cached_run_id: Optional[str] = None,
        cached_started_at: Optional[float] = None,
        cached_ended_at: Optional[float] = None,
        cached_execution_time: Optional[float] = None,
    ) -> None:
        """Record process completion and cache combined process result.

        Args:
            enable_logging: If True, create probes for metrics. If False, skip probe creation.
            cached_run_id: Run ID of the original cached execution (for was_cached=True)
            cached_started_at: Start timestamp from original cached execution
            cached_ended_at: End timestamp from original cached execution
            cached_execution_time: Execution time from original cached execution
        """
        process_name = process_result.process_name
        cache_path = None

        # When we have a fresh result (not from cache) we persist the artifact.
        if process_result.success and process_result.result and not was_cached:
            try:
                if self.object_store:
                    import tempfile, os as _os
                    # Write to a temporary file to avoid large in-memory buffers
                    with tempfile.NamedTemporaryFile(suffix=".pkl", delete=False) as _tmpf:
                        tmp_path = _tmpf.name
                    try:
                        joblib.dump(process_result.result, tmp_path)
                        fname = self._stable_process_filename(process_name, input_hash, config_hash, function_hash) \
                            if (input_hash and config_hash) else f"{run_id}__process__{self._safe_proc(process_name)}.pkl"
                        cache_uri = self._build_object_uri(fname)
                        if hasattr(self.object_store, 'put_file'):
                            self.object_store.put_file(cache_uri, tmp_path, content_type="application/octet-stream")
                        else:
                            with open(tmp_path, 'rb') as _f:
                                self.object_store.put_bytes(cache_uri, _f.read(), content_type="application/octet-stream")
                        cache_path = cache_uri
                    finally:
                        try:
                            _os.remove(tmp_path)
                        except Exception:
                            pass
                else:
                    # Local filesystem fallback with absolute path stored in KV
                    if input_hash and config_hash:
                        local_path = self._get_stable_process_cache_path(process_name, input_hash, config_hash, function_hash)
                    else:
                        local_path = self._get_process_cache_path(run_id, process_name)
                    if local_path is not None:
                        local_path.parent.mkdir(parents=True, exist_ok=True)
                        joblib.dump(process_result.result, local_path)
                        cache_path = str(local_path.resolve())
            except Exception as e:
                self.logger.warning(f"Failed to cache process {process_name}: {e}")

        # If we are marking a cached completion, try to preserve the original cache_path
        # so future lookups still return a path. Do not overwrite existing cache_path with null.
        # Also preserve cache_path if result is None (e.g., MemoryError during deserialization)
        # but process succeeded - worker already cached it
        if (was_cached or (process_result.success and not process_result.result)) and (cache_path is None):
            try:
                if hasattr(self, "kv_store") and input_hash and config_hash:
                    # Prefer full record to recover cache_path even if status is currently running
                    existing_rec = self.kv_store.get_process_cache_record(process_name, input_hash, config_hash, function_hash)
                    if isinstance(existing_rec, dict) and existing_rec.get("cache_path"):
                        cache_path = existing_rec.get("cache_path")
                    else:
                        cache_path = self.kv_store.get_process_cache_path(process_name, input_hash, config_hash, function_hash)
            except Exception:
                cache_path = None

        status = "cached" if was_cached else ("completed" if process_result.success else "failed")
        try:
            # For cached processes, use the original timing from the cached metadata
            if was_cached and cached_started_at is not None and cached_ended_at is not None and cached_execution_time is not None:
                record = {
                    "status": status,
                    "execution_time": cached_execution_time,
                    "ended_at": cached_ended_at,
                    "process_name": process_name,
                    "started_at": cached_started_at,
                    "cached_run_id": cached_run_id,
                    "cached_started_at": cached_started_at,
                    "cached_ended_at": cached_ended_at,
                    "cached_execution_time": cached_execution_time,
                    "run_id": run_id,
                }
                # Only include cache_path when we actually have one to avoid nulling existing values
                if cache_path:
                    record["cache_path"] = cache_path
            else:
                # For non-cached processes, use current run timing
                # Convert ISO timestamp to Unix timestamp for consistency with record_process_started
                timestamp = process_result.timestamp
                if isinstance(timestamp, str):
                    try:
                        from datetime import datetime
                        timestamp = datetime.fromisoformat(timestamp).timestamp()
                    except Exception:
                        timestamp = time.time()
                elif not isinstance(timestamp, (int, float)):
                    timestamp = time.time()
                # Preserve started_at from prior running record, and prefer the earliest ended_at
                started_at_existing = None
                ended_at_existing = None
                try:
                    prev = self.kv_store.list_run_steps(run_id)
                    if isinstance(prev, dict):
                        key = f"{process_name}.__process__"
                        _prev_rec = (prev.get(key) or {})
                        started_at_existing = _prev_rec.get("started_at")
                        ended_at_existing = _prev_rec.get("ended_at")
                except Exception:
                    started_at_existing = None
                    ended_at_existing = None

                record = {
                    "status": status,
                    "execution_time": process_result.execution_time,
                    "ended_at": timestamp,
                    "process_name": process_name,
                    # Persist this run id on process index for future cache lookups and provenance
                    "run_id": run_id,
                }
                # Only include cache_path when we actually have one to avoid nulling existing values
                if cache_path:
                    record["cache_path"] = cache_path
                try:
                    if started_at_existing is not None:
                        record["started_at"] = started_at_existing
                    elif isinstance(process_result.execution_time, (int, float)) and process_result.execution_time >= 0:
                        record["started_at"] = float(record["ended_at"]) - float(process_result.execution_time)
                    # Preserve the earliest ended_at if a prior value exists to avoid late overwrites
                    if ended_at_existing is not None:
                        try:
                            record["ended_at"] = min(float(record["ended_at"]), float(ended_at_existing))
                        except Exception:
                            pass
                except Exception:
                    pass
            try:
                if enable_logging:
                    # If this was cached, try to copy ALL metrics from the source run
                    # Find another run with the same process hash and copy all its probe metrics
                    if was_cached and input_hash and config_hash:
                        self.logger.info(f"🔍 [METRICS COPY] Attempting to copy cached metrics for {process_name} (was_cached={was_cached})")
                        try:
                            from ..storage.adapters.gcp_kv_store import GCPStore
                            if isinstance(self.kv_store, GCPStore):
                                self.logger.info(f"🔍 [METRICS COPY] GCP store detected, proceeding with metrics copy (path-based)")
                                # Scan recent runs and copy metrics from probes_by_path
                                try:
                                    runs_col = self.kv_store._root.collection('runs')
                                    metric_col = self.kv_store._root.collection('metric')
                                    run_docs = list(runs_col.limit(10).stream())
                                    self.logger.info(f"🔍 [METRICS COPY] Query returned {len(run_docs)} runs")
                                except Exception as query_err:
                                    self.logger.warning(f"❌ [METRICS COPY] Failed to query runs: {query_err}")
                                    run_docs = []
                                found_metrics = False
                                for run_doc in run_docs:
                                    try:
                                        source_run_id = run_doc.id
                                        if source_run_id == run_id:
                                            continue
                                        try:
                                            docs = list(metric_col.document(source_run_id).collection('probes_by_path').limit(50).stream())
                                        except Exception:
                                            docs = []
                                        for d in docs:
                                            try:
                                                source_metrics = d.to_dict() or {}
                                                if source_metrics:
                                                    enc_id = getattr(d, 'id', '')
                                                    try:
                                                        path = decode_probe_path(enc_id)
                                                    except Exception:
                                                        continue
                                                    if not (isinstance(path, str) and (path == process_name or path.startswith(f"{process_name}/"))):
                                                        continue
                                                    self.kv_store.save_probe_metrics_by_path(run_id, path, source_metrics)
                                                    found_metrics = True
                                            except Exception:
                                                continue
                                        if found_metrics:
                                            break
                                    except Exception:
                                        continue
                                if not found_metrics:
                                    self.logger.warning(f"⚠️ [METRICS COPY] No source metrics found for {process_name}")
                            else:
                                self.logger.info(f"🔍 [METRICS COPY] Not using GCP store (type={type(self.kv_store).__name__}), skipping metrics copy")
                        except Exception as copy_err:
                            self.logger.warning(f"❌ [METRICS COPY] Failed to copy metrics for {process_name}: {copy_err}")
                            import traceback
                            self.logger.warning(f"Traceback: {traceback.format_exc()}")
                # Note: Step-level probes are now created directly by individual steps
                # with clean paths that match chart configurations exactly
            except Exception as e:
                print(f"❌ Exception in process metrics handling: {e}")
                pass
            # Prefer batched write when backend supports it (e.g., Firestore)
            try:
                # Debug: surface the exact key parts used for process_indices completion write
                self.logger.debug(
                    f"process_indices[complete] key parts -> process={process_name}, ih={input_hash}, ch={config_hash}, fh={function_hash}"
                )
            except Exception:
                pass
            _should_write_index = (not was_cached) or bool(cache_path)
            if _should_write_index:
                if hasattr(self.kv_store, "set_process_cache_record_batched") and callable(getattr(self.kv_store, "set_process_cache_record_batched")):
                    try:
                        getattr(self.kv_store, "set_process_cache_record_batched")(  # type: ignore
                            run_id,
                            process_name,
                            input_hash or "",
                            config_hash or "",
                            function_hash or None,
                            record,
                            ttl_seconds=self.redis_ttl_seconds,
                        )
                    except Exception:
                        self.kv_store.set_process_cache_record(
                            process_name,
                            input_hash or "",
                            config_hash or "",
                            function_hash or None,
                            record,
                            ttl_seconds=self.redis_ttl_seconds,
                        )
                else:
                    self.kv_store.set_process_cache_record(
                        process_name,
                        input_hash or "",
                        config_hash or "",
                        function_hash or None,
                        record,
                        ttl_seconds=self.redis_ttl_seconds,
                    )
            # Per-run process summary record for UI
            try:
                self.kv_store.record_run_step(run_id, process_name, "__process__", dict(record))
            except Exception as record_err:
                self.logger.warning(f"❌ Failed to record __process__ completion for {process_name} in run {run_id}: {record_err}")
                import traceback
                self.logger.debug(f"Traceback: {traceback.format_exc()}")
            self.kv_store.publish_event({
                "type": "process.completed",
                "process": process_name,
                "status": status,
            })
        except Exception as e:
            self.logger.warning(f"kv_store process index/event failed: {e}")

    def can_skip_step(
        self,
        run_id: str,
        step_name: str,
        input_hash: str,
        config_hash: str,
        function_hash: Optional[str] = None,
        process_name: Optional[str] = None,
    ) -> bool:
        """Check if step can be skipped based on hash validation including function hash and process name."""
        if not input_hash or not config_hash:
            return False
        # Fast-path via kv_store exact index only
        try:
            path = self.kv_store.get_step_cache_path(
                process_name or "no_process",
                step_name,
                input_hash,
                config_hash,
                function_hash,
            )
            if path:
                # Support object store URIs and local absolute paths
                if isinstance(path, str) and path.startswith('gs://') and self.object_store:
                    try:
                        return self.object_store.exists(path)
                    except Exception:
                        pass
                else:
                    try:
                        return Path(path).exists()
                    except Exception:
                        pass
        except Exception:
            pass
        return False

    def get_expired_cache_entries(self, step_name: Optional[str] = None) -> List[Dict[str, Any]]:
        """Get expired cache entries for potential recovery or debugging (removed)."""
        return []

    def restore_expired_cache_entry(self, run_id: str, step_name: str) -> bool:
        """Restore an expired cache entry back to 'completed' status (removed)."""
        return False

    def _cleanup_stale_cache_entry(self, step_name: str, cache_path: str) -> None:
        """Deprecated no-op (removed)."""
        return None

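    # --- Editor's illustration (not part of the packaged module) ------------
    # can_skip_step is a pure existence check: it resolves the strict-hash
    # index entry and verifies the artifact is still reachable, e.g.
    # (hypothetical names):
    #   if ssm.can_skip_step("run-1", "fit", input_hash, config_hash,
    #                        function_hash, process_name="train"):
    #       result = ssm.get_cached_step_result("run-1", "fit", "train",
    #                                           input_hash, config_hash, function_hash)
    # -------------------------------------------------------------------------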
    def get_cached_step_result(
        self,
        run_id: str,
        step_name: str,
        process_name: Optional[str] = None,
        input_hash: Optional[str] = None,
        config_hash: Optional[str] = None,
        function_hash: Optional[str] = None,
    ) -> Optional[Dict[str, Any]]:
        """Get cached step result via kv_store strict-hash lookup.

        Only returns a result if the KV index has a valid cache_path entry.
        No fallback to deterministic file paths - cache must be explicitly indexed.
        """
        cache_path = self.kv_store.get_step_cache_path(
            process_name or "no_process",
            step_name,
            input_hash,
            config_hash,
            function_hash,
        )

        loaded = None
        # If index returned a path, load from appropriate backend
        try:
            if cache_path and isinstance(cache_path, str):
                if cache_path.startswith('gs://') and self.object_store:
                    data = self.object_store.get_bytes(cache_path)
                    loaded = joblib.load(io.BytesIO(data))
                else:
                    p = Path(cache_path)
                    if p.exists():
                        loaded = joblib.load(p)
        except Exception as e:
            self.logger.warning(f"Failed to load cached step result from {cache_path}: {e}")

        proc = process_name or "no_process"
        if loaded is not None:
            try:
                self.logger.info(f"[Cache] step hit: {proc}/{step_name}")
            except Exception:
                pass
            return loaded
        else:
            try:
                self.logger.info(f"[Cache] step miss: {proc}/{step_name}")
            except Exception:
                pass
            return None

    def get_cached_step_result_with_metadata(
        self,
        run_id: str,
        step_name: str,
        process_name: Optional[str] = None,
        input_hash: Optional[str] = None,
        config_hash: Optional[str] = None,
        function_hash: Optional[str] = None,
    ) -> Optional[tuple[Dict[str, Any], str, Dict[str, Any]]]:
        """Get cached step result with metadata including cached run-id and timing.

        Returns: (result, cached_run_id, cached_metadata) or None if not cached.
        """
        # First check if cache exists
        cache_path = self.kv_store.get_step_cache_path(
            process_name or "no_process",
            step_name,
            input_hash,
            config_hash,
            function_hash,
        )

        if not cache_path:
            try:
                self.logger.info(f"[Cache] step miss: {(process_name or 'no_process')}/{step_name}")
            except Exception:
                pass
            return None

        # Get the full cache record to extract metadata
        cache_record = self.kv_store.get_step_cache_record(
            process_name or "no_process",
            step_name,
            input_hash,
            config_hash,
            function_hash,
        )

        if not cache_record:
            return None

        # Load the actual cached result
        try:
            if cache_path and isinstance(cache_path, str):
                if cache_path.startswith('gs://') and self.object_store:
                    data = self.object_store.get_bytes(cache_path)
                    result = joblib.load(io.BytesIO(data))
                else:
                    p = Path(cache_path)
                    if p.exists():
                        result = joblib.load(p)
                    else:
                        result = None
            else:
                result = None
        except Exception as e:
            self.logger.warning(f"Failed to load cached step result from {cache_path}: {e}")
            result = None

        proc = process_name or "no_process"
        if result is None:
            try:
                self.logger.info(f"[Cache] step miss: {proc}/{step_name}")
            except Exception:
                pass
            return None

        # Extract metadata
        # Prefer cached_run_id if it exists (points to original run that executed the step)
        # Otherwise use run_id (this record is from the original execution)
        cached_run_id = cache_record.get("cached_run_id") or cache_record.get("run_id", "unknown")
        cached_metadata = {
            "started_at": cache_record.get("cached_started_at") or cache_record.get("started_at"),
            "ended_at": cache_record.get("cached_ended_at") or cache_record.get("ended_at"),
            "execution_time": cache_record.get("cached_execution_time") or cache_record.get("execution_time"),
            "run_id": cached_run_id,
        }

        return (result, cached_run_id, cached_metadata)

    def get_cached_process_result(
        self,
        process_name: str,
        input_hash: Optional[str] = None,
        config_hash: Optional[str] = None,
        function_hash: Optional[str] = None,
        run_id: Optional[str] = None,
    ) -> Optional[Dict[str, Any]]:
        """Get cached process result via kv_store strict-hash lookup.

        Only returns a result if the KV index has a valid cache_path entry.
        No fallback to deterministic file paths - cache must be explicitly indexed.
        """
        cache_path = self.kv_store.get_process_cache_path(
            process_name,
            input_hash,
            config_hash,
            function_hash,
        )

        loaded = None
        load_error = None
        # If index returned a path, load from appropriate backend
        try:
            if cache_path and isinstance(cache_path, str):
                if cache_path.startswith('gs://') and self.object_store:
                    data = self.object_store.get_bytes(cache_path)
                    loaded = joblib.load(io.BytesIO(data))
                else:
                    p = Path(cache_path)
                    if p.exists():
                        loaded = joblib.load(p)
                    else:
                        load_error = f"Cache file not found at {cache_path}"
            else:
                load_error = "Invalid cache path format"
        except Exception as e:
            load_error = str(e)
            self.logger.warning(f"Failed to load cached result for {process_name}: {e}")

        if loaded is not None:
            try:
                self.logger.info(f"[Cache] process hit: {process_name}")
            except Exception:
                pass
            return loaded
        else:
            # Cache entry exists but file couldn't be loaded - treat as miss
            if load_error:
                self.logger.warning(f"⚠️ [CACHE] Stale cache entry for {process_name}: cache index exists but file load failed ({load_error}). Treating as cache miss.")
            try:
                self.logger.info(f"[Cache] process miss: {process_name}")
            except Exception:
                pass
            return None

    def get_cached_process_result_with_metadata(
        self,
        process_name: str,
        input_hash: Optional[str] = None,
        config_hash: Optional[str] = None,
        function_hash: Optional[str] = None,
        run_id: Optional[str] = None,
    ) -> Optional[tuple[Dict[str, Any], str, Dict[str, Any]]]:
        """Get cached process result with metadata including cached run-id and timing.

        Returns: (result, cached_run_id, cached_metadata) or None if not cached.

        If cache path exists but file cannot be loaded, logs a warning and returns None
        to allow fallback to execution. The stale cache entry remains in the index
        but won't be used until it's overwritten by a successful execution.
        """
        # First check if cache exists
        cache_path = self.kv_store.get_process_cache_path(
            process_name,
            input_hash,
            config_hash,
            function_hash,
        )

        if not cache_path:
            try:
                self.logger.info(f"[Cache] process miss: {process_name}")
            except Exception:
                pass
            return None

        # Get the full cache record to extract metadata
        cache_record = self.kv_store.get_process_cache_record(
            process_name,
            input_hash,
            config_hash,
            function_hash,
        )

        if not cache_record:
            return None

        # Load the actual cached result
        load_error = None
        try:
            if cache_path and isinstance(cache_path, str):
                if cache_path.startswith('gs://') and self.object_store:
                    data = self.object_store.get_bytes(cache_path)
                    result = joblib.load(io.BytesIO(data))
                else:
                    p = Path(cache_path)
                    if p.exists():
                        result = joblib.load(p)
                    else:
                        result = None
                        load_error = f"Cache file not found at {cache_path}"
            else:
                result = None
                load_error = "Invalid cache path format"
        except Exception as e:
            load_error = str(e)
            result = None

        # Extract metadata
        # Prefer cached_run_id if it exists (points to original run that executed the process)
        # Otherwise use run_id (this record is from the original execution)
        cached_run_id = cache_record.get("cached_run_id") or cache_record.get("run_id", "unknown")
        cached_metadata = {
            "started_at": cache_record.get("cached_started_at") or cache_record.get("started_at"),
            "ended_at": cache_record.get("cached_ended_at") or cache_record.get("ended_at"),
            "execution_time": cache_record.get("cached_execution_time") or cache_record.get("execution_time"),
            "run_id": cached_run_id,
        }

        if result is not None:
            try:
                self.logger.info(f"[Cache] process hit: {process_name}")
            except Exception:
                pass
            return (result, cached_run_id, cached_metadata)
        else:
            # Cache entry exists but file couldn't be loaded - treat as miss
            # This can happen if cache files were deleted or corrupted
            try:
                if load_error:
                    self.logger.warning(f"⚠️ [CACHE] Stale cache entry for {process_name}: cache index exists but file load failed ({load_error}). Treating as cache miss and will re-execute.")
                else:
                    self.logger.info(f"[Cache] process miss: {process_name}")
            except Exception:
                pass
            return None

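    # --- Editor's illustration (not part of the packaged module) ------------
    # Consuming the metadata-aware lookup above (hypothetical names/values):
    #   hit = ssm.get_cached_process_result_with_metadata("train", ih, ch, fh)
    #   if hit is not None:
    #       result, cached_run_id, meta = hit
    #       # meta -> {"started_at": ..., "ended_at": ..., "execution_time": ...,
    #       #          "run_id": cached_run_id}
    # -------------------------------------------------------------------------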
    def load_process_result_from_path(self, cache_path: str) -> Optional[Dict[str, Any]]:
        """Load a process result dictionary directly from a known cache path or object URI.

        Best-effort; returns None on any failure.
        """
        try:
            if not cache_path or not isinstance(cache_path, str):
                return None
            if cache_path.startswith('gs://') and self.object_store:
                data = self.object_store.get_bytes(cache_path)
                return joblib.load(io.BytesIO(data))
            p = Path(cache_path)
            if p.exists():
                return joblib.load(p)
        except Exception:
            return None

    def get_cache_statistics(self) -> Dict[str, Any]:
        """Get cache statistics."""
        cache_files = list(self.cache_dir.glob("*.pkl"))
        total_size = sum(f.stat().st_size for f in cache_files if f.exists())
        return {
            'total_cache_files': len(cache_files),
            'total_cache_size_mb': total_size / (1024**2),
            'cache_directory': str(self.cache_dir)
        }

    def complete_pipeline_execution(self, run_id: str, success: bool) -> None:
        """Record pipeline completion."""
        status = "completed" if success else "failed"
        try:
            self.kv_store.mark_pipeline_completed(run_id, success)
        except Exception as e:
            self.logger.warning(f"kv_store pipeline completion failed: {e}")

    def get_step_results(self, run_id: str) -> Dict[str, Dict[str, Any]]:
        """Get all step results for a run."""
        results = {}
        try:
            step_records = self.kv_store.list_run_steps(run_id)
            for key, rec in step_records.items():
                cache_path = rec.get("cache_path")
                if cache_path:
                    try:
                        if isinstance(cache_path, str) and cache_path.startswith('gs://') and self.object_store:
                            data = self.object_store.get_bytes(cache_path)
                            result = joblib.load(io.BytesIO(data))
                        else:
                            result = joblib.load(Path(cache_path))
                        step_name = key.split(".", 1)[-1]
                        results[step_name] = result
                    except Exception as e:
                        self.logger.warning(f"Failed to load result for {key}: {e}")
        except Exception as e:
            self.logger.warning(f"kv_store list_run_steps failed: {e}")
        return results

    def can_resume_from_step(self, run_id: str, step_name: str, config_hash: str) -> bool:
        """Check if pipeline can be resumed (not supported without persistent status)."""
        return False

    def get_pipeline_stats(self, run_id: str) -> Dict[str, Any]:
        """Get pipeline statistics."""
        try:
            return self.kv_store.get_pipeline_stats(run_id)
        except Exception:
            return {}

    def _compute_config_hash(self, config: Dict[str, Any]) -> str:
        """Compute a hash of the step graph configuration for change detection."""
        return self._compute_hash(config)
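
# --- Editor's illustration (not part of the packaged module) ----------------
# Example of the summary returned by get_cache_statistics (hypothetical run):
#   {"total_cache_files": 12, "total_cache_size_mb": 48.7,
#    "cache_directory": "/tmp/mlops-cache"}
# -----------------------------------------------------------------------------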