flowyml-1.4.0-py3-none-any.whl → flowyml-1.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowyml/core/execution_status.py +1 -0
- flowyml/core/executor.py +175 -3
- flowyml/storage/sql.py +53 -13
- flowyml/ui/backend/main.py +2 -0
- flowyml/ui/backend/routers/assets.py +36 -0
- flowyml/ui/backend/routers/execution.py +2 -2
- flowyml/ui/backend/routers/runs.py +211 -0
- flowyml/ui/backend/routers/stats.py +2 -2
- flowyml/ui/backend/routers/websocket.py +121 -0
- flowyml/ui/frontend/dist/assets/index-CBUXOWze.css +1 -0
- flowyml/ui/frontend/dist/assets/index-DF8dJaFL.js +629 -0
- flowyml/ui/frontend/dist/index.html +2 -2
- flowyml/ui/frontend/package-lock.json +289 -0
- flowyml/ui/frontend/package.json +1 -0
- flowyml/ui/frontend/src/app/compare/page.jsx +213 -0
- flowyml/ui/frontend/src/app/experiments/compare/page.jsx +289 -0
- flowyml/ui/frontend/src/app/experiments/page.jsx +61 -1
- flowyml/ui/frontend/src/app/runs/[runId]/page.jsx +418 -203
- flowyml/ui/frontend/src/app/runs/page.jsx +64 -3
- flowyml/ui/frontend/src/app/settings/page.jsx +1 -1
- flowyml/ui/frontend/src/app/tokens/page.jsx +8 -6
- flowyml/ui/frontend/src/components/ArtifactViewer.jsx +159 -0
- flowyml/ui/frontend/src/components/NavigationTree.jsx +26 -9
- flowyml/ui/frontend/src/components/PipelineGraph.jsx +26 -24
- flowyml/ui/frontend/src/components/RunDetailsPanel.jsx +42 -14
- flowyml/ui/frontend/src/router/index.jsx +4 -0
- {flowyml-1.4.0.dist-info → flowyml-1.5.0.dist-info}/METADATA +1 -1
- {flowyml-1.4.0.dist-info → flowyml-1.5.0.dist-info}/RECORD +31 -27
- flowyml/ui/frontend/dist/assets/index-DcYwrn2j.css +0 -1
- flowyml/ui/frontend/dist/assets/index-Dlz_ygOL.js +0 -592
- {flowyml-1.4.0.dist-info → flowyml-1.5.0.dist-info}/WHEEL +0 -0
- {flowyml-1.4.0.dist-info → flowyml-1.5.0.dist-info}/entry_points.txt +0 -0
- {flowyml-1.4.0.dist-info → flowyml-1.5.0.dist-info}/licenses/LICENSE +0 -0
flowyml/core/execution_status.py
CHANGED
flowyml/core/executor.py
CHANGED
@@ -7,6 +7,133 @@ from typing import Any
 from dataclasses import dataclass
 from datetime import datetime
 
+import threading
+import ctypes
+import requests
+import os
+import inspect
+
+
+class StopExecutionError(Exception):
+    """Exception raised when execution is stopped externally."""
+
+    pass
+
+
+# Alias for backwards compatibility
+StopExecution = StopExecutionError
+
+
+def _async_raise(tid, exctype):
+    """Raises an exception in the threads with id tid"""
+    if not inspect.isclass(exctype):
+        raise TypeError("Only types can be raised (not instances)")
+    res = ctypes.pythonapi.PyThreadState_SetAsyncExc(ctypes.c_long(tid), ctypes.py_object(exctype))
+    if res == 0:
+        raise ValueError("invalid thread id")
+    if res != 1:
+        # """if it returns a number greater than one, you're in trouble,
+        # and you should call it again with exc=NULL to revert the effect"""
+        ctypes.pythonapi.PyThreadState_SetAsyncExc(ctypes.c_long(tid), None)
+        raise SystemError("PyThreadState_SetAsyncExc failed")
+
+
+class LogCapture:
+    """Context manager to capture stdout/stderr for streaming to the server."""
+
+    def __init__(self):
+        self._buffer = []
+        self._lock = threading.Lock()
+
+    def write(self, text):
+        if text.strip():
+            with self._lock:
+                self._buffer.append(text)
+
+    def flush(self):
+        pass
+
+    def get_and_clear(self) -> list[str]:
+        with self._lock:
+            lines = self._buffer[:]
+            self._buffer.clear()
+            return lines
+
+
+class MonitorThread(threading.Thread):
+    """Background thread that sends heartbeats and flushes logs to the server."""
+
+    def __init__(
+        self,
+        run_id: str,
+        step_name: str,
+        target_tid: int,
+        log_capture: LogCapture | None = None,
+        interval: int = 5,
+    ):
+        super().__init__()
+        self.run_id = run_id
+        self.step_name = step_name
+        self.target_tid = target_tid
+        self.log_capture = log_capture
+        self.interval = interval
+        self._stop_event = threading.Event()
+        self.api_url = os.getenv("FLOWYML_SERVER_URL", "http://localhost:8000")
+
+    def stop(self):
+        self._stop_event.set()
+
+    def _flush_logs(self):
+        """Send captured logs to the server."""
+        if not self.log_capture:
+            return
+
+        lines = self.log_capture.get_and_clear()
+        if not lines:
+            return
+
+        content = "".join(lines)
+        with contextlib.suppress(Exception):
+            requests.post(
+                f"{self.api_url}/api/runs/{self.run_id}/steps/{self.step_name}/logs",
+                json={
+                    "content": content,
+                    "level": "INFO",
+                    "timestamp": datetime.now().isoformat(),
+                },
+                timeout=2,
+            )
+
+    def run(self):
+        while not self._stop_event.is_set():
+            try:
+                # Send heartbeat
+                response = requests.post(
+                    f"{self.api_url}/api/runs/{self.run_id}/steps/{self.step_name}/heartbeat",
+                    json={"step_name": self.step_name, "status": "running"},
+                    timeout=2,
+                )
+                if response.status_code == 200:
+                    data = response.json()
+                    if data.get("action") == "stop":
+                        print(f"Received stop signal for step {self.step_name}")
+                        _async_raise(self.target_tid, StopExecution)
+                        break
+            except Exception:
+                pass  # Ignore heartbeat failures
+
+            # Flush logs
+            self._flush_logs()
+
+            self._stop_event.wait(self.interval)
+
+        # Final log flush
+        self._flush_logs()
+
+
+# Keep HeartbeatThread as an alias for backwards compatibility
+HeartbeatThread = MonitorThread
+
 
 @dataclass
 class ExecutionResult:
@@ -103,8 +230,6 @@ class LocalExecutor(Executor):
         # or just pass what we can.
         # A simple approach: pass nothing if it takes no args, or kwargs if it does.
        # But inspect is safer.
-        import inspect
-
         sig = inspect.signature(step.condition)
         kwargs = {**inputs, **context_params}
 
@@ -157,7 +282,54 @@ class LocalExecutor(Executor):
         kwargs = {**inputs, **context_params}
 
         # Execute step
-
+        monitor_thread = None
+        log_capture = None
+        original_stdout = None
+        original_stderr = None
+        try:
+            # Start monitoring thread with log capture if run_id is present
+            if run_id:
+                import sys
+
+                log_capture = LogCapture()
+                original_stdout = sys.stdout
+                original_stderr = sys.stderr
+                sys.stdout = log_capture
+                sys.stderr = log_capture
+
+                monitor_thread = MonitorThread(
+                    run_id=run_id,
+                    step_name=step.name,
+                    target_tid=threading.get_ident(),
+                    log_capture=log_capture,
+                )
+                monitor_thread.start()
+
+            result = step.func(**kwargs)
+        except StopExecution:
+            duration = time.time() - start_time
+            return ExecutionResult(
+                step_name=step.name,
+                success=False,
+                error="Execution stopped by user",
+                duration_seconds=duration,
+                retries=retries,
+            )
+        finally:
+            # Restore stdout/stderr
+            if original_stdout:
+                import sys
+
+                sys.stdout = original_stdout
+            if original_stderr:
+                import sys
+
+                sys.stderr = original_stderr
+
+            # Stop monitor thread
+            if monitor_thread:
+                monitor_thread.stop()
+                monitor_thread.join()
 
         # Materialize output if artifact store is available
         artifact_uri = None
flowyml/storage/sql.py
CHANGED
@@ -884,26 +884,66 @@ class SQLMetadataStore(MetadataStore):
             "period_days": days,
         }
 
-    def get_statistics(self) -> dict:
+    def get_statistics(self, project: str | None = None) -> dict:
         """Get global statistics."""
         with self.engine.connect() as conn:
-            # Total runs
-
+            # 1. Total runs
+            runs_stmt = select(func.count()).select_from(self.runs)
+            if project:
+                runs_stmt = runs_stmt.where(self.runs.c.project == project)
+            total_runs = conn.execute(runs_stmt).scalar() or 0
 
-            # Total pipelines
-
-
-
+            # 2. Total pipelines (unique names)
+            pipelines_stmt = select(func.count(func.distinct(self.runs.c.pipeline_name)))
+            if project:
+                pipelines_stmt = pipelines_stmt.where(self.runs.c.project == project)
+            total_pipelines = conn.execute(pipelines_stmt).scalar() or 0
 
-            # Total
-
+            # 3. Total artifacts
+            artifacts_stmt = select(func.count()).select_from(self.artifacts)
+            if project:
+                artifacts_stmt = artifacts_stmt.where(self.artifacts.c.project == project)
+            total_artifacts = conn.execute(artifacts_stmt).scalar() or 0
 
-            # Total
-
-
-
+            # 4. Total experiments
+            experiments_stmt = select(func.count()).select_from(self.experiments)
+            if project:
+                experiments_stmt = experiments_stmt.where(self.experiments.c.project == project)
+            total_experiments = conn.execute(experiments_stmt).scalar() or 0
+
+            # 5. Total models
+            models_stmt = select(func.count(func.distinct(self.model_metrics.c.model_name)))
+            if project:
+                models_stmt = models_stmt.where(self.model_metrics.c.project == project)
+            total_models = conn.execute(models_stmt).scalar() or 0
+
+            # 6. Status counts (completed vs failed)
+            status_stmt = select(self.runs.c.status, func.count()).group_by(self.runs.c.status)
+            if project:
+                status_stmt = status_stmt.where(self.runs.c.project == project)
+
+            status_rows = conn.execute(status_stmt).fetchall()
+            status_map = {row[0]: row[1] for row in status_rows if row[0]}
+
+            completed_runs = status_map.get("completed", 0)
+            failed_runs = status_map.get("failed", 0)
+
+            # 7. Avg duration (only completed runs)
+            dur_stmt = select(func.avg(self.runs.c.duration)).where(self.runs.c.status == "completed")
+            if project:
+                dur_stmt = dur_stmt.where(self.runs.c.project == project)
+
+            avg_duration = conn.execute(dur_stmt).scalar() or 0.0
 
         return {
+            # Frontend-friendly keys
+            "pipelines": total_pipelines,
+            "runs": total_runs,
+            "artifacts": total_artifacts,
+            "completed_runs": completed_runs,
+            "failed_runs": failed_runs,
+            "avg_duration": avg_duration,
+            # Backward compatibility
             "total_runs": total_runs,
             "total_pipelines": total_pipelines,
             "total_experiments": total_experiments,
flowyml/ui/backend/main.py
CHANGED
@@ -24,6 +24,7 @@ from flowyml.ui.backend.routers import (
     metrics,
     client,
     stats,
+    websocket,
 )
 
 app = FastAPI(
@@ -77,6 +78,7 @@ app.include_router(metrics.router, prefix="/api/metrics", tags=["metrics"])
 app.include_router(plugins.router, prefix="/api", tags=["plugins"])
 app.include_router(client.router, prefix="/api/client", tags=["client"])
 app.include_router(stats.router, prefix="/api/stats", tags=["stats"])
+app.include_router(websocket.router, tags=["websocket"])
 
 
 # Static file serving for frontend
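The websocket router registered here is new in 1.5.0 (flowyml/ui/backend/routers/websocket.py, +121 lines) but its body is not part of the hunks shown; the only visible contract is the manager.broadcast_log(run_id, step_name, content) call made from runs.py. A rough sketch of what such a router could look like, where the endpoint path and the manager internals are assumptions rather than the actual implementation:

from fastapi import APIRouter, WebSocket, WebSocketDisconnect

router = APIRouter()


class ConnectionManager:
    """Tracks websocket clients per run and pushes log lines to them."""

    def __init__(self):
        # Map run_id -> websockets currently watching that run's logs.
        self.connections: dict[str, list[WebSocket]] = {}

    async def connect(self, run_id: str, websocket: WebSocket) -> None:
        await websocket.accept()
        self.connections.setdefault(run_id, []).append(websocket)

    def disconnect(self, run_id: str, websocket: WebSocket) -> None:
        if websocket in self.connections.get(run_id, []):
            self.connections[run_id].remove(websocket)

    async def broadcast_log(self, run_id: str, step_name: str, content: str) -> None:
        # Matches the call shape used in runs.py: broadcast_log(run_id, step_name, content).
        for ws in self.connections.get(run_id, []):
            await ws.send_json({"step": step_name, "content": content})


manager = ConnectionManager()


@router.websocket("/ws/runs/{run_id}/logs")  # hypothetical path, not confirmed by this diff
async def run_logs(websocket: WebSocket, run_id: str):
    await manager.connect(run_id, websocket)
    try:
        while True:
            await websocket.receive_text()  # keep the connection open
    except WebSocketDisconnect:
        manager.disconnect(run_id, websocket)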
flowyml/ui/backend/routers/assets.py
CHANGED
@@ -476,6 +476,42 @@ async def download_asset(artifact_id: str):
     )
 
 
+@router.get("/{artifact_id}/content")
+async def get_asset_content(artifact_id: str):
+    """Get the artifact content for inline viewing."""
+    import mimetypes
+
+    asset, _ = _find_asset_with_store(artifact_id)
+    if not asset:
+        raise HTTPException(status_code=404, detail="Asset not found")
+
+    artifact_path = asset.get("path")
+    if not artifact_path:
+        raise HTTPException(status_code=404, detail="Artifact path not available")
+
+    # Handle relative paths for local store
+    from flowyml.utils.config import get_config
+
+    config = get_config()
+
+    file_path = Path(artifact_path)
+    if not file_path.is_absolute():
+        file_path = config.artifacts_dir / file_path
+
+    if not file_path.exists():
+        raise HTTPException(status_code=404, detail="Artifact file not found on disk")
+
+    # Guess mime type
+    mime_type, _ = mimetypes.guess_type(file_path.name)
+    if not mime_type:
+        mime_type = "text/plain"  # Default fallback
+
+    return FileResponse(
+        path=file_path,
+        media_type=mime_type,
+    )
+
+
 class ProjectUpdate(BaseModel):
     project_name: str
 
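From a client's point of view, the new endpoint serves the raw artifact bytes with a server-side guessed MIME type, which is what the frontend's ArtifactViewer uses for inline previews. A hedged usage sketch (the /api/assets mount prefix and the artifact id are placeholders, not confirmed by this diff):

import requests

base_url = "http://localhost:8000"   # assumption: local flowyml UI backend
artifact_id = "<artifact-id>"        # placeholder

resp = requests.get(f"{base_url}/api/assets/{artifact_id}/content", timeout=5)
resp.raise_for_status()
print(resp.headers.get("content-type"))  # MIME type guessed via mimetypes on the server
print(resp.content[:200])                # first bytes of the artifact (text or binary)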
flowyml/ui/backend/routers/execution.py
CHANGED
@@ -97,10 +97,10 @@ async def execute_pipeline(
     run_kwargs = request.parameters.copy()
 
     if request.retry_count > 0:
-        from flowyml.core.
+        from flowyml.core.retry_policy import OrchestratorRetryPolicy
 
         run_kwargs["retry_policy"] = OrchestratorRetryPolicy(
-
+            max_attempts=min(request.retry_count, 5),  # Cap at 5
         )
 
     result = pipeline.run(**run_kwargs)
flowyml/ui/backend/routers/runs.py
CHANGED
@@ -146,6 +146,24 @@ async def get_run(run_id: str):
     run, _ = _find_run(run_id)
     if not run:
         raise HTTPException(status_code=404, detail="Run not found")
+
+    # Mark dead steps
+    dead_steps = _get_dead_steps(run_id)
+    if dead_steps and "steps" in run:
+        for step_name in dead_steps:
+            if step_name in run["steps"]:
+                # Only mark as dead if it was running
+                if run["steps"][step_name].get("status") == "running":
+                    run["steps"][step_name]["status"] = "dead"
+                    run["steps"][step_name]["success"] = False
+
+    # Inject heartbeat timestamps
+    with _heartbeat_lock:
+        if run_id in _heartbeat_timestamps:
+            for step_name, ts in _heartbeat_timestamps[run_id].items():
+                if step_name in run.get("steps", {}):
+                    run["steps"][step_name]["last_heartbeat"] = ts
+
     return run
 
 
@@ -273,3 +291,196 @@ async def get_cloud_status(run_id: str):
         "cloud_status": cloud_status,
         "cloud_error": cloud_error,
     }
+
+
+class HeartbeatRequest(BaseModel):
+    step_name: str
+    status: str = "running"
+
+
+# In-memory storage for heartbeat timestamps
+# Format: {run_id: {step_name: last_heartbeat_timestamp}}
+_heartbeat_timestamps: dict[str, dict[str, float]] = {}
+_heartbeat_lock = __import__("threading").Lock()
+
+# Heartbeat interval in seconds (should match executor's interval)
+HEARTBEAT_INTERVAL = 5
+# Number of missed heartbeats before marking step as dead
+DEAD_THRESHOLD = 3
+
+
+def _record_heartbeat(run_id: str, step_name: str) -> None:
+    """Record heartbeat timestamp for a step."""
+    import time
+
+    with _heartbeat_lock:
+        if run_id not in _heartbeat_timestamps:
+            _heartbeat_timestamps[run_id] = {}
+        _heartbeat_timestamps[run_id][step_name] = time.time()
+
+
+def _get_dead_steps(run_id: str) -> list[str]:
+    """Get list of steps that have missed too many heartbeats."""
+    import time
+
+    dead_steps = []
+    timeout = HEARTBEAT_INTERVAL * DEAD_THRESHOLD
+
+    with _heartbeat_lock:
+        if run_id not in _heartbeat_timestamps:
+            return []
+
+        current_time = time.time()
+        for step_name, last_heartbeat in _heartbeat_timestamps[run_id].items():
+            if current_time - last_heartbeat > timeout:
+                dead_steps.append(step_name)
+
+    return dead_steps
+
+
+def _cleanup_heartbeats(run_id: str) -> None:
+    """Remove heartbeat tracking for a completed run."""
+    with _heartbeat_lock:
+        _heartbeat_timestamps.pop(run_id, None)
+
+
+@router.post("/{run_id}/steps/{step_name}/heartbeat")
+async def step_heartbeat(run_id: str, step_name: str, heartbeat: HeartbeatRequest):
+    """Receive heartbeat from a running step.
+
+    Returns:
+        dict: Instructions for the step (e.g., {"action": "continue"} or {"action": "stop"})
+    """
+    store = _find_store_for_run(run_id)
+
+    # Record heartbeat timestamp
+    _record_heartbeat(run_id, step_name)
+
+    # Check if run is marked for stopping
+    run = store.load_run(run_id)
+    if not run:
+        raise HTTPException(status_code=404, detail="Run not found")
+
+    run_status = run.get("status")
+    if run_status in ["stopping", "stopped", "cancelled", "cancelling"]:
+        return {"action": "stop"}
+
+    return {"action": "continue"}
+
+
+@router.get("/{run_id}/dead-steps")
+async def get_dead_steps(run_id: str):
+    """Get list of steps that appear to be dead (missed heartbeats)."""
+    dead_steps = _get_dead_steps(run_id)
+    return {"dead_steps": dead_steps}
+
+
+@router.post("/{run_id}/stop")
+async def stop_run(run_id: str):
+    """Signal a run to stop."""
+    store = _find_store_for_run(run_id)
+
+    try:
+        # Update run status to STOPPING
+        # This will be picked up by the next heartbeat
+        store.update_run_status(run_id, "stopping")
+        return {"status": "success", "message": "Stop signal sent"}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+class LogChunk(BaseModel):
+    content: str
+    level: str = "INFO"
+    timestamp: str | None = None
+
+
+@router.post("/{run_id}/steps/{step_name}/logs")
+async def post_step_logs(run_id: str, step_name: str, log_chunk: LogChunk):
+    """Receive log chunk from a running step."""
+    import anyio
+
+    from flowyml.utils.config import get_config
+
+    # Store logs in the runs directory
+    runs_dir = get_config().runs_dir
+    log_dir = runs_dir / run_id / "logs"
+    log_dir.mkdir(parents=True, exist_ok=True)
+
+    log_file = log_dir / f"{step_name}.log"
+
+    # Append log content
+    timestamp = log_chunk.timestamp or ""
+    line = f"[{timestamp}] [{log_chunk.level}] {log_chunk.content}\n"
+
+    def write_log():
+        with open(log_file, "a") as f:
+            f.write(line)
+
+    await anyio.to_thread.run_sync(write_log)
+
+    # Broadcast to WebSocket clients
+    try:
+        from flowyml.ui.backend.routers.websocket import manager
+
+        await manager.broadcast_log(run_id, step_name, log_chunk.content)
+    except Exception:
+        pass  # Ignore WebSocket broadcast failures
+
+    return {"status": "success"}
+
+
+@router.get("/{run_id}/steps/{step_name}/logs")
+async def get_step_logs(run_id: str, step_name: str, offset: int = 0):
+    """Get logs for a specific step."""
+    import anyio
+
+    from flowyml.utils.config import get_config
+
+    runs_dir = get_config().runs_dir
+    log_file = runs_dir / run_id / "logs" / f"{step_name}.log"
+
+    if not log_file.exists():
+        return {"logs": "", "offset": 0, "has_more": False}
+
+    def read_log():
+        with open(log_file) as f:
+            return f.read()
+
+    content = await anyio.to_thread.run_sync(read_log)
+
+    # Return content from offset
+    if offset > 0 and offset < len(content):
+        content = content[offset:]
+
+    return {
+        "logs": content,
+        "offset": offset + len(content),
+        "has_more": False,  # For now, always return all available
+    }
+
+
+@router.get("/{run_id}/logs")
+async def get_run_logs(run_id: str):
+    """Get all logs for a run."""
+    import anyio
+
+    from flowyml.utils.config import get_config
+
+    runs_dir = get_config().runs_dir
+    log_dir = runs_dir / run_id / "logs"
+
+    if not log_dir.exists():
+        return {"logs": {}}
+
+    def read_all_logs():
+        logs = {}
+        for log_file in log_dir.glob("*.log"):
+            step_name = log_file.stem
+            with open(log_file) as f:
+                logs[step_name] = f.read()
+        return logs
+
+    logs = await anyio.to_thread.run_sync(read_all_logs)
+
+    return {"logs": logs}
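Taken together with the executor changes, these endpoints implement a poll-based stop protocol: the worker posts a heartbeat every few seconds and aborts when the reply is {"action": "stop"}, which happens once someone has POSTed to /{run_id}/stop. A sketch of that handshake from the worker side, using the /api/runs prefix that the executor diff itself targets (base URL, run id and step name are placeholders):

import time

import requests

base_url = "http://localhost:8000"   # assumption: local flowyml UI backend
run_id = "<run-id>"
step_name = "<step-name>"

# Elsewhere (e.g. from the UI), a stop signal would be sent with:
# requests.post(f"{base_url}/api/runs/{run_id}/stop", timeout=2)

for _ in range(60):  # bounded here so the sketch terminates
    resp = requests.post(
        f"{base_url}/api/runs/{run_id}/steps/{step_name}/heartbeat",
        json={"step_name": step_name, "status": "running"},
        timeout=2,
    )
    if resp.ok and resp.json().get("action") == "stop":
        print("server asked this step to stop")
        break
    time.sleep(5)  # matches HEARTBEAT_INTERVAL on the server side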
flowyml/ui/backend/routers/stats.py
CHANGED
@@ -5,10 +5,10 @@ router = APIRouter()
 
 
 @router.get("/")
-async def get_global_stats():
+async def get_global_stats(project: str | None = None):
     """Get global statistics."""
     try:
         store = get_store()
-        return store.get_statistics()
+        return store.get_statistics(project=project)
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))