pyoco 0.3.0__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyoco/cli/main.py +182 -23
- pyoco/client.py +29 -9
- pyoco/core/context.py +81 -1
- pyoco/core/engine.py +182 -3
- pyoco/core/exceptions.py +15 -0
- pyoco/core/models.py +130 -1
- pyoco/discovery/loader.py +32 -1
- pyoco/discovery/plugins.py +148 -0
- pyoco/dsl/expressions.py +160 -0
- pyoco/dsl/nodes.py +56 -0
- pyoco/dsl/syntax.py +241 -95
- pyoco/dsl/validator.py +104 -0
- pyoco/server/api.py +59 -18
- pyoco/server/metrics.py +113 -0
- pyoco/server/models.py +2 -0
- pyoco/server/store.py +153 -16
- pyoco/server/webhook.py +108 -0
- pyoco/socketless_reset.py +7 -0
- pyoco/worker/runner.py +3 -8
- {pyoco-0.3.0.dist-info → pyoco-0.5.1.dist-info}/METADATA +16 -1
- pyoco-0.5.1.dist-info/RECORD +33 -0
- pyoco-0.3.0.dist-info/RECORD +0 -25
- {pyoco-0.3.0.dist-info → pyoco-0.5.1.dist-info}/WHEEL +0 -0
- {pyoco-0.3.0.dist-info → pyoco-0.5.1.dist-info}/top_level.txt +0 -0
pyoco/server/api.py
CHANGED
|
@@ -1,37 +1,43 @@
|
|
|
1
|
-
from fastapi import FastAPI, HTTPException
|
|
2
|
-
from
|
|
1
|
+
from fastapi import FastAPI, HTTPException, Query
|
|
2
|
+
from fastapi.responses import PlainTextResponse
|
|
3
|
+
from typing import List, Optional, Dict, Any
|
|
3
4
|
from .store import StateStore
|
|
4
5
|
from .models import (
|
|
5
|
-
RunSubmitRequest, RunResponse,
|
|
6
|
+
RunSubmitRequest, RunResponse,
|
|
6
7
|
WorkerPollRequest, WorkerPollResponse,
|
|
7
8
|
WorkerHeartbeatRequest, WorkerHeartbeatResponse
|
|
8
9
|
)
|
|
9
|
-
from ..core.models import
|
|
10
|
+
from ..core.models import RunStatus
|
|
11
|
+
from .metrics import metrics, metrics_content_type
|
|
10
12
|
|
|
11
13
|
app = FastAPI(title="Pyoco Kanban Server")
|
|
12
14
|
store = StateStore()
|
|
13
15
|
|
|
14
16
|
@app.post("/runs", response_model=RunResponse)
|
|
15
|
-
def submit_run(req: RunSubmitRequest):
|
|
17
|
+
async def submit_run(req: RunSubmitRequest):
|
|
16
18
|
run_ctx = store.create_run(req.flow_name, req.params)
|
|
17
19
|
return RunResponse(run_id=run_ctx.run_id, status=run_ctx.status)
|
|
18
20
|
|
|
19
|
-
@app.get("/runs"
|
|
20
|
-
def list_runs(
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
21
|
+
@app.get("/runs")
|
|
22
|
+
async def list_runs(
|
|
23
|
+
status: Optional[str] = None,
|
|
24
|
+
flow: Optional[str] = None,
|
|
25
|
+
limit: Optional[int] = Query(default=None, ge=1, le=200),
|
|
26
|
+
):
|
|
27
|
+
status_enum = _parse_status(status)
|
|
28
|
+
limit_value = limit if isinstance(limit, int) else None
|
|
29
|
+
runs = store.list_runs(status=status_enum, flow=flow, limit=limit_value)
|
|
30
|
+
return [store.export_run(r) for r in runs]
|
|
25
31
|
|
|
26
|
-
@app.get("/runs/{run_id}"
|
|
27
|
-
def get_run(run_id: str):
|
|
32
|
+
@app.get("/runs/{run_id}")
|
|
33
|
+
async def get_run(run_id: str):
|
|
28
34
|
run = store.get_run(run_id)
|
|
29
35
|
if not run:
|
|
30
36
|
raise HTTPException(status_code=404, detail="Run not found")
|
|
31
|
-
return run
|
|
37
|
+
return store.export_run(run)
|
|
32
38
|
|
|
33
39
|
@app.post("/runs/{run_id}/cancel")
|
|
34
|
-
def cancel_run(run_id: str):
|
|
40
|
+
async def cancel_run(run_id: str):
|
|
35
41
|
run = store.get_run(run_id)
|
|
36
42
|
if not run:
|
|
37
43
|
raise HTTPException(status_code=404, detail="Run not found")
|
|
@@ -39,7 +45,7 @@ def cancel_run(run_id: str):
|
|
|
39
45
|
return {"status": "CANCELLING"}
|
|
40
46
|
|
|
41
47
|
@app.post("/workers/poll", response_model=WorkerPollResponse)
|
|
42
|
-
def poll_work(req: WorkerPollRequest):
|
|
48
|
+
async def poll_work(req: WorkerPollRequest):
|
|
43
49
|
# In v0.3.0, we ignore worker_id and tags for simplicity
|
|
44
50
|
run = store.dequeue()
|
|
45
51
|
if run:
|
|
@@ -58,14 +64,49 @@ def poll_work(req: WorkerPollRequest):
|
|
|
58
64
|
return WorkerPollResponse()
|
|
59
65
|
|
|
60
66
|
@app.post("/runs/{run_id}/heartbeat", response_model=WorkerHeartbeatResponse)
|
|
61
|
-
def heartbeat(run_id: str, req: WorkerHeartbeatRequest):
|
|
67
|
+
async def heartbeat(run_id: str, req: WorkerHeartbeatRequest):
|
|
62
68
|
run = store.get_run(run_id)
|
|
63
69
|
if not run:
|
|
64
70
|
raise HTTPException(status_code=404, detail="Run not found")
|
|
65
71
|
|
|
66
|
-
store.update_run(
|
|
72
|
+
store.update_run(
|
|
73
|
+
run_id,
|
|
74
|
+
status=req.run_status,
|
|
75
|
+
task_states=req.task_states,
|
|
76
|
+
task_records=req.task_records,
|
|
77
|
+
logs=req.logs
|
|
78
|
+
)
|
|
67
79
|
|
|
68
80
|
# Check if cancellation was requested
|
|
69
81
|
cancel_requested = (run.status == RunStatus.CANCELLING)
|
|
70
82
|
|
|
71
83
|
return WorkerHeartbeatResponse(cancel_requested=cancel_requested)
|
|
84
|
+
|
|
85
|
+
@app.get("/runs/{run_id}/logs")
|
|
86
|
+
async def get_logs(run_id: str, task: Optional[str] = None, tail: Optional[int] = None):
|
|
87
|
+
run = store.get_run(run_id)
|
|
88
|
+
if not run:
|
|
89
|
+
raise HTTPException(status_code=404, detail="Run not found")
|
|
90
|
+
logs = run.logs
|
|
91
|
+
if task:
|
|
92
|
+
logs = [entry for entry in logs if entry["task"] == task]
|
|
93
|
+
if tail:
|
|
94
|
+
logs = logs[-tail:]
|
|
95
|
+
return {"run_status": run.status, "logs": logs}
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
@app.get("/metrics")
|
|
99
|
+
async def prometheus_metrics():
|
|
100
|
+
payload = metrics.render_latest()
|
|
101
|
+
return PlainTextResponse(payload, media_type=metrics_content_type())
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _parse_status(value: Optional[str]) -> Optional[RunStatus]:
|
|
105
|
+
if not value:
|
|
106
|
+
return None
|
|
107
|
+
if isinstance(value, RunStatus):
|
|
108
|
+
return value
|
|
109
|
+
try:
|
|
110
|
+
return RunStatus(value)
|
|
111
|
+
except ValueError:
|
|
112
|
+
raise HTTPException(status_code=400, detail=f"Invalid status '{value}'")
|
pyoco/server/metrics.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
from prometheus_client import (
|
|
6
|
+
CollectorRegistry,
|
|
7
|
+
CONTENT_TYPE_LATEST,
|
|
8
|
+
Counter,
|
|
9
|
+
Gauge,
|
|
10
|
+
Histogram,
|
|
11
|
+
generate_latest,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
from ..core.models import RunStatus
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
_DEFAULT_BUCKETS = (
|
|
18
|
+
0.05,
|
|
19
|
+
0.1,
|
|
20
|
+
0.25,
|
|
21
|
+
0.5,
|
|
22
|
+
1.0,
|
|
23
|
+
2.5,
|
|
24
|
+
5.0,
|
|
25
|
+
10.0,
|
|
26
|
+
30.0,
|
|
27
|
+
60.0,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class MetricsSink:
|
|
32
|
+
"""
|
|
33
|
+
Small wrapper that owns a CollectorRegistry so tests can reset easily.
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
def __init__(self) -> None:
|
|
37
|
+
self.registry = CollectorRegistry()
|
|
38
|
+
self._init_metrics()
|
|
39
|
+
|
|
40
|
+
def _init_metrics(self) -> None:
|
|
41
|
+
self.runs_total = Counter(
|
|
42
|
+
"pyoco_runs_total",
|
|
43
|
+
"Total runs observed by status transitions.",
|
|
44
|
+
["status"],
|
|
45
|
+
registry=self.registry,
|
|
46
|
+
)
|
|
47
|
+
self.runs_in_progress = Gauge(
|
|
48
|
+
"pyoco_runs_in_progress",
|
|
49
|
+
"Number of runs currently executing (RUNNING).",
|
|
50
|
+
registry=self.registry,
|
|
51
|
+
)
|
|
52
|
+
self.task_duration = Histogram(
|
|
53
|
+
"pyoco_task_duration_seconds",
|
|
54
|
+
"Observed task durations.",
|
|
55
|
+
["task"],
|
|
56
|
+
buckets=_DEFAULT_BUCKETS,
|
|
57
|
+
registry=self.registry,
|
|
58
|
+
)
|
|
59
|
+
self.run_duration = Histogram(
|
|
60
|
+
"pyoco_run_duration_seconds",
|
|
61
|
+
"Observed end-to-end run durations.",
|
|
62
|
+
["flow"],
|
|
63
|
+
buckets=_DEFAULT_BUCKETS,
|
|
64
|
+
registry=self.registry,
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
def reset(self) -> None:
|
|
68
|
+
self.__init__()
|
|
69
|
+
|
|
70
|
+
def record_status_transition(
|
|
71
|
+
self,
|
|
72
|
+
previous: Optional[RunStatus],
|
|
73
|
+
new_status: RunStatus,
|
|
74
|
+
) -> None:
|
|
75
|
+
status_value = new_status.value if hasattr(new_status, "value") else str(new_status)
|
|
76
|
+
self.runs_total.labels(status=status_value).inc()
|
|
77
|
+
|
|
78
|
+
prev_value = previous.value if hasattr(previous, "value") else previous
|
|
79
|
+
if status_value == RunStatus.RUNNING.value:
|
|
80
|
+
if prev_value != RunStatus.RUNNING.value:
|
|
81
|
+
self.runs_in_progress.inc()
|
|
82
|
+
elif prev_value == RunStatus.RUNNING.value:
|
|
83
|
+
self.runs_in_progress.dec()
|
|
84
|
+
|
|
85
|
+
def record_task_duration(self, task_name: str, duration_ms: Optional[float]) -> None:
|
|
86
|
+
if duration_ms is None:
|
|
87
|
+
return
|
|
88
|
+
if duration_ms < 0:
|
|
89
|
+
return
|
|
90
|
+
self.task_duration.labels(task=task_name).observe(duration_ms / 1000.0)
|
|
91
|
+
|
|
92
|
+
def record_run_duration(
|
|
93
|
+
self,
|
|
94
|
+
flow_name: str,
|
|
95
|
+
start_time: Optional[float],
|
|
96
|
+
end_time: Optional[float],
|
|
97
|
+
) -> None:
|
|
98
|
+
if start_time is None or end_time is None:
|
|
99
|
+
return
|
|
100
|
+
duration = end_time - start_time
|
|
101
|
+
if duration < 0:
|
|
102
|
+
return
|
|
103
|
+
self.run_duration.labels(flow=flow_name).observe(duration)
|
|
104
|
+
|
|
105
|
+
def render_latest(self) -> bytes:
|
|
106
|
+
return generate_latest(self.registry)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
metrics = MetricsSink()
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def metrics_content_type() -> str:
|
|
113
|
+
return CONTENT_TYPE_LATEST
|
pyoco/server/models.py
CHANGED
|
@@ -22,6 +22,8 @@ class WorkerPollResponse(BaseModel):
|
|
|
22
22
|
|
|
23
23
|
class WorkerHeartbeatRequest(BaseModel):
|
|
24
24
|
task_states: Dict[str, TaskState]
|
|
25
|
+
task_records: Dict[str, Any] = {}
|
|
26
|
+
logs: List[Dict[str, Any]] = []
|
|
25
27
|
run_status: RunStatus
|
|
26
28
|
|
|
27
29
|
class WorkerHeartbeatResponse(BaseModel):
|
pyoco/server/store.py
CHANGED
|
@@ -1,38 +1,63 @@
|
|
|
1
1
|
import uuid
|
|
2
2
|
import time
|
|
3
|
-
|
|
4
|
-
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Dict, List, Optional, Any
|
|
7
|
+
from ..core.models import RunContext, RunStatus, TaskState
|
|
8
|
+
from .metrics import metrics
|
|
9
|
+
from .webhook import webhook_notifier
|
|
10
|
+
|
|
11
|
+
MAX_RUN_HISTORY = int(os.getenv("PYOCO_MAX_RUN_HISTORY", "50"))
|
|
12
|
+
RUN_ARCHIVE_DIR = Path(os.getenv("PYOCO_RUN_ARCHIVE_DIR", "artifacts/runs"))
|
|
13
|
+
MAX_LOG_BYTES_PER_TASK = int(os.getenv("PYOCO_MAX_LOG_BYTES", str(1024 * 1024)))
|
|
5
14
|
|
|
6
15
|
class StateStore:
|
|
7
16
|
def __init__(self):
|
|
8
17
|
self.runs: Dict[str, RunContext] = {}
|
|
9
18
|
self.queue: List[str] = []
|
|
19
|
+
self.history: List[str] = []
|
|
20
|
+
self.max_runs = MAX_RUN_HISTORY
|
|
21
|
+
self.archive_dir = RUN_ARCHIVE_DIR
|
|
22
|
+
self.log_limit_bytes = MAX_LOG_BYTES_PER_TASK
|
|
23
|
+
self.metrics = metrics
|
|
24
|
+
self.webhook = webhook_notifier
|
|
10
25
|
|
|
11
26
|
def create_run(self, flow_name: str, params: Dict) -> RunContext:
|
|
12
27
|
run_id = str(uuid.uuid4())
|
|
13
28
|
run_ctx = RunContext(
|
|
14
29
|
run_id=run_id,
|
|
30
|
+
flow_name=flow_name,
|
|
31
|
+
params=params or {},
|
|
15
32
|
status=RunStatus.PENDING,
|
|
16
33
|
start_time=time.time()
|
|
17
34
|
)
|
|
18
|
-
# Store extra metadata if needed (flow_name, params)
|
|
19
|
-
# For now, RunContext doesn't have flow_name/params fields in core.models.
|
|
20
|
-
# We might need to extend RunContext or store them separately.
|
|
21
|
-
# Let's attach them dynamically for now or assume the worker knows.
|
|
22
|
-
# Actually, the worker needs flow_name and params to run.
|
|
23
|
-
# We should store them in the store alongside the context.
|
|
24
|
-
run_ctx.flow_name = flow_name
|
|
25
|
-
run_ctx.params = params
|
|
26
35
|
|
|
27
36
|
self.runs[run_id] = run_ctx
|
|
28
37
|
self.queue.append(run_id)
|
|
38
|
+
self.history.append(run_id)
|
|
39
|
+
self._enforce_retention()
|
|
40
|
+
self.metrics.record_status_transition(None, run_ctx.status)
|
|
29
41
|
return run_ctx
|
|
30
42
|
|
|
31
43
|
def get_run(self, run_id: str) -> Optional[RunContext]:
|
|
32
44
|
return self.runs.get(run_id)
|
|
33
45
|
|
|
34
|
-
def list_runs(
|
|
35
|
-
|
|
46
|
+
def list_runs(
|
|
47
|
+
self,
|
|
48
|
+
status: Optional[RunStatus] = None,
|
|
49
|
+
flow: Optional[str] = None,
|
|
50
|
+
limit: Optional[int] = None,
|
|
51
|
+
) -> List[RunContext]:
|
|
52
|
+
runs = list(self.runs.values())
|
|
53
|
+
if status:
|
|
54
|
+
runs = [r for r in runs if r.status == status]
|
|
55
|
+
if flow:
|
|
56
|
+
runs = [r for r in runs if r.flow_name == flow]
|
|
57
|
+
runs.sort(key=lambda r: r.start_time or 0, reverse=True)
|
|
58
|
+
if limit:
|
|
59
|
+
runs = runs[:limit]
|
|
60
|
+
return runs
|
|
36
61
|
|
|
37
62
|
def dequeue(self, tags: List[str] = None) -> Optional[RunContext]:
|
|
38
63
|
# Simple FIFO queue for now. Tags ignored in v0.3.0 MVP.
|
|
@@ -53,11 +78,12 @@ class StateStore:
|
|
|
53
78
|
|
|
54
79
|
return None
|
|
55
80
|
|
|
56
|
-
def update_run(self, run_id: str, status: RunStatus = None, task_states: Dict = None):
|
|
81
|
+
def update_run(self, run_id: str, status: RunStatus = None, task_states: Dict = None, task_records: Dict = None, logs: List[Dict[str, Any]] = None):
|
|
57
82
|
run = self.runs.get(run_id)
|
|
58
83
|
if not run:
|
|
59
84
|
return
|
|
60
|
-
|
|
85
|
+
previous_status = run.status
|
|
86
|
+
|
|
61
87
|
if status:
|
|
62
88
|
# State transition check
|
|
63
89
|
# If server has CANCELLING, ignore RUNNING from worker
|
|
@@ -70,13 +96,124 @@ class StateStore:
|
|
|
70
96
|
if not run.end_time:
|
|
71
97
|
run.end_time = time.time()
|
|
72
98
|
|
|
99
|
+
if run.status != previous_status:
|
|
100
|
+
self.metrics.record_status_transition(previous_status, run.status)
|
|
101
|
+
|
|
73
102
|
if task_states:
|
|
74
|
-
|
|
103
|
+
for name, state in task_states.items():
|
|
104
|
+
run.tasks[name] = TaskState(state) if isinstance(state, str) else state
|
|
105
|
+
if task_records:
|
|
106
|
+
for name, record in task_records.items():
|
|
107
|
+
info = run.ensure_task_record(name)
|
|
108
|
+
state_val = record.get("state")
|
|
109
|
+
if state_val:
|
|
110
|
+
info.state = TaskState(state_val) if isinstance(state_val, str) else state_val
|
|
111
|
+
info.started_at = record.get("started_at", info.started_at)
|
|
112
|
+
info.ended_at = record.get("ended_at", info.ended_at)
|
|
113
|
+
info.duration_ms = record.get("duration_ms", info.duration_ms)
|
|
114
|
+
info.error = record.get("error", info.error)
|
|
115
|
+
info.traceback = record.get("traceback", info.traceback)
|
|
116
|
+
info.inputs = record.get("inputs", info.inputs)
|
|
117
|
+
info.output = record.get("output", info.output)
|
|
118
|
+
info.artifacts = record.get("artifacts", info.artifacts)
|
|
119
|
+
self._record_task_metrics(run, name, info)
|
|
120
|
+
if logs:
|
|
121
|
+
for entry in logs:
|
|
122
|
+
task_name = entry.get("task") or "unknown"
|
|
123
|
+
text = entry.get("text", "")
|
|
124
|
+
encoded_len = len(text.encode("utf-8"))
|
|
125
|
+
current = run.log_bytes.get(task_name, 0)
|
|
126
|
+
if current >= self.log_limit_bytes:
|
|
127
|
+
continue
|
|
128
|
+
if current + encoded_len > self.log_limit_bytes:
|
|
129
|
+
allowed = max(self.log_limit_bytes - current, 0)
|
|
130
|
+
truncated_text = text[:allowed] + "\n[log truncated]"
|
|
131
|
+
entry = dict(entry)
|
|
132
|
+
entry["text"] = truncated_text
|
|
133
|
+
run.log_bytes[task_name] = self.log_limit_bytes
|
|
134
|
+
else:
|
|
135
|
+
run.log_bytes[task_name] = current + encoded_len
|
|
136
|
+
run.logs.append(entry)
|
|
137
|
+
if status in [RunStatus.COMPLETED, RunStatus.FAILED, RunStatus.CANCELLED]:
|
|
138
|
+
self._enforce_retention()
|
|
139
|
+
if run.end_time and not run.metrics_run_observed:
|
|
140
|
+
self.metrics.record_run_duration(run.flow_name, run.start_time, run.end_time)
|
|
141
|
+
run.metrics_run_observed = True
|
|
142
|
+
if run.status in [RunStatus.COMPLETED, RunStatus.FAILED, RunStatus.CANCELLED]:
|
|
143
|
+
if run.webhook_notified_status != run.status.value:
|
|
144
|
+
if self.webhook.notify_run(run):
|
|
145
|
+
run.webhook_notified_status = run.status.value
|
|
75
146
|
|
|
76
147
|
def cancel_run(self, run_id: str):
|
|
77
148
|
run = self.runs.get(run_id)
|
|
78
149
|
if not run:
|
|
79
150
|
return
|
|
80
|
-
|
|
151
|
+
previous = run.status
|
|
81
152
|
if run.status in [RunStatus.PENDING, RunStatus.RUNNING]:
|
|
82
153
|
run.status = RunStatus.CANCELLING
|
|
154
|
+
if run.status != previous:
|
|
155
|
+
self.metrics.record_status_transition(previous, run.status)
|
|
156
|
+
|
|
157
|
+
def export_run(self, run: RunContext) -> Dict[str, Any]:
|
|
158
|
+
return {
|
|
159
|
+
"run_id": run.run_id,
|
|
160
|
+
"flow_name": run.flow_name,
|
|
161
|
+
"params": run.params,
|
|
162
|
+
"status": run.status.value if hasattr(run.status, "value") else run.status,
|
|
163
|
+
"start_time": run.start_time,
|
|
164
|
+
"end_time": run.end_time,
|
|
165
|
+
"tasks": {name: state.value if hasattr(state, "value") else state for name, state in run.tasks.items()},
|
|
166
|
+
"task_records": run.serialize_task_records(),
|
|
167
|
+
"logs": run.logs,
|
|
168
|
+
"metadata": run.metadata,
|
|
169
|
+
"run_duration_ms": self._run_duration_ms(run),
|
|
170
|
+
"task_summary": self._build_task_summary(run),
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
def _enforce_retention(self):
|
|
174
|
+
removable_ids = [rid for rid in self.history if rid in self.runs]
|
|
175
|
+
while len(self.runs) > self.max_runs and removable_ids:
|
|
176
|
+
run_id = removable_ids.pop(0)
|
|
177
|
+
run = self.runs.get(run_id)
|
|
178
|
+
if not run:
|
|
179
|
+
continue
|
|
180
|
+
if run.status not in [RunStatus.COMPLETED, RunStatus.FAILED, RunStatus.CANCELLED]:
|
|
181
|
+
self.history.append(run_id)
|
|
182
|
+
continue
|
|
183
|
+
self._spill_run(run)
|
|
184
|
+
self.runs.pop(run_id, None)
|
|
185
|
+
if run_id in self.queue:
|
|
186
|
+
self.queue.remove(run_id)
|
|
187
|
+
self.history = [rid for rid in self.history if rid in self.runs]
|
|
188
|
+
|
|
189
|
+
def _spill_run(self, run: RunContext):
|
|
190
|
+
try:
|
|
191
|
+
self.archive_dir.mkdir(parents=True, exist_ok=True)
|
|
192
|
+
path = self.archive_dir / f"{run.run_id}.json"
|
|
193
|
+
with path.open("w", encoding="utf-8") as fp:
|
|
194
|
+
json.dump(self.export_run(run), fp, indent=2)
|
|
195
|
+
except Exception:
|
|
196
|
+
pass
|
|
197
|
+
|
|
198
|
+
def _record_task_metrics(self, run: RunContext, task_name: str, record):
|
|
199
|
+
if task_name in run.metrics_recorded_tasks:
|
|
200
|
+
return
|
|
201
|
+
if record.duration_ms is None or record.ended_at is None:
|
|
202
|
+
return
|
|
203
|
+
self.metrics.record_task_duration(task_name, record.duration_ms)
|
|
204
|
+
run.metrics_recorded_tasks.add(task_name)
|
|
205
|
+
|
|
206
|
+
def _run_duration_ms(self, run: RunContext) -> Optional[float]:
|
|
207
|
+
if run.start_time and run.end_time:
|
|
208
|
+
return (run.end_time - run.start_time) * 1000.0
|
|
209
|
+
return None
|
|
210
|
+
|
|
211
|
+
def _build_task_summary(self, run: RunContext) -> Dict[str, Any]:
|
|
212
|
+
summary: Dict[str, Any] = {}
|
|
213
|
+
for name, record in run.task_records.items():
|
|
214
|
+
summary[name] = {
|
|
215
|
+
"state": record.state.value if hasattr(record.state, "value") else record.state,
|
|
216
|
+
"duration_ms": record.duration_ms,
|
|
217
|
+
"ended_at": record.ended_at,
|
|
218
|
+
}
|
|
219
|
+
return summary
|
pyoco/server/webhook.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
import time
|
|
6
|
+
from typing import Any, Callable, Dict, Optional
|
|
7
|
+
|
|
8
|
+
import httpx
|
|
9
|
+
|
|
10
|
+
from ..core.models import RunContext, RunStatus
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class WebhookNotifier:
|
|
14
|
+
def __init__(self) -> None:
|
|
15
|
+
self.url: Optional[str] = None
|
|
16
|
+
self.timeout: float = 3.0
|
|
17
|
+
self.retries: int = 1
|
|
18
|
+
self.secret: Optional[str] = None
|
|
19
|
+
self.extra_headers: Dict[str, str] = {}
|
|
20
|
+
self._sender: Optional[Callable[[str, Dict[str, Any], Dict[str, str], float], None]] = None
|
|
21
|
+
self.last_error: Optional[str] = None
|
|
22
|
+
self.load_from_env()
|
|
23
|
+
|
|
24
|
+
def load_from_env(self) -> None:
|
|
25
|
+
self.url = os.getenv("PYOCO_WEBHOOK_URL") or None
|
|
26
|
+
self.timeout = float(os.getenv("PYOCO_WEBHOOK_TIMEOUT", "3.0"))
|
|
27
|
+
self.retries = int(os.getenv("PYOCO_WEBHOOK_RETRIES", "1"))
|
|
28
|
+
self.secret = os.getenv("PYOCO_WEBHOOK_SECRET") or None
|
|
29
|
+
self.extra_headers = {}
|
|
30
|
+
self.last_error = None
|
|
31
|
+
self._sender = None
|
|
32
|
+
|
|
33
|
+
def configure(
|
|
34
|
+
self,
|
|
35
|
+
*,
|
|
36
|
+
url: Optional[str] = None,
|
|
37
|
+
timeout: Optional[float] = None,
|
|
38
|
+
retries: Optional[int] = None,
|
|
39
|
+
secret: Optional[str] = None,
|
|
40
|
+
headers: Optional[Dict[str, str]] = None,
|
|
41
|
+
sender: Optional[Callable[[str, Dict[str, Any], Dict[str, str], float], None]] = None,
|
|
42
|
+
) -> None:
|
|
43
|
+
if url is not None:
|
|
44
|
+
self.url = url
|
|
45
|
+
if timeout is not None:
|
|
46
|
+
self.timeout = timeout
|
|
47
|
+
if retries is not None:
|
|
48
|
+
self.retries = max(1, retries)
|
|
49
|
+
if secret is not None:
|
|
50
|
+
self.secret = secret
|
|
51
|
+
if headers is not None:
|
|
52
|
+
self.extra_headers = dict(headers)
|
|
53
|
+
if sender is not None:
|
|
54
|
+
self._sender = sender
|
|
55
|
+
self.last_error = None
|
|
56
|
+
|
|
57
|
+
def notify_run(self, run: RunContext) -> bool:
|
|
58
|
+
if not self.url:
|
|
59
|
+
return False
|
|
60
|
+
payload = self._build_payload(run)
|
|
61
|
+
sender = self._sender or self._http_sender
|
|
62
|
+
headers = {"Content-Type": "application/json", **self.extra_headers}
|
|
63
|
+
if self.secret:
|
|
64
|
+
headers.setdefault("X-Pyoco-Token", self.secret)
|
|
65
|
+
|
|
66
|
+
last_exc: Optional[Exception] = None
|
|
67
|
+
for attempt in range(self.retries):
|
|
68
|
+
try:
|
|
69
|
+
sender(self.url, payload, headers, self.timeout)
|
|
70
|
+
self.last_error = None
|
|
71
|
+
return True
|
|
72
|
+
except Exception as exc: # pragma: no cover - retries captured via tests
|
|
73
|
+
last_exc = exc
|
|
74
|
+
time.sleep(min(0.5, 0.1 * (attempt + 1)))
|
|
75
|
+
if last_exc:
|
|
76
|
+
self.last_error = str(last_exc)
|
|
77
|
+
return False
|
|
78
|
+
|
|
79
|
+
def _http_sender(
|
|
80
|
+
self,
|
|
81
|
+
url: str,
|
|
82
|
+
payload: Dict[str, Any],
|
|
83
|
+
headers: Dict[str, str],
|
|
84
|
+
timeout: float,
|
|
85
|
+
) -> None:
|
|
86
|
+
httpx.post(url, json=payload, headers=headers, timeout=timeout)
|
|
87
|
+
|
|
88
|
+
def _build_payload(self, run: RunContext) -> Dict[str, Any]:
|
|
89
|
+
duration_ms = None
|
|
90
|
+
if run.start_time and run.end_time:
|
|
91
|
+
duration_ms = (run.end_time - run.start_time) * 1000.0
|
|
92
|
+
return {
|
|
93
|
+
"event": f"run.{run.status.value.lower()}",
|
|
94
|
+
"run_id": run.run_id,
|
|
95
|
+
"flow_name": run.flow_name,
|
|
96
|
+
"status": run.status.value if isinstance(run.status, RunStatus) else run.status,
|
|
97
|
+
"started_at": run.start_time,
|
|
98
|
+
"ended_at": run.end_time,
|
|
99
|
+
"duration_ms": duration_ms,
|
|
100
|
+
"tasks": run.serialize_task_records(),
|
|
101
|
+
"metadata": run.metadata,
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
def reset(self) -> None:
|
|
105
|
+
self.load_from_env()
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
webhook_notifier = WebhookNotifier()
|
pyoco/worker/runner.py
CHANGED
|
@@ -21,11 +21,7 @@ class RemoteTraceBackend(TraceBackend):
|
|
|
21
21
|
def _send_heartbeat(self, force=False):
|
|
22
22
|
now = time.time()
|
|
23
23
|
if force or (now - self.last_heartbeat > self.heartbeat_interval):
|
|
24
|
-
cancel = self.client.heartbeat(
|
|
25
|
-
self.run_ctx.run_id,
|
|
26
|
-
self.run_ctx.tasks,
|
|
27
|
-
self.run_ctx.status
|
|
28
|
-
)
|
|
24
|
+
cancel = self.client.heartbeat(self.run_ctx)
|
|
29
25
|
if cancel and self.run_ctx.status not in [RunStatus.CANCELLING, RunStatus.CANCELLED]:
|
|
30
26
|
print(f"🛑 Cancellation requested from server for run {self.run_ctx.run_id}")
|
|
31
27
|
self.run_ctx.status = RunStatus.CANCELLING
|
|
@@ -162,10 +158,9 @@ class Worker:
|
|
|
162
158
|
engine.run(flow, params=params, run_context=run_ctx)
|
|
163
159
|
print(f"✅ Job {run_id} completed: {run_ctx.status}")
|
|
164
160
|
# Send final heartbeat
|
|
165
|
-
self.client.heartbeat(
|
|
161
|
+
self.client.heartbeat(run_ctx)
|
|
166
162
|
except Exception as e:
|
|
167
163
|
print(f"💥 Job {run_id} failed: {e}")
|
|
168
164
|
# Heartbeat one last time
|
|
169
165
|
run_ctx.status = RunStatus.FAILED
|
|
170
|
-
self.client.heartbeat(
|
|
171
|
-
|
|
166
|
+
self.client.heartbeat(run_ctx)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pyoco
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.5.1
|
|
4
4
|
Summary: A workflow engine with sugar syntax
|
|
5
5
|
Requires-Python: >=3.10
|
|
6
6
|
Description-Content-Type: text/markdown
|
|
@@ -8,6 +8,7 @@ Requires-Dist: pyyaml>=6.0.3
|
|
|
8
8
|
Requires-Dist: fastapi>=0.100.0
|
|
9
9
|
Requires-Dist: uvicorn>=0.20.0
|
|
10
10
|
Requires-Dist: httpx>=0.24.0
|
|
11
|
+
Requires-Dist: prometheus-client>=0.20.0
|
|
11
12
|
|
|
12
13
|
# 🐇 Pyoco
|
|
13
14
|
|
|
@@ -132,6 +133,20 @@ Or via CLI flag:
|
|
|
132
133
|
pyoco run --non-cute ...
|
|
133
134
|
```
|
|
134
135
|
|
|
136
|
+
## 🔭 Observability Bridge (v0.5)
|
|
137
|
+
|
|
138
|
+
- `/metrics` exposes a Prometheus counter (`pyoco_runs_total`), a gauge (`pyoco_runs_in_progress`), and histograms (`pyoco_task_duration_seconds`, `pyoco_run_duration_seconds`). Point Grafana/Prometheus at it to watch pipelines without opening sockets.
|
|
139
|
+
- `/runs` now accepts `status`, `flow`, and `limit` (1–200) query params; `/runs/{id}/logs?tail=100` returns only the most recent log entries (optionally filtered with `task=<name>`) for dashboards.
|
|
140
|
+
- Webhook notifications fire when a run reaches a terminal status (COMPLETED, FAILED, or CANCELLED)—configure via `PYOCO_WEBHOOK_*` env vars and forward to Slack or your alerting stack.
|
|
141
|
+
- Import `docs/grafana_pyoco_cute.json` for a lavender/orange starter dashboard (3 panels: in-progress count, completion trend, per-flow latency).
|
|
142
|
+
- See [docs/observability.md](docs/observability.md) for detailed setup steps.
|
|
143
|
+
|
|
144
|
+
## 🧩 Plug-ins
|
|
145
|
+
|
|
146
|
+
Need to share domain-specific tasks? Publish an entry point under `pyoco.tasks` and pyoco will auto-load it. In v0.5.1 we recommend **Task subclasses first** (callables still work with warnings). See [docs/plugins.md](docs/plugins.md) for examples, quickstart, and `pyoco plugins list` / `pyoco plugins lint`.
|
|
147
|
+
|
|
148
|
+
**Big data note:** pass handles, not copies. For large tensors/images, stash paths or handles in `ctx.artifacts`/`ctx.scratch` and let downstream tasks materialize only when needed. For lazy pipelines (e.g., DataPipe), log the pipeline when you actually iterate (typically the training task) instead of materializing upstream.
|
|
149
|
+
|
|
135
150
|
## 📚 Documentation
|
|
136
151
|
|
|
137
152
|
- [Tutorials](docs/tutorial/index.md)
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
pyoco/__init__.py,sha256=E2pgDGvGRSVon7dSqIM4UD55LgVpf4jiZZA-70kOcuw,409
|
|
2
|
+
pyoco/client.py,sha256=Y95NmMsOKTJ9AZJEg_OzHamC_w32YWmSVS653mpqHVQ,3141
|
|
3
|
+
pyoco/socketless_reset.py,sha256=KsAF4I23_Kbhy9fIWFARzV5QaIOQqbl0U0yPb8a34sM,129
|
|
4
|
+
pyoco/cli/entry.py,sha256=zPIG0Gx-cFO8Cf1Z3wD3Ifz_2sHaryHZ6mCRri2WEqE,93
|
|
5
|
+
pyoco/cli/main.py,sha256=LbhgTgRw9Tr_04hiYLqLP64jdnE1RA8B9Rasetgc_MM,18557
|
|
6
|
+
pyoco/core/base_task.py,sha256=z7hOFntAPv4yCADapS-fhtLe5eWqaO8k3T1r05YEEUE,2106
|
|
7
|
+
pyoco/core/context.py,sha256=TeCUriOmg7qZB3nMRu8HPdPshMW6pMVx48xZLY6a-A4,6524
|
|
8
|
+
pyoco/core/engine.py,sha256=iX2Id8ryFt-xeZgraqnF3uqkI6ubiZt5NBNYWX6Qv1s,24166
|
|
9
|
+
pyoco/core/exceptions.py,sha256=G82KY8PCnAhp3IDDIG8--Uh3EfVa192zei3l6ihfShI,565
|
|
10
|
+
pyoco/core/models.py,sha256=8faYURF43-7IebqzTIorHxpCeC4TZfoXWjGyPNaWhyI,10501
|
|
11
|
+
pyoco/discovery/loader.py,sha256=L9Wb2i-d1Hv3EiTFUvuR2mrv7Fc9vt5Bv9ZRuRqAzSg,6132
|
|
12
|
+
pyoco/discovery/plugins.py,sha256=r1KY-OwWXSSe6arVOdfK72pGaI3tpumucg9cXEXA-Z0,4873
|
|
13
|
+
pyoco/dsl/__init__.py,sha256=xWdb60pSRL8lNFk4GHF3EJ4hon0uiWqpv264g6-4gdg,45
|
|
14
|
+
pyoco/dsl/expressions.py,sha256=BtEIxPSf3BU-wPNEicIqX_TVZ4fAnlWGrzrrfc6pU1g,4875
|
|
15
|
+
pyoco/dsl/nodes.py,sha256=qDiIEsAJHnD8dpuOd-Rpy6OORCW6KDW_BdYiA2BKu18,1041
|
|
16
|
+
pyoco/dsl/syntax.py,sha256=kYP5uGbwxmkSd_zeSksax8iWm_7UlRW5JxE9_DoSqbk,8638
|
|
17
|
+
pyoco/dsl/validator.py,sha256=HXjcc-GzjH72YByaNxAg_7YOZsVsFDFnUaenVwd5PbY,3576
|
|
18
|
+
pyoco/schemas/config.py,sha256=KkGZK3GxTHoIHEGb4f4k8GE2W-aBN4iPzmc_HrwuROU,1735
|
|
19
|
+
pyoco/server/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
20
|
+
pyoco/server/api.py,sha256=vu2ieDZgHbi8cysO2rS-lcxqWiSQprIcqRn6GkwTtKo,3890
|
|
21
|
+
pyoco/server/metrics.py,sha256=92sHZKka_yBNBGlHZgRIteywx97aoTa-MnXh3UJ0HJY,2952
|
|
22
|
+
pyoco/server/models.py,sha256=ir5AuvyXQigmaynA7bS_0RNJcJo2VtpJl0GjRZrj2rU,786
|
|
23
|
+
pyoco/server/store.py,sha256=ITYAV1QlPWDnceywqjjJZW9E0CyocFlPmqqfjcoM-wA,9133
|
|
24
|
+
pyoco/server/webhook.py,sha256=fBSLWTDN7sIWSK0AUVuiCSdVVBFV_AyP-XEKOcdMXmQ,3643
|
|
25
|
+
pyoco/trace/backend.py,sha256=a1css94_lhO4SGSPHZ1f59HJqFQtZ5Sjx09Kw7v5bsk,617
|
|
26
|
+
pyoco/trace/console.py,sha256=I-BcF405OGLWoacJWeke8vTT9M5JxSBpJL-NazVyxb4,1742
|
|
27
|
+
pyoco/worker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
28
|
+
pyoco/worker/client.py,sha256=862KccXRtfG7zd9ZSLqrpVSV6ev8zeuEHHdtAfLghiM,1557
|
|
29
|
+
pyoco/worker/runner.py,sha256=hyKn5NbuIuF-109CnQbYc8laKbWmwe9ChaLrNUtsVIg,6367
|
|
30
|
+
pyoco-0.5.1.dist-info/METADATA,sha256=JLUsGfujXl71AvCSuKDc52v2FjSxlWcIocGyCCzHnrU,5642
|
|
31
|
+
pyoco-0.5.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
32
|
+
pyoco-0.5.1.dist-info/top_level.txt,sha256=2JRVocfaWRbX1VJ3zq1c5wQaOK6fMARS6ptVFWyvRF4,6
|
|
33
|
+
pyoco-0.5.1.dist-info/RECORD,,
|