fraclab-sdk 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- README.md +1601 -0
- fraclab_sdk/__init__.py +34 -0
- fraclab_sdk/algorithm/__init__.py +13 -0
- fraclab_sdk/algorithm/export.py +1 -0
- fraclab_sdk/algorithm/library.py +378 -0
- fraclab_sdk/cli.py +381 -0
- fraclab_sdk/config.py +54 -0
- fraclab_sdk/devkit/__init__.py +25 -0
- fraclab_sdk/devkit/compile.py +342 -0
- fraclab_sdk/devkit/export.py +354 -0
- fraclab_sdk/devkit/validate.py +1043 -0
- fraclab_sdk/errors.py +124 -0
- fraclab_sdk/materialize/__init__.py +8 -0
- fraclab_sdk/materialize/fsops.py +125 -0
- fraclab_sdk/materialize/hash.py +28 -0
- fraclab_sdk/materialize/materializer.py +241 -0
- fraclab_sdk/models/__init__.py +52 -0
- fraclab_sdk/models/bundle_manifest.py +51 -0
- fraclab_sdk/models/dataspec.py +65 -0
- fraclab_sdk/models/drs.py +47 -0
- fraclab_sdk/models/output_contract.py +111 -0
- fraclab_sdk/models/run_output_manifest.py +119 -0
- fraclab_sdk/results/__init__.py +25 -0
- fraclab_sdk/results/preview.py +150 -0
- fraclab_sdk/results/reader.py +329 -0
- fraclab_sdk/run/__init__.py +10 -0
- fraclab_sdk/run/logs.py +42 -0
- fraclab_sdk/run/manager.py +403 -0
- fraclab_sdk/run/subprocess_runner.py +153 -0
- fraclab_sdk/runtime/__init__.py +11 -0
- fraclab_sdk/runtime/artifacts.py +303 -0
- fraclab_sdk/runtime/data_client.py +123 -0
- fraclab_sdk/runtime/runner_main.py +286 -0
- fraclab_sdk/runtime/snapshot_provider.py +1 -0
- fraclab_sdk/selection/__init__.py +11 -0
- fraclab_sdk/selection/model.py +247 -0
- fraclab_sdk/selection/validate.py +54 -0
- fraclab_sdk/snapshot/__init__.py +12 -0
- fraclab_sdk/snapshot/index.py +94 -0
- fraclab_sdk/snapshot/library.py +205 -0
- fraclab_sdk/snapshot/loader.py +217 -0
- fraclab_sdk/specs/manifest.py +89 -0
- fraclab_sdk/utils/io.py +32 -0
- fraclab_sdk-0.1.0.dist-info/METADATA +1622 -0
- fraclab_sdk-0.1.0.dist-info/RECORD +47 -0
- fraclab_sdk-0.1.0.dist-info/WHEEL +4 -0
- fraclab_sdk-0.1.0.dist-info/entry_points.txt +4 -0
|
@@ -0,0 +1,403 @@
|
|
|
1
|
+
"""Run manager implementation."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import shutil
|
|
5
|
+
import sys
|
|
6
|
+
import uuid
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from datetime import datetime
|
|
9
|
+
from enum import Enum
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
from fraclab_sdk.algorithm import AlgorithmLibrary
|
|
14
|
+
from fraclab_sdk.config import SDKConfig
|
|
15
|
+
from fraclab_sdk.errors import RunError
|
|
16
|
+
from fraclab_sdk.materialize import Materializer
|
|
17
|
+
from fraclab_sdk.run.logs import tail_stderr, tail_stdout
|
|
18
|
+
from fraclab_sdk.run.subprocess_runner import SubprocessRunner
|
|
19
|
+
from fraclab_sdk.selection.model import SelectionModel
|
|
20
|
+
from fraclab_sdk.snapshot import SnapshotLibrary
|
|
21
|
+
from fraclab_sdk.utils.io import atomic_write_json
|
|
22
|
+
from fraclab_sdk.utils.io import atomic_write_json
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class RunStatus(Enum):
    """Status of a run.

    Lifecycle: PENDING -> RUNNING -> one of SUCCEEDED / FAILED / TIMEOUT
    (see RunManager.execute, which assigns the terminal states).
    """

    PENDING = "pending"      # created, not yet executed
    RUNNING = "running"      # subprocess currently executing
    SUCCEEDED = "succeeded"  # subprocess exited with code 0
    FAILED = "failed"        # subprocess exited with a non-zero code
    TIMEOUT = "timeout"      # killed after exceeding the execution timeout
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
class RunMeta:
    """Metadata for a run.

    Mirrors one entry of the runs index file maintained by RunIndex;
    timestamps are ISO-8601 strings produced by datetime.isoformat().
    """

    run_id: str                      # short unique identifier of the run
    snapshot_id: str                 # snapshot the run reads its data from
    algorithm_id: str                # algorithm to execute
    algorithm_version: str           # version of that algorithm
    status: RunStatus                # current lifecycle state
    created_at: str                  # when the run was created
    started_at: str | None = None    # None until execution starts
    completed_at: str | None = None  # None until execution finishes
    error: str | None = None         # human-readable failure reason, if any
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass
class RunResult:
    """Result of run execution, as returned by RunManager.execute."""

    run_id: str                   # identifier of the executed run
    status: RunStatus             # final status (SUCCEEDED / FAILED / TIMEOUT)
    exit_code: int | None = None  # subprocess return code
    error: str | None = None      # timeout or exit-code description, if failed
    stdout: str | None = None     # tail of the captured stdout log
    stderr: str | None = None     # tail of the captured stderr log
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class RunIndex:
    """Manages the run index file (``<runs_dir>/index.json``).

    The index is one JSON object mapping run_id to a serialized RunMeta
    entry. Every mutation rewrites the whole file atomically via
    atomic_write_json. Serialization lives in the paired helpers
    ``_to_entry`` / ``_from_entry`` so the dict layout is defined once
    (the original duplicated it across add/get/list_all).
    """

    def __init__(self, runs_dir: Path) -> None:
        """Initialize run index rooted at *runs_dir*."""
        self._runs_dir = runs_dir
        self._index_path = runs_dir / "index.json"

    def _load(self) -> dict[str, dict]:
        """Load index from disk; a missing file is an empty index."""
        if not self._index_path.exists():
            return {}
        return json.loads(self._index_path.read_text())

    def _save(self, data: dict[str, dict]) -> None:
        """Save index to disk, creating the runs directory if needed."""
        self._runs_dir.mkdir(parents=True, exist_ok=True)
        atomic_write_json(self._index_path, data)

    @staticmethod
    def _to_entry(meta: RunMeta) -> dict:
        """Serialize *meta* into its JSON index entry."""
        return {
            "run_id": meta.run_id,
            "snapshot_id": meta.snapshot_id,
            "algorithm_id": meta.algorithm_id,
            "algorithm_version": meta.algorithm_version,
            "status": meta.status.value,
            "created_at": meta.created_at,
            "started_at": meta.started_at,
            "completed_at": meta.completed_at,
            "error": meta.error,
        }

    @classmethod
    def _from_entry(cls, entry: dict) -> RunMeta:
        """Deserialize a JSON index entry back into a RunMeta."""
        return RunMeta(
            run_id=entry["run_id"],
            snapshot_id=entry["snapshot_id"],
            algorithm_id=entry["algorithm_id"],
            algorithm_version=entry["algorithm_version"],
            status=cls._coerce_status(entry.get("status", "")),
            created_at=entry["created_at"],
            started_at=entry.get("started_at"),
            completed_at=entry.get("completed_at"),
            error=entry.get("error"),
        )

    def add(self, meta: RunMeta) -> None:
        """Add (or overwrite) a run in the index."""
        data = self._load()
        data[meta.run_id] = self._to_entry(meta)
        self._save(data)

    def update(self, meta: RunMeta) -> None:
        """Update a run in the index (same as add: last write wins)."""
        self.add(meta)

    def remove(self, run_id: str) -> None:
        """Remove a run from the index; unknown ids are a no-op write."""
        data = self._load()
        if run_id in data:
            del data[run_id]
        self._save(data)

    def get(self, run_id: str) -> RunMeta | None:
        """Get run metadata, or None if the run is not indexed."""
        entry = self._load().get(run_id)
        if entry is None:
            return None
        return self._from_entry(entry)

    def list_all(self) -> list[RunMeta]:
        """List all runs currently in the index."""
        return [self._from_entry(entry) for entry in self._load().values()]

    @staticmethod
    def _coerce_status(value: str) -> RunStatus:
        """Map legacy statuses to new enum.

        Unknown values degrade to FAILED rather than raising, so a stale
        index never blocks listing.
        """
        mapping = {
            "completed": RunStatus.SUCCEEDED,  # legacy name for success
            "failed": RunStatus.FAILED,
            "pending": RunStatus.PENDING,
            "running": RunStatus.RUNNING,
            "timeout": RunStatus.TIMEOUT,
            "succeeded": RunStatus.SUCCEEDED,
        }
        if value in mapping:
            return mapping[value]
        try:
            return RunStatus(value)
        except Exception:
            return RunStatus.FAILED
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
class RunManager:
    """Manages algorithm runs.

    Orchestrates the run lifecycle: create_run() validates the selection
    and materializes inputs into a run directory; execute() launches the
    algorithm in a subprocess and records the final status in the index.
    """

    def __init__(self, config: SDKConfig | None = None) -> None:
        """Initialize run manager.

        Args:
            config: SDK configuration. If None, uses default.
        """
        self._config = config or SDKConfig()
        self._index = RunIndex(self._config.runs_dir)
        self._snapshot_lib = SnapshotLibrary(self._config)
        self._algorithm_lib = AlgorithmLibrary(self._config)
        self._materializer = Materializer()

    def create_run(
        self,
        snapshot_id: str,
        algorithm_id: str,
        algorithm_version: str,
        selection: SelectionModel,
        params: dict[str, Any],
    ) -> str:
        """Create a new run.

        Validates the selection, materializes the run inputs under a fresh
        run directory, and registers the run as PENDING in the index.

        Args:
            snapshot_id: The snapshot ID.
            algorithm_id: The algorithm ID.
            algorithm_version: The algorithm version.
            selection: The selection model with selected items.
            params: Algorithm parameters.

        Returns:
            The run ID.

        Raises:
            RunError: If run creation fails.
        """
        # Validate selection before touching the filesystem.
        errors = selection.validate()
        if errors:
            error_msgs = [f"{e.dataset_key}: {e.message}" for e in errors]
            raise RunError(f"Selection validation failed: {'; '.join(error_msgs)}")

        # Get handles (these raise through the respective libraries if missing).
        snapshot = self._snapshot_lib.get_snapshot(snapshot_id)
        algorithm = self._algorithm_lib.get_algorithm(algorithm_id, algorithm_version)

        # Generate run ID: first 8 hex chars of a UUID4.
        # NOTE(review): short prefix means a (small) collision risk; the
        # mkdir(parents=True) below would then raise FileExistsError.
        run_id = str(uuid.uuid4())[:8]

        # Create run directory (no exist_ok: a clash must fail loudly).
        self._config.ensure_dirs()
        run_dir = self._config.runs_dir / run_id
        run_dir.mkdir(parents=True)

        # Build run DataSpec from the user's selection.
        run_ds = selection.build_run_ds()

        # Build run context passed through to the algorithm runtime.
        run_context = {
            "runId": run_id,
            "snapshotId": snapshot_id,
            "algorithmId": algorithm_id,
            "algorithmVersion": algorithm_version,
            "contractVersion": algorithm.manifest.contractVersion,
        }

        # Materialize input files into the run directory.
        self._materializer.materialize(
            run_dir=run_dir,
            snapshot=snapshot,
            run_ds=run_ds,
            drs=algorithm.drs,
            params=params,
            run_context=run_context,
        )

        # Create run metadata and register it as PENDING.
        # NOTE(review): datetime.now() is a naive local-time timestamp;
        # consider timezone-aware UTC — confirm downstream consumers first.
        meta = RunMeta(
            run_id=run_id,
            snapshot_id=snapshot_id,
            algorithm_id=algorithm_id,
            algorithm_version=algorithm_version,
            status=RunStatus.PENDING,
            created_at=datetime.now().isoformat(),
        )
        self._index.add(meta)

        # Write run_meta.json alongside the materialized inputs
        # (plain write, unlike the atomically-written index).
        run_meta_path = run_dir / "run_meta.json"
        run_meta_path.write_text(
            json.dumps(
                {
                    "run_id": run_id,
                    "snapshot_id": snapshot_id,
                    "algorithm_id": algorithm_id,
                    "algorithm_version": algorithm_version,
                    "created_at": meta.created_at,
                },
                indent=2,
            )
        )

        return run_id

    def delete_run(self, run_id: str) -> None:
        """Delete a run and its outputs (directory plus index entry)."""
        run_dir = self._config.runs_dir / run_id
        if run_dir.exists():
            shutil.rmtree(run_dir)
        self._index.remove(run_id)

    def execute(
        self,
        run_id: str,
        timeout_s: int | None = None,
    ) -> RunResult:
        """Execute a run.

        PENDING runs can be executed; FAILED and TIMEOUT runs may be
        re-executed (retry). RUNNING and SUCCEEDED runs are rejected.

        Args:
            run_id: The run ID.
            timeout_s: Optional timeout in seconds.

        Returns:
            RunResult with execution outcome.

        Raises:
            RunError: If run not found or already executed.
        """
        meta = self._index.get(run_id)
        if meta is None:
            raise RunError(f"Run not found: {run_id}")

        if meta.status not in (RunStatus.PENDING, RunStatus.FAILED, RunStatus.TIMEOUT):
            raise RunError(f"Run {run_id} already executed with status {meta.status}")

        run_dir = self._config.runs_dir / run_id
        algorithm = self._algorithm_lib.get_algorithm(
            meta.algorithm_id, meta.algorithm_version
        )

        # Update status to running before launching the subprocess.
        meta.status = RunStatus.RUNNING
        meta.started_at = datetime.now().isoformat()
        self._index.update(meta)

        # Execute via subprocess runner (streaming logs). The child runs
        # fraclab_sdk.runtime.runner_main with the run dir and algorithm path.
        cmd = [
            sys.executable,
            "-m",
            "fraclab_sdk.runtime.runner_main",
            str(run_dir),
            str(algorithm.algorithm_path),
        ]

        # Log and execution-metadata files live under output/_logs.
        stdout_log = run_dir / "output" / "_logs" / "stdout.log"
        stderr_log = run_dir / "output" / "_logs" / "stderr.log"
        execute_meta = run_dir / "output" / "_logs" / "execute.json"

        runner = SubprocessRunner(cmd=cmd, cwd=run_dir, timeout_s=timeout_s)
        exit_code, timed_out = runner.run(stdout_log, stderr_log, execute_meta)

        # Derive a human-readable error message (timeout wins over exit code).
        error = None
        if timed_out:
            error = f"Timeout after {timeout_s}s"
        elif exit_code != 0:
            error = f"Exit code: {exit_code}"

        # Update final status in the index.
        if timed_out:
            meta.status = RunStatus.TIMEOUT
        elif exit_code == 0:
            meta.status = RunStatus.SUCCEEDED
        else:
            meta.status = RunStatus.FAILED
        meta.completed_at = datetime.now().isoformat()
        meta.error = error
        self._index.update(meta)

        return RunResult(
            run_id=run_id,
            status=meta.status,
            exit_code=exit_code,
            error=error,
            stdout=tail_stdout(run_dir),
            stderr=tail_stderr(run_dir),
        )

    def get_run_status(self, run_id: str) -> RunStatus:
        """Get the status of a run.

        Args:
            run_id: The run ID.

        Returns:
            Run status.

        Raises:
            RunError: If run not found.
        """
        meta = self._index.get(run_id)
        if meta is None:
            raise RunError(f"Run not found: {run_id}")
        return meta.status

    def get_run(self, run_id: str) -> RunMeta:
        """Get run metadata.

        Args:
            run_id: The run ID.

        Returns:
            Run metadata.

        Raises:
            RunError: If run not found.
        """
        meta = self._index.get(run_id)
        if meta is None:
            raise RunError(f"Run not found: {run_id}")
        return meta

    def get_run_dir(self, run_id: str) -> Path:
        """Get the run directory path.

        Args:
            run_id: The run ID.

        Returns:
            Path to run directory (not checked for existence).
        """
        return self._config.runs_dir / run_id

    def list_runs(self) -> list[RunMeta]:
        """List all runs.

        Returns:
            List of run metadata.
        """
        return self._index.list_all()
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
"""Subprocess runner implementation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import os
|
|
7
|
+
import signal
|
|
8
|
+
import subprocess
|
|
9
|
+
import sys
|
|
10
|
+
import threading
|
|
11
|
+
import time
|
|
12
|
+
from collections.abc import Iterable, Mapping
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
_IS_WINDOWS = sys.platform == "win32"
|
|
16
|
+
|
|
17
|
+
# Grace period for SIGTERM before escalating to SIGKILL (seconds)
|
|
18
|
+
_TERM_GRACE_SECONDS = 2.0
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _kill_process_tree(proc: subprocess.Popen) -> str:
|
|
22
|
+
"""Kill a process and its entire tree. Returns the kill strategy used."""
|
|
23
|
+
if _IS_WINDOWS:
|
|
24
|
+
# Windows: try CTRL_BREAK_EVENT first, then kill
|
|
25
|
+
try:
|
|
26
|
+
proc.send_signal(signal.CTRL_BREAK_EVENT)
|
|
27
|
+
try:
|
|
28
|
+
proc.wait(timeout=_TERM_GRACE_SECONDS)
|
|
29
|
+
return "ctrl_break"
|
|
30
|
+
except subprocess.TimeoutExpired:
|
|
31
|
+
pass
|
|
32
|
+
except OSError:
|
|
33
|
+
pass
|
|
34
|
+
proc.kill()
|
|
35
|
+
proc.wait()
|
|
36
|
+
return "kill"
|
|
37
|
+
else:
|
|
38
|
+
# POSIX: use process group kill
|
|
39
|
+
pgid = proc.pid
|
|
40
|
+
try:
|
|
41
|
+
os.killpg(pgid, signal.SIGTERM)
|
|
42
|
+
try:
|
|
43
|
+
proc.wait(timeout=_TERM_GRACE_SECONDS)
|
|
44
|
+
return "killpg_term"
|
|
45
|
+
except subprocess.TimeoutExpired:
|
|
46
|
+
os.killpg(pgid, signal.SIGKILL)
|
|
47
|
+
proc.wait()
|
|
48
|
+
return "killpg_kill"
|
|
49
|
+
except OSError:
|
|
50
|
+
# Fallback if process group doesn't exist
|
|
51
|
+
proc.kill()
|
|
52
|
+
proc.wait()
|
|
53
|
+
return "kill"
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class SubprocessRunner:
    """Run subprocess with streaming logs and metadata.

    The child's stdout/stderr are drained to log files by daemon threads
    while the parent waits (with an optional timeout). On timeout the whole
    process tree is killed via _kill_process_tree. A JSON record describing
    the execution is written to *execute_path* when the process ends.
    """

    def __init__(
        self,
        cmd: Iterable[str],
        cwd: Path,
        env: Mapping[str, str] | None = None,
        timeout_s: int | None = None,
    ) -> None:
        """Configure the runner.

        Args:
            cmd: Command and arguments to execute.
            cwd: Working directory for the child process.
            env: Extra environment variables layered over os.environ.
            timeout_s: Wall-clock timeout in seconds; None waits forever.
        """
        self._cmd = list(cmd)
        self._cwd = Path(cwd)
        self._env = {**os.environ, **(env or {})}
        # Force unbuffered child output so the logs stream line-by-line.
        self._env["PYTHONUNBUFFERED"] = "1"
        self._timeout_s = timeout_s

    def run(self, stdout_path: Path, stderr_path: Path, execute_path: Path) -> tuple[int, bool]:
        """Execute the subprocess, streaming logs and writing metadata.

        Args:
            stdout_path: File that receives the child's stdout.
            stderr_path: File that receives the child's stderr.
            execute_path: File that receives the JSON execution metadata.

        Returns:
            (return_code, timed_out)
        """
        stdout_path.parent.mkdir(parents=True, exist_ok=True)
        stderr_path.parent.mkdir(parents=True, exist_ok=True)
        execute_path.parent.mkdir(parents=True, exist_ok=True)

        start_ts = time.time()

        # Platform-specific process group setup
        popen_kwargs: dict = {
            "cwd": self._cwd,
            "env": self._env,
            "stdout": subprocess.PIPE,
            "stderr": subprocess.PIPE,
            "text": True,
            "bufsize": 1,  # line-buffered (valid because text=True)
        }
        if _IS_WINDOWS:
            # New process group so CTRL_BREAK_EVENT can target the child tree.
            popen_kwargs["creationflags"] = subprocess.CREATE_NEW_PROCESS_GROUP
        else:
            # New session: child leads its own process group for os.killpg.
            popen_kwargs["start_new_session"] = True

        proc = subprocess.Popen(self._cmd, **popen_kwargs)

        timed_out = False
        kill_strategy: str | None = None
        terminated_at: float | None = None

        def _pipe_to_file(pipe, path: Path):
            # Drain a child pipe to its log file, flushing per line so the
            # log can be tailed while the process is still running.
            with path.open("a", encoding="utf-8") as f:
                for line in pipe:
                    f.write(line)
                    f.flush()

        # Reader threads keep the pipes drained so the child never blocks
        # on a full pipe buffer while we wait() below.
        threads: list[threading.Thread] = []
        if proc.stdout:
            t_out = threading.Thread(
                target=_pipe_to_file, args=(proc.stdout, stdout_path), daemon=True
            )
            threads.append(t_out)
            t_out.start()
        if proc.stderr:
            t_err = threading.Thread(
                target=_pipe_to_file, args=(proc.stderr, stderr_path), daemon=True
            )
            threads.append(t_err)
            t_err.start()

        try:
            proc.wait(timeout=self._timeout_s)
        except subprocess.TimeoutExpired:
            # Deadline exceeded: kill the whole tree and record how.
            timed_out = True
            terminated_at = time.time()
            kill_strategy = _kill_process_tree(proc)

        # Bounded join; readers are daemons, so a stuck pipe cannot hang us.
        for t in threads:
            t.join(timeout=5.0)

        end_ts = time.time()

        # Execution metadata (timestamps are epoch seconds from time.time()).
        meta = {
            "cmd": self._cmd,
            "cwd": str(self._cwd),
            "env": {"PYTHONUNBUFFERED": self._env.get("PYTHONUNBUFFERED", "1")},
            "startedAt": start_ts,
            "endedAt": end_ts,
            "returnCode": proc.returncode,
            "timeout": timed_out,
            "timeoutSeconds": self._timeout_s,
            "killStrategy": kill_strategy,
            "terminatedAt": terminated_at,
        }
        execute_path.write_text(json.dumps(meta, indent=2), encoding="utf-8")

        # NOTE(review): `returncode or 0` maps a None returncode to 0, while
        # negative (signal) codes pass through unchanged -- confirm callers
        # expect a killed child to report its negative signal code.
        return proc.returncode or 0, timed_out
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
__all__ = ["SubprocessRunner"]
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""Runtime components for algorithm execution."""
|
|
2
|
+
|
|
3
|
+
from fraclab_sdk.runtime.artifacts import ArtifactWriter
|
|
4
|
+
from fraclab_sdk.runtime.data_client import DataClient
|
|
5
|
+
from fraclab_sdk.runtime.runner_main import RunContext
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
"ArtifactWriter",
|
|
9
|
+
"DataClient",
|
|
10
|
+
"RunContext",
|
|
11
|
+
]
|