fraclab_sdk-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- README.md +1601 -0
- fraclab_sdk/__init__.py +34 -0
- fraclab_sdk/algorithm/__init__.py +13 -0
- fraclab_sdk/algorithm/export.py +1 -0
- fraclab_sdk/algorithm/library.py +378 -0
- fraclab_sdk/cli.py +381 -0
- fraclab_sdk/config.py +54 -0
- fraclab_sdk/devkit/__init__.py +25 -0
- fraclab_sdk/devkit/compile.py +342 -0
- fraclab_sdk/devkit/export.py +354 -0
- fraclab_sdk/devkit/validate.py +1043 -0
- fraclab_sdk/errors.py +124 -0
- fraclab_sdk/materialize/__init__.py +8 -0
- fraclab_sdk/materialize/fsops.py +125 -0
- fraclab_sdk/materialize/hash.py +28 -0
- fraclab_sdk/materialize/materializer.py +241 -0
- fraclab_sdk/models/__init__.py +52 -0
- fraclab_sdk/models/bundle_manifest.py +51 -0
- fraclab_sdk/models/dataspec.py +65 -0
- fraclab_sdk/models/drs.py +47 -0
- fraclab_sdk/models/output_contract.py +111 -0
- fraclab_sdk/models/run_output_manifest.py +119 -0
- fraclab_sdk/results/__init__.py +25 -0
- fraclab_sdk/results/preview.py +150 -0
- fraclab_sdk/results/reader.py +329 -0
- fraclab_sdk/run/__init__.py +10 -0
- fraclab_sdk/run/logs.py +42 -0
- fraclab_sdk/run/manager.py +403 -0
- fraclab_sdk/run/subprocess_runner.py +153 -0
- fraclab_sdk/runtime/__init__.py +11 -0
- fraclab_sdk/runtime/artifacts.py +303 -0
- fraclab_sdk/runtime/data_client.py +123 -0
- fraclab_sdk/runtime/runner_main.py +286 -0
- fraclab_sdk/runtime/snapshot_provider.py +1 -0
- fraclab_sdk/selection/__init__.py +11 -0
- fraclab_sdk/selection/model.py +247 -0
- fraclab_sdk/selection/validate.py +54 -0
- fraclab_sdk/snapshot/__init__.py +12 -0
- fraclab_sdk/snapshot/index.py +94 -0
- fraclab_sdk/snapshot/library.py +205 -0
- fraclab_sdk/snapshot/loader.py +217 -0
- fraclab_sdk/specs/manifest.py +89 -0
- fraclab_sdk/utils/io.py +32 -0
- fraclab_sdk-0.1.0.dist-info/METADATA +1622 -0
- fraclab_sdk-0.1.0.dist-info/RECORD +47 -0
- fraclab_sdk-0.1.0.dist-info/WHEEL +4 -0
- fraclab_sdk-0.1.0.dist-info/entry_points.txt +4 -0
fraclab_sdk/errors.py
ADDED
@@ -0,0 +1,124 @@
+"""SDK error definitions."""
+
+from enum import IntEnum
+
+
+class ExitCode(IntEnum):
+    """CLI exit codes for scripting/CI integration."""
+
+    SUCCESS = 0
+    GENERAL_ERROR = 1
+    INPUT_ERROR = 2  # Input/parameter errors (validation, path not found)
+    RUN_FAILED = 3  # Algorithm execution failed
+    TIMEOUT = 4  # Execution timed out
+    INTERNAL_ERROR = 5  # Unexpected internal error (bug)
+
+
+class FraclabError(Exception):
+    """Base exception for all Fraclab SDK errors.
+
+    Attributes:
+        exit_code: Recommended CLI exit code for this error type.
+    """
+
+    exit_code: ExitCode = ExitCode.GENERAL_ERROR
+
+
+class SnapshotError(FraclabError):
+    """Error related to snapshot operations."""
+
+    exit_code = ExitCode.INPUT_ERROR
+
+
+class AlgorithmError(FraclabError):
+    """Error related to algorithm operations."""
+
+    exit_code = ExitCode.INPUT_ERROR
+
+
+class SelectionError(FraclabError):
+    """Error related to selection operations."""
+
+    exit_code = ExitCode.INPUT_ERROR
+
+
+class MaterializeError(FraclabError):
+    """Error related to materialization operations."""
+
+    exit_code = ExitCode.INTERNAL_ERROR
+
+
+class RunError(FraclabError):
+    """Error related to run execution."""
+
+    exit_code = ExitCode.RUN_FAILED
+
+
+class TimeoutError(RunError):
+    """Error when run execution times out."""
+
+    exit_code = ExitCode.TIMEOUT
+
+
+class ResultError(FraclabError):
+    """Error related to result reading."""
+
+    exit_code = ExitCode.INPUT_ERROR
+
+
+class HashMismatchError(SnapshotError):
+    """Error when file hash doesn't match expected hash."""
+
+    def __init__(self, file_name: str, expected: str, actual: str) -> None:
+        self.file_name = file_name
+        self.expected = expected
+        self.actual = actual
+        super().__init__(
+            f"Hash mismatch for {file_name}: expected {expected[:16]}..., got {actual[:16]}..."
+        )
+
+
+class PathTraversalError(FraclabError):
+    """Error when a path traversal attempt is detected."""
+
+    def __init__(self, path: str) -> None:
+        self.path = path
+        super().__init__(f"Path traversal detected: {path}")
+
+
+class DatasetKeyError(SelectionError):
+    """Error when a required dataset key is not found."""
+
+    def __init__(self, dataset_key: str, available_keys: list[str]) -> None:
+        self.dataset_key = dataset_key
+        self.available_keys = available_keys
+        super().__init__(
+            f"Dataset key '{dataset_key}' not found. Available: {available_keys}"
+        )
+
+
+class CardinalityError(SelectionError):
+    """Error when selection violates cardinality constraints."""
+
+    def __init__(
+        self, dataset_key: str, cardinality: str, selected_count: int
+    ) -> None:
+        self.dataset_key = dataset_key
+        self.cardinality = cardinality
+        self.selected_count = selected_count
+        super().__init__(
+            f"Cardinality violation for '{dataset_key}': "
+            f"cardinality={cardinality}, selected={selected_count}"
+        )
+
+
+class OutputContainmentError(RunError):
+    """Error when algorithm attempts to write outside output directory."""
+
+    def __init__(self, attempted_path: str, output_dir: str) -> None:
+        self.attempted_path = attempted_path
+        self.output_dir = output_dir
+        super().__init__(
+            f"Output containment violation: attempted to write to {attempted_path}, "
+            f"but must be under {output_dir}"
+        )
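For orientation (editorial sketch, not part of the package): a minimal example of how a CLI entry point could map these exceptions onto the ExitCode values. The run_command function is a hypothetical placeholder.

import sys

from fraclab_sdk.errors import ExitCode, FraclabError


def run_command() -> None:
    """Placeholder for a real CLI action (hypothetical)."""


def main() -> int:
    try:
        run_command()
    except FraclabError as exc:
        # Each SDK error carries a recommended exit code, e.g. RUN_FAILED = 3.
        print(f"error: {exc}", file=sys.stderr)
        return int(exc.exit_code)
    except Exception:
        return int(ExitCode.INTERNAL_ERROR)
    return int(ExitCode.SUCCESS)


if __name__ == "__main__":
    sys.exit(main())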
fraclab_sdk/materialize/fsops.py
ADDED
@@ -0,0 +1,125 @@
+"""File system operations for materialization."""
+
+import os
+import shutil
+from pathlib import Path
+
+
+def copy_file_smart(src: Path, dst: Path) -> str:
+    """Copy a file using hardlink > symlink > copy fallback strategy.
+
+    Args:
+        src: Source file path.
+        dst: Destination file path.
+
+    Returns:
+        Strategy used: "hardlink", "symlink", or "copy".
+
+    Raises:
+        FileNotFoundError: If source file doesn't exist.
+        OSError: If all copy strategies fail.
+    """
+    if not src.exists():
+        raise FileNotFoundError(f"Source file not found: {src}")
+
+    # Ensure parent directory exists
+    dst.parent.mkdir(parents=True, exist_ok=True)
+
+    # Try hardlink first
+    try:
+        os.link(src, dst)
+        return "hardlink"
+    except OSError:
+        pass
+
+    # Try symlink
+    try:
+        os.symlink(src.resolve(), dst)
+        return "symlink"
+    except OSError:
+        pass
+
+    # Fall back to copy
+    shutil.copy2(src, dst)
+    return "copy"
+
+
+def copy_directory_smart(src_dir: Path, dst_dir: Path) -> dict[str, int]:
+    """Copy directory contents using smart file copy strategy.
+
+    Each file in the source directory (including nested) is copied
+    using the hardlink > symlink > copy fallback strategy.
+
+    Args:
+        src_dir: Source directory path.
+        dst_dir: Destination directory path.
+
+    Returns:
+        Dict with counts: {"hardlink": N, "symlink": N, "copy": N}
+
+    Raises:
+        FileNotFoundError: If source directory doesn't exist.
+    """
+    if not src_dir.exists():
+        raise FileNotFoundError(f"Source directory not found: {src_dir}")
+
+    counts = {"hardlink": 0, "symlink": 0, "copy": 0}
+
+    for src_file in src_dir.rglob("*"):
+        if src_file.is_file():
+            rel_path = src_file.relative_to(src_dir)
+            dst_file = dst_dir / rel_path
+            strategy = copy_file_smart(src_file, dst_file)
+            counts[strategy] += 1
+
+    return counts
+
+
+def extract_ndjson_lines(
+    src_path: Path,
+    dst_path: Path,
+    line_indices: list[int],
+) -> int:
+    """Extract specific lines from ndjson file and write to new file.
+
+    Lines are written in the order of line_indices (which should be sorted).
+    Output file has contiguous line numbers (0..N-1).
+
+    Args:
+        src_path: Source ndjson file path.
+        dst_path: Destination ndjson file path.
+        line_indices: List of 0-based line indices to extract (must be sorted).
+
+    Returns:
+        Number of lines written.
+
+    Raises:
+        FileNotFoundError: If source file doesn't exist.
+        IndexError: If line index is out of range.
+    """
+    if not src_path.exists():
+        raise FileNotFoundError(f"Source file not found: {src_path}")
+
+    # Ensure parent directory exists
+    dst_path.parent.mkdir(parents=True, exist_ok=True)
+
+    # Read source and extract lines
+    with src_path.open("r", encoding="utf-8") as f:
+        all_lines = f.readlines()
+
+    # Validate indices and extract
+    extracted = []
+    for idx in line_indices:
+        if idx < 0 or idx >= len(all_lines):
+            raise IndexError(f"Line index {idx} out of range (0-{len(all_lines)-1})")
+        extracted.append(all_lines[idx])
+
+    # Write extracted lines
+    with dst_path.open("w", encoding="utf-8") as f:
+        for line in extracted:
+            # Ensure line ends with newline
+            if not line.endswith("\n"):
+                line += "\n"
+            f.write(line)
+
+    return len(extracted)
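A usage sketch (editorial; all paths below are made up) of the two public helpers above:

from pathlib import Path

from fraclab_sdk.materialize.fsops import copy_file_smart, extract_ndjson_lines

# copy_file_smart prefers a hardlink, falls back to a symlink, then a real copy.
strategy = copy_file_smart(
    Path("/tmp/demo/source.parquet"),      # hypothetical source
    Path("/tmp/demo/out/source.parquet"),  # hypothetical destination
)
print(f"copied via {strategy}")  # "hardlink", "symlink", or "copy"

# Extract lines 0, 2 and 5; the output file is renumbered contiguously (0..2).
written = extract_ndjson_lines(
    Path("/tmp/demo/object.ndjson"),
    Path("/tmp/demo/out/object.ndjson"),
    [0, 2, 5],
)
print(f"{written} lines written")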
fraclab_sdk/materialize/hash.py
ADDED
@@ -0,0 +1,28 @@
+"""Hash computation utilities."""
+
+import hashlib
+from pathlib import Path
+
+
+def compute_sha256(data: bytes) -> str:
+    """Compute SHA256 hash of bytes data.
+
+    Args:
+        data: Bytes to hash.
+
+    Returns:
+        Hex-encoded SHA256 hash string.
+    """
+    return hashlib.sha256(data).hexdigest()
+
+
+def compute_file_sha256(path: Path) -> str:
+    """Compute SHA256 hash of a file.
+
+    Args:
+        path: Path to file.
+
+    Returns:
+        Hex-encoded SHA256 hash string.
+    """
+    return compute_sha256(path.read_bytes())
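These helpers pair naturally with HashMismatchError from errors.py. A short editorial sketch of checking a file against an expected digest (the verify_file helper is hypothetical, not part of the package):

from pathlib import Path

from fraclab_sdk.errors import HashMismatchError
from fraclab_sdk.materialize.hash import compute_file_sha256


def verify_file(path: Path, expected_sha256: str) -> None:
    """Raise HashMismatchError if the file's SHA256 differs from the expected value."""
    actual = compute_file_sha256(path)
    if actual != expected_sha256:
        raise HashMismatchError(path.name, expected_sha256, actual)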
fraclab_sdk/materialize/materializer.py
ADDED
@@ -0,0 +1,241 @@
+"""Materializer implementation."""
+
+import json
+import time
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+from fraclab_sdk.errors import MaterializeError
+from fraclab_sdk.materialize.fsops import copy_directory_smart, extract_ndjson_lines
+from fraclab_sdk.materialize.hash import compute_sha256
+from fraclab_sdk.models import DRS, DataSpec
+from fraclab_sdk.snapshot.loader import SnapshotHandle
+
+
+@dataclass
+class MaterializeResult:
+    """Result of materialization."""
+
+    input_dir: Path
+    ds_sha256: str
+    drs_sha256: str
+    copy_stats: dict[str, dict[str, int]]  # dataset_key -> {hardlink, symlink, copy}
+
+
+class Materializer:
+    """Materializes run input from snapshot and selection."""
+
+    def materialize(
+        self,
+        run_dir: Path,
+        snapshot: SnapshotHandle,
+        run_ds: DataSpec,
+        drs: DRS,
+        params: dict[str, Any],
+        run_context: dict[str, Any],
+    ) -> MaterializeResult:
+        """Materialize run input directory.
+
+        Creates runs/<run_id>/input/ with:
+        - manifest.json (with sha256 hashes)
+        - ds.json (run subset, re-indexed)
+        - drs.json (from algorithm)
+        - params.json
+        - run_context.json
+        - data/ (layout-aware materialization)
+
+        Args:
+            run_dir: The run directory (will create input/ subdirectory).
+            snapshot: Source snapshot handle.
+            run_ds: Run DataSpec (re-indexed from selection).
+            drs: Algorithm's DRS.
+            params: Algorithm parameters.
+            run_context: Run context metadata.
+
+        Returns:
+            MaterializeResult with paths and hashes.
+
+        Raises:
+            MaterializeError: If materialization fails.
+        """
+        input_dir = run_dir / "input"
+        input_dir.mkdir(parents=True, exist_ok=True)
+
+        # Write ds.json and compute hash
+        ds_bytes = self._write_json(input_dir / "ds.json", run_ds.model_dump())
+        ds_sha256 = compute_sha256(ds_bytes)
+
+        # Write drs.json and compute hash
+        drs_bytes = self._write_json(input_dir / "drs.json", drs.model_dump())
+        drs_sha256 = compute_sha256(drs_bytes)
+
+        # Write params.json
+        self._write_json(input_dir / "params.json", params)
+
+        # Write run_context.json
+        self._write_json(input_dir / "run_context.json", run_context)
+
+        # Materialize data
+        copy_stats = self._materialize_data(input_dir, snapshot, run_ds)
+
+        # Write manifest.json (last, with computed hashes)
+        # Build datasets entry for manifest
+        datasets_manifest: dict[str, dict] = {}
+        for dataset in run_ds.datasets:
+            datasets_manifest[dataset.datasetKey] = {
+                "layout": dataset.layout,
+                "count": len(dataset.items),
+            }
+
+        manifest = {
+            "bundleVersion": "1.0.0",
+            "createdAtUs": int(time.time() * 1_000_000),
+            "specFiles": {
+                "dsPath": "ds.json",
+                "drsPath": "drs.json",
+                "dsSha256": ds_sha256,
+                "drsSha256": drs_sha256,
+            },
+            "dataRoot": "data",
+            "datasets": datasets_manifest,
+        }
+        self._write_json(input_dir / "manifest.json", manifest)
+
+        return MaterializeResult(
+            input_dir=input_dir,
+            ds_sha256=ds_sha256,
+            drs_sha256=drs_sha256,
+            copy_stats=copy_stats,
+        )
+
+    def _write_json(self, path: Path, data: Any) -> bytes:
+        """Write JSON file and return bytes written.
+
+        Args:
+            path: File path to write.
+            data: Data to serialize as JSON.
+
+        Returns:
+            Bytes that were written.
+        """
+        content = json.dumps(data, indent=2, ensure_ascii=False)
+        content_bytes = content.encode("utf-8")
+        path.write_bytes(content_bytes)
+        return content_bytes
+
+    def _materialize_data(
+        self,
+        input_dir: Path,
+        snapshot: SnapshotHandle,
+        run_ds: DataSpec,
+    ) -> dict[str, dict[str, int]]:
+        """Materialize data directory with layout-aware copying.
+
+        Args:
+            input_dir: The input directory.
+            snapshot: Source snapshot.
+            run_ds: Run DataSpec with re-indexed items.
+
+        Returns:
+            Copy stats per dataset.
+        """
+        data_dir = input_dir / "data"
+        copy_stats: dict[str, dict[str, int]] = {}
+
+        for dataset in run_ds.datasets:
+            dataset_key = dataset.datasetKey
+            layout = dataset.layout
+
+            if layout == "frame_parquet_item_dirs":
+                stats = self._materialize_parquet(
+                    data_dir, snapshot, dataset_key, dataset.items
+                )
+                copy_stats[dataset_key] = stats
+            elif layout == "object_ndjson_lines":
+                self._materialize_ndjson(
+                    data_dir, snapshot, dataset_key, dataset.items
+                )
+                copy_stats[dataset_key] = {"ndjson_lines": len(dataset.items)}
+            else:
+                raise MaterializeError(f"Unknown layout: {layout}")
+
+        return copy_stats
+
+    def _materialize_parquet(
+        self,
+        data_dir: Path,
+        snapshot: SnapshotHandle,
+        dataset_key: str,
+        items: list,
+    ) -> dict[str, int]:
+        """Materialize parquet item directories with re-indexing.
+
+        Source: snapshot/data/<datasetKey>/parquet/item-<snapshot_index:05d>/
+        Target: run/input/data/<datasetKey>/parquet/item-<run_index:05d>/
+
+        Args:
+            data_dir: Target data directory.
+            snapshot: Source snapshot.
+            dataset_key: Dataset key.
+            items: List of DataSpecItem (with sourceItemIndex).
+
+        Returns:
+            Copy stats {hardlink, symlink, copy}.
+        """
+        total_stats = {"hardlink": 0, "symlink": 0, "copy": 0}
+
+        for run_index, item in enumerate(items):
+            snapshot_index = item.sourceItemIndex
+            if snapshot_index is None:
+                raise MaterializeError(
+                    f"Item at run index {run_index} missing sourceItemIndex"
+                )
+
+            src_dir = snapshot.get_item_dir(dataset_key, snapshot_index)
+            dst_dir = data_dir / dataset_key / "parquet" / f"item-{run_index:05d}"
+
+            if not src_dir.exists():
+                raise MaterializeError(f"Source item directory not found: {src_dir}")
+
+            stats = copy_directory_smart(src_dir, dst_dir)
+            for key in total_stats:
+                total_stats[key] += stats[key]
+
+        return total_stats
+
+    def _materialize_ndjson(
+        self,
+        data_dir: Path,
+        snapshot: SnapshotHandle,
+        dataset_key: str,
+        items: list,
+    ) -> None:
+        """Materialize ndjson by extracting selected lines.
+
+        Extracts lines by snapshot index and writes contiguously.
+        Run item 0 = line 0, run item 1 = line 1, etc.
+
+        Args:
+            data_dir: Target data directory.
+            snapshot: Source snapshot.
+            dataset_key: Dataset key.
+            items: List of DataSpecItem (with sourceItemIndex).
+        """
+        # Get snapshot indices in order
+        snapshot_indices = []
+        for run_index, item in enumerate(items):
+            snapshot_index = item.sourceItemIndex
+            if snapshot_index is None:
+                raise MaterializeError(
+                    f"Item at run index {run_index} missing sourceItemIndex"
+                )
+            snapshot_indices.append(snapshot_index)
+
+        src_path = snapshot.get_ndjson_path(dataset_key)
+        dst_path = data_dir / dataset_key / "object.ndjson"
+
+        if not src_path.exists():
+            raise MaterializeError(f"Source ndjson not found: {src_path}")
+
+        extract_ndjson_lines(src_path, dst_path, snapshot_indices)
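Because manifest.json is written last and records the SHA256 of the exact ds.json/drs.json bytes, a materialized input directory can be re-checked after the fact. An editorial sketch (verify_input_dir is hypothetical; input_dir is assumed to be a runs/<run_id>/input/ directory produced by Materializer.materialize):

import json
from pathlib import Path

from fraclab_sdk.materialize.hash import compute_file_sha256


def verify_input_dir(input_dir: Path) -> None:
    """Re-check ds.json and drs.json against the hashes recorded in manifest.json."""
    manifest = json.loads((input_dir / "manifest.json").read_text(encoding="utf-8"))
    spec = manifest["specFiles"]
    assert compute_file_sha256(input_dir / spec["dsPath"]) == spec["dsSha256"]
    assert compute_file_sha256(input_dir / spec["drsPath"]) == spec["drsSha256"]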
fraclab_sdk/models/__init__.py
ADDED
@@ -0,0 +1,52 @@
+"""Data models for SDK."""
+
+from fraclab_sdk.models.bundle_manifest import (
+    BundleManifest,
+    DatasetEntry,
+    DatasetEntryFile,
+    SpecFiles,
+)
+from fraclab_sdk.models.dataspec import DataSpec, DataSpecDataset, DataSpecItem
+from fraclab_sdk.models.drs import DRS, DRSDataset
+from fraclab_sdk.models.output_contract import (
+    BlobOutputSchema,
+    FrameOutputSchema,
+    ObjectOutputSchema,
+    OutputContract,
+    OutputDatasetContract,
+    OutputSchema,
+    ScalarOutputSchema,
+)
+from fraclab_sdk.models.run_output_manifest import (
+    ArtifactInfo,
+    OwnerRef,
+    RunInfo,
+    RunOutputDataset,
+    RunOutputItem,
+    RunOutputManifest,
+)
+
+__all__ = [
+    "BundleManifest",
+    "DatasetEntry",
+    "DatasetEntryFile",
+    "SpecFiles",
+    "DataSpec",
+    "DataSpecDataset",
+    "DataSpecItem",
+    "DRS",
+    "DRSDataset",
+    "OutputSchema",
+    "ScalarOutputSchema",
+    "FrameOutputSchema",
+    "ObjectOutputSchema",
+    "BlobOutputSchema",
+    "OutputContract",
+    "OutputDatasetContract",
+    "ArtifactInfo",
+    "OwnerRef",
+    "RunInfo",
+    "RunOutputDataset",
+    "RunOutputItem",
+    "RunOutputManifest",
+]
fraclab_sdk/models/bundle_manifest.py
ADDED
@@ -0,0 +1,51 @@
+"""Bundle manifest model (Data Bundle Spec v1.0.0)."""
+
+from typing import Literal
+
+from pydantic import BaseModel, ConfigDict
+
+
+class SpecFiles(BaseModel):
+    """Specification files metadata."""
+
+    model_config = ConfigDict(extra="ignore")
+
+    dsPath: str = "ds.json"
+    drsPath: str = "drs.json"
+    dsSha256: str
+    drsSha256: str
+
+
+class DatasetEntryFile(BaseModel):
+    """Individual file entry in dataset files list."""
+
+    model_config = ConfigDict(extra="ignore")
+
+    path: str
+    sha256: str
+    bytes: int | None = None
+
+
+class DatasetEntry(BaseModel):
+    """Dataset entry in manifest."""
+
+    model_config = ConfigDict(extra="ignore")
+
+    layout: Literal["object_ndjson_lines", "frame_parquet_item_dirs"]
+    count: int
+    files: list[DatasetEntryFile] | None = None
+
+
+class BundleManifest(BaseModel):
+    """Manifest for a data bundle (Data Bundle Spec v1.0.0).
+
+    Contains metadata and integrity hashes for the bundle contents.
+    """
+
+    model_config = ConfigDict(extra="ignore")
+
+    bundleVersion: str
+    createdAtUs: int
+    specFiles: SpecFiles
+    dataRoot: str = "data"
+    datasets: dict[str, DatasetEntry]
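An editorial sketch of loading a bundle manifest with these pydantic v2 models (the manifest path is made up):

from pathlib import Path

from fraclab_sdk.models import BundleManifest

manifest_path = Path("snapshot/manifest.json")  # hypothetical location
manifest = BundleManifest.model_validate_json(manifest_path.read_text(encoding="utf-8"))

print(manifest.bundleVersion, manifest.specFiles.dsSha256)
for key, entry in manifest.datasets.items():
    print(f"{key}: layout={entry.layout}, count={entry.count}")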
fraclab_sdk/models/dataspec.py
ADDED
@@ -0,0 +1,65 @@
+"""DataSpec model."""
+
+from typing import Any
+
+from pydantic import BaseModel, ConfigDict, Field, field_validator
+
+
+class OwnerIds(BaseModel):
+    """Structured owner identifiers."""
+
+    model_config = ConfigDict(extra="ignore")
+
+    platformId: str | None = None
+    wellId: str | None = None
+    stageId: str | None = None
+
+
+class DataSpecItem(BaseModel):
+    """An item within a dataset."""
+
+    model_config = ConfigDict(extra="ignore")
+
+    owner: OwnerIds | str | None = None
+    resolutionParams: dict[str, Any] | None = None
+    range: dict[str, Any] | None = None
+    sourceItemIndex: int | None = None  # For traceability in run ds
+
+
+class DataSpecDataset(BaseModel):
+    """A dataset within a DataSpec."""
+
+    model_config = ConfigDict(extra="ignore", populate_by_name=True)
+
+    datasetKey: str = Field(alias="key")
+    resourceType: str | None = None
+    layout: str | None = None  # e.g., "frame_parquet_item_dirs" or "object_ndjson_lines"
+    items: list[DataSpecItem] = Field(default_factory=list)
+
+
+class DataSpec(BaseModel):
+    """Data specification describing datasets and their items."""
+
+    model_config = ConfigDict(extra="ignore", populate_by_name=True)
+
+    schemaVersion: str | None = None
+    datasets: list[DataSpecDataset] = Field(default_factory=list)
+
+    @field_validator("datasets", mode="before")
+    @classmethod
+    def _coerce_datasets(cls, v):
+        """Accept mapping form {'key': {...}} by converting to list."""
+        if isinstance(v, dict):
+            return [{"key": k, **(val or {})} for k, val in v.items()]
+        return v
+
+    def get_dataset(self, dataset_key: str) -> DataSpecDataset | None:
+        """Get a dataset by key."""
+        for ds in self.datasets:
+            if ds.datasetKey == dataset_key:
+                return ds
+        return None
+
+    def get_dataset_keys(self) -> list[str]:
+        """Get all dataset keys."""
+        return [ds.datasetKey for ds in self.datasets]
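The _coerce_datasets validator lets a DataSpec be built from either a list of datasets or a mapping keyed by datasetKey. An editorial sketch (the dataset key and values below are made up):

from fraclab_sdk.models import DataSpec

ds = DataSpec.model_validate(
    {
        "schemaVersion": "1.0.0",
        "datasets": {
            # Mapping form: each key becomes datasetKey via the "key" alias.
            "pressure": {
                "layout": "frame_parquet_item_dirs",
                "items": [{"sourceItemIndex": 0}, {"sourceItemIndex": 3}],
            }
        },
    }
)

assert ds.get_dataset_keys() == ["pressure"]
pressure = ds.get_dataset("pressure")
assert pressure is not None and len(pressure.items) == 2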