fraclab_sdk-0.1.0-py3-none-any.whl

This diff represents the content of a publicly available package version as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
Files changed (47)
  1. README.md +1601 -0
  2. fraclab_sdk/__init__.py +34 -0
  3. fraclab_sdk/algorithm/__init__.py +13 -0
  4. fraclab_sdk/algorithm/export.py +1 -0
  5. fraclab_sdk/algorithm/library.py +378 -0
  6. fraclab_sdk/cli.py +381 -0
  7. fraclab_sdk/config.py +54 -0
  8. fraclab_sdk/devkit/__init__.py +25 -0
  9. fraclab_sdk/devkit/compile.py +342 -0
  10. fraclab_sdk/devkit/export.py +354 -0
  11. fraclab_sdk/devkit/validate.py +1043 -0
  12. fraclab_sdk/errors.py +124 -0
  13. fraclab_sdk/materialize/__init__.py +8 -0
  14. fraclab_sdk/materialize/fsops.py +125 -0
  15. fraclab_sdk/materialize/hash.py +28 -0
  16. fraclab_sdk/materialize/materializer.py +241 -0
  17. fraclab_sdk/models/__init__.py +52 -0
  18. fraclab_sdk/models/bundle_manifest.py +51 -0
  19. fraclab_sdk/models/dataspec.py +65 -0
  20. fraclab_sdk/models/drs.py +47 -0
  21. fraclab_sdk/models/output_contract.py +111 -0
  22. fraclab_sdk/models/run_output_manifest.py +119 -0
  23. fraclab_sdk/results/__init__.py +25 -0
  24. fraclab_sdk/results/preview.py +150 -0
  25. fraclab_sdk/results/reader.py +329 -0
  26. fraclab_sdk/run/__init__.py +10 -0
  27. fraclab_sdk/run/logs.py +42 -0
  28. fraclab_sdk/run/manager.py +403 -0
  29. fraclab_sdk/run/subprocess_runner.py +153 -0
  30. fraclab_sdk/runtime/__init__.py +11 -0
  31. fraclab_sdk/runtime/artifacts.py +303 -0
  32. fraclab_sdk/runtime/data_client.py +123 -0
  33. fraclab_sdk/runtime/runner_main.py +286 -0
  34. fraclab_sdk/runtime/snapshot_provider.py +1 -0
  35. fraclab_sdk/selection/__init__.py +11 -0
  36. fraclab_sdk/selection/model.py +247 -0
  37. fraclab_sdk/selection/validate.py +54 -0
  38. fraclab_sdk/snapshot/__init__.py +12 -0
  39. fraclab_sdk/snapshot/index.py +94 -0
  40. fraclab_sdk/snapshot/library.py +205 -0
  41. fraclab_sdk/snapshot/loader.py +217 -0
  42. fraclab_sdk/specs/manifest.py +89 -0
  43. fraclab_sdk/utils/io.py +32 -0
  44. fraclab_sdk-0.1.0.dist-info/METADATA +1622 -0
  45. fraclab_sdk-0.1.0.dist-info/RECORD +47 -0
  46. fraclab_sdk-0.1.0.dist-info/WHEEL +4 -0
  47. fraclab_sdk-0.1.0.dist-info/entry_points.txt +4 -0
fraclab_sdk/errors.py ADDED
@@ -0,0 +1,124 @@
+ """SDK error definitions."""
+
+ from enum import IntEnum
+
+
+ class ExitCode(IntEnum):
+     """CLI exit codes for scripting/CI integration."""
+
+     SUCCESS = 0
+     GENERAL_ERROR = 1
+     INPUT_ERROR = 2  # Input/parameter errors (validation, path not found)
+     RUN_FAILED = 3  # Algorithm execution failed
+     TIMEOUT = 4  # Execution timed out
+     INTERNAL_ERROR = 5  # Unexpected internal error (bug)
+
+
+ class FraclabError(Exception):
+     """Base exception for all Fraclab SDK errors.
+
+     Attributes:
+         exit_code: Recommended CLI exit code for this error type.
+     """
+
+     exit_code: ExitCode = ExitCode.GENERAL_ERROR
+
+
+ class SnapshotError(FraclabError):
+     """Error related to snapshot operations."""
+
+     exit_code = ExitCode.INPUT_ERROR
+
+
+ class AlgorithmError(FraclabError):
+     """Error related to algorithm operations."""
+
+     exit_code = ExitCode.INPUT_ERROR
+
+
+ class SelectionError(FraclabError):
+     """Error related to selection operations."""
+
+     exit_code = ExitCode.INPUT_ERROR
+
+
+ class MaterializeError(FraclabError):
+     """Error related to materialization operations."""
+
+     exit_code = ExitCode.INTERNAL_ERROR
+
+
+ class RunError(FraclabError):
+     """Error related to run execution."""
+
+     exit_code = ExitCode.RUN_FAILED
+
+
+ class TimeoutError(RunError):
+     """Error when run execution times out."""
+
+     exit_code = ExitCode.TIMEOUT
+
+
+ class ResultError(FraclabError):
+     """Error related to result reading."""
+
+     exit_code = ExitCode.INPUT_ERROR
+
+
+ class HashMismatchError(SnapshotError):
+     """Error when file hash doesn't match expected hash."""
+
+     def __init__(self, file_name: str, expected: str, actual: str) -> None:
+         self.file_name = file_name
+         self.expected = expected
+         self.actual = actual
+         super().__init__(
+             f"Hash mismatch for {file_name}: expected {expected[:16]}..., got {actual[:16]}..."
+         )
+
+
+ class PathTraversalError(FraclabError):
+     """Error when a path traversal attempt is detected."""
+
+     def __init__(self, path: str) -> None:
+         self.path = path
+         super().__init__(f"Path traversal detected: {path}")
+
+
+ class DatasetKeyError(SelectionError):
+     """Error when a required dataset key is not found."""
+
+     def __init__(self, dataset_key: str, available_keys: list[str]) -> None:
+         self.dataset_key = dataset_key
+         self.available_keys = available_keys
+         super().__init__(
+             f"Dataset key '{dataset_key}' not found. Available: {available_keys}"
+         )
+
+
+ class CardinalityError(SelectionError):
+     """Error when selection violates cardinality constraints."""
+
+     def __init__(
+         self, dataset_key: str, cardinality: str, selected_count: int
+     ) -> None:
+         self.dataset_key = dataset_key
+         self.cardinality = cardinality
+         self.selected_count = selected_count
+         super().__init__(
+             f"Cardinality violation for '{dataset_key}': "
+             f"cardinality={cardinality}, selected={selected_count}"
+         )
+
+
+ class OutputContainmentError(RunError):
+     """Error when algorithm attempts to write outside output directory."""
+
+     def __init__(self, attempted_path: str, output_dir: str) -> None:
+         self.attempted_path = attempted_path
+         self.output_dir = output_dir
+         super().__init__(
+             f"Output containment violation: attempted to write to {attempted_path}, "
+             f"but must be under {output_dir}"
+         )
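
For orientation, here is a minimal sketch, not part of the package, of how a caller might map these exceptions to a process exit status; load_snapshot is a hypothetical stand-in for a real SDK call.

import sys

from fraclab_sdk.errors import ExitCode, FraclabError, SnapshotError


def load_snapshot(path: str) -> None:
    """Hypothetical placeholder that fails the way an SDK call would."""
    raise SnapshotError(f"snapshot not found: {path}")


def main() -> int:
    try:
        load_snapshot("snapshots/missing")
    except FraclabError as exc:
        # Every SDK error class pins a recommended CLI exit code.
        print(f"error: {exc}", file=sys.stderr)
        return int(exc.exit_code)  # SnapshotError -> ExitCode.INPUT_ERROR == 2
    return int(ExitCode.SUCCESS)


if __name__ == "__main__":
    sys.exit(main())

Because each subclass pins exit_code, CI scripts can branch on the process status without parsing error messages.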
fraclab_sdk/materialize/__init__.py ADDED
@@ -0,0 +1,8 @@
+ """Materialization management."""
+
+ from fraclab_sdk.materialize.materializer import Materializer, MaterializeResult
+
+ __all__ = [
+     "Materializer",
+     "MaterializeResult",
+ ]
fraclab_sdk/materialize/fsops.py ADDED
@@ -0,0 +1,125 @@
+ """File system operations for materialization."""
+
+ import os
+ import shutil
+ from pathlib import Path
+
+
+ def copy_file_smart(src: Path, dst: Path) -> str:
+     """Copy a file using hardlink > symlink > copy fallback strategy.
+
+     Args:
+         src: Source file path.
+         dst: Destination file path.
+
+     Returns:
+         Strategy used: "hardlink", "symlink", or "copy".
+
+     Raises:
+         FileNotFoundError: If source file doesn't exist.
+         OSError: If all copy strategies fail.
+     """
+     if not src.exists():
+         raise FileNotFoundError(f"Source file not found: {src}")
+
+     # Ensure parent directory exists
+     dst.parent.mkdir(parents=True, exist_ok=True)
+
+     # Try hardlink first
+     try:
+         os.link(src, dst)
+         return "hardlink"
+     except OSError:
+         pass
+
+     # Try symlink
+     try:
+         os.symlink(src.resolve(), dst)
+         return "symlink"
+     except OSError:
+         pass
+
+     # Fall back to copy
+     shutil.copy2(src, dst)
+     return "copy"
+
+
+ def copy_directory_smart(src_dir: Path, dst_dir: Path) -> dict[str, int]:
+     """Copy directory contents using smart file copy strategy.
+
+     Each file in the source directory (including nested) is copied
+     using the hardlink > symlink > copy fallback strategy.
+
+     Args:
+         src_dir: Source directory path.
+         dst_dir: Destination directory path.
+
+     Returns:
+         Dict with counts: {"hardlink": N, "symlink": N, "copy": N}
+
+     Raises:
+         FileNotFoundError: If source directory doesn't exist.
+     """
+     if not src_dir.exists():
+         raise FileNotFoundError(f"Source directory not found: {src_dir}")
+
+     counts = {"hardlink": 0, "symlink": 0, "copy": 0}
+
+     for src_file in src_dir.rglob("*"):
+         if src_file.is_file():
+             rel_path = src_file.relative_to(src_dir)
+             dst_file = dst_dir / rel_path
+             strategy = copy_file_smart(src_file, dst_file)
+             counts[strategy] += 1
+
+     return counts
+
+
+ def extract_ndjson_lines(
+     src_path: Path,
+     dst_path: Path,
+     line_indices: list[int],
+ ) -> int:
+     """Extract specific lines from ndjson file and write to new file.
+
+     Lines are written in the order of line_indices (which should be sorted).
+     Output file has contiguous line numbers (0..N-1).
+
+     Args:
+         src_path: Source ndjson file path.
+         dst_path: Destination ndjson file path.
+         line_indices: List of 0-based line indices to extract (must be sorted).
+
+     Returns:
+         Number of lines written.
+
+     Raises:
+         FileNotFoundError: If source file doesn't exist.
+         IndexError: If line index is out of range.
+     """
+     if not src_path.exists():
+         raise FileNotFoundError(f"Source file not found: {src_path}")
+
+     # Ensure parent directory exists
+     dst_path.parent.mkdir(parents=True, exist_ok=True)
+
+     # Read source and extract lines
+     with src_path.open("r", encoding="utf-8") as f:
+         all_lines = f.readlines()
+
+     # Validate indices and extract
+     extracted = []
+     for idx in line_indices:
+         if idx < 0 or idx >= len(all_lines):
+             raise IndexError(f"Line index {idx} out of range (0-{len(all_lines)-1})")
+         extracted.append(all_lines[idx])
+
+     # Write extracted lines
+     with dst_path.open("w", encoding="utf-8") as f:
+         for line in extracted:
+             # Ensure line ends with newline
+             if not line.endswith("\n"):
+                 line += "\n"
+             f.write(line)
+
+     return len(extracted)
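
A self-contained sketch, with made-up file contents and paths, of the two helpers the materializer leans on most; it is illustrative only and not part of the package.

import tempfile
from pathlib import Path

from fraclab_sdk.materialize.fsops import copy_file_smart, extract_ndjson_lines

with tempfile.TemporaryDirectory() as tmp:
    root = Path(tmp)
    src = root / "object.ndjson"
    src.write_text('{"i": 0}\n{"i": 1}\n{"i": 2}\n{"i": 3}\n', encoding="utf-8")

    # Keep source lines 0 and 2; the output file is re-numbered 0..1.
    written = extract_ndjson_lines(src, root / "subset.ndjson", [0, 2])
    print(written)  # 2

    # Reports which fallback (hardlink > symlink > copy) actually succeeded.
    strategy = copy_file_smart(src, root / "copied" / "object.ndjson")
    print(strategy)  # typically "hardlink" on the same filesystem

The hardlink-first order keeps materialized runs cheap when the snapshot and run directories share a filesystem, while the copy fallback keeps the operation working across filesystems.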
fraclab_sdk/materialize/hash.py ADDED
@@ -0,0 +1,28 @@
+ """Hash computation utilities."""
+
+ import hashlib
+ from pathlib import Path
+
+
+ def compute_sha256(data: bytes) -> str:
+     """Compute SHA256 hash of bytes data.
+
+     Args:
+         data: Bytes to hash.
+
+     Returns:
+         Hex-encoded SHA256 hash string.
+     """
+     return hashlib.sha256(data).hexdigest()
+
+
+ def compute_file_sha256(path: Path) -> str:
+     """Compute SHA256 hash of a file.
+
+     Args:
+         path: Path to file.
+
+     Returns:
+         Hex-encoded SHA256 hash string.
+     """
+     return compute_sha256(path.read_bytes())
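
As a hedged example, these helpers can re-check the integrity hashes that the materializer (next file) records in manifest.json; the run directory below is hypothetical and assumes a run has already been materialized.

import json
from pathlib import Path

from fraclab_sdk.materialize.hash import compute_file_sha256

input_dir = Path("runs/run-0001/input")  # hypothetical materialized run
manifest = json.loads((input_dir / "manifest.json").read_text(encoding="utf-8"))

spec_files = manifest["specFiles"]
actual = compute_file_sha256(input_dir / spec_files["dsPath"])
assert actual == spec_files["dsSha256"], "ds.json does not match its recorded hash"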
fraclab_sdk/materialize/materializer.py ADDED
@@ -0,0 +1,241 @@
+ """Materializer implementation."""
+
+ import json
+ import time
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Any
+
+ from fraclab_sdk.errors import MaterializeError
+ from fraclab_sdk.materialize.fsops import copy_directory_smart, extract_ndjson_lines
+ from fraclab_sdk.materialize.hash import compute_sha256
+ from fraclab_sdk.models import DRS, DataSpec
+ from fraclab_sdk.snapshot.loader import SnapshotHandle
+
+
+ @dataclass
+ class MaterializeResult:
+     """Result of materialization."""
+
+     input_dir: Path
+     ds_sha256: str
+     drs_sha256: str
+     copy_stats: dict[str, dict[str, int]]  # dataset_key -> {hardlink, symlink, copy}
+
+
+ class Materializer:
+     """Materializes run input from snapshot and selection."""
+
+     def materialize(
+         self,
+         run_dir: Path,
+         snapshot: SnapshotHandle,
+         run_ds: DataSpec,
+         drs: DRS,
+         params: dict[str, Any],
+         run_context: dict[str, Any],
+     ) -> MaterializeResult:
+         """Materialize run input directory.
+
+         Creates runs/<run_id>/input/ with:
+         - manifest.json (with sha256 hashes)
+         - ds.json (run subset, re-indexed)
+         - drs.json (from algorithm)
+         - params.json
+         - run_context.json
+         - data/ (layout-aware materialization)
+
+         Args:
+             run_dir: The run directory (will create input/ subdirectory).
+             snapshot: Source snapshot handle.
+             run_ds: Run DataSpec (re-indexed from selection).
+             drs: Algorithm's DRS.
+             params: Algorithm parameters.
+             run_context: Run context metadata.
+
+         Returns:
+             MaterializeResult with paths and hashes.
+
+         Raises:
+             MaterializeError: If materialization fails.
+         """
+         input_dir = run_dir / "input"
+         input_dir.mkdir(parents=True, exist_ok=True)
+
+         # Write ds.json and compute hash
+         ds_bytes = self._write_json(input_dir / "ds.json", run_ds.model_dump())
+         ds_sha256 = compute_sha256(ds_bytes)
+
+         # Write drs.json and compute hash
+         drs_bytes = self._write_json(input_dir / "drs.json", drs.model_dump())
+         drs_sha256 = compute_sha256(drs_bytes)
+
+         # Write params.json
+         self._write_json(input_dir / "params.json", params)
+
+         # Write run_context.json
+         self._write_json(input_dir / "run_context.json", run_context)
+
+         # Materialize data
+         copy_stats = self._materialize_data(input_dir, snapshot, run_ds)
+
+         # Write manifest.json (last, with computed hashes)
+         # Build datasets entry for manifest
+         datasets_manifest: dict[str, dict] = {}
+         for dataset in run_ds.datasets:
+             datasets_manifest[dataset.datasetKey] = {
+                 "layout": dataset.layout,
+                 "count": len(dataset.items),
+             }
+
+         manifest = {
+             "bundleVersion": "1.0.0",
+             "createdAtUs": int(time.time() * 1_000_000),
+             "specFiles": {
+                 "dsPath": "ds.json",
+                 "drsPath": "drs.json",
+                 "dsSha256": ds_sha256,
+                 "drsSha256": drs_sha256,
+             },
+             "dataRoot": "data",
+             "datasets": datasets_manifest,
+         }
+         self._write_json(input_dir / "manifest.json", manifest)
+
+         return MaterializeResult(
+             input_dir=input_dir,
+             ds_sha256=ds_sha256,
+             drs_sha256=drs_sha256,
+             copy_stats=copy_stats,
+         )
+
+     def _write_json(self, path: Path, data: Any) -> bytes:
+         """Write JSON file and return bytes written.
+
+         Args:
+             path: File path to write.
+             data: Data to serialize as JSON.
+
+         Returns:
+             Bytes that were written.
+         """
+         content = json.dumps(data, indent=2, ensure_ascii=False)
+         content_bytes = content.encode("utf-8")
+         path.write_bytes(content_bytes)
+         return content_bytes
+
+     def _materialize_data(
+         self,
+         input_dir: Path,
+         snapshot: SnapshotHandle,
+         run_ds: DataSpec,
+     ) -> dict[str, dict[str, int]]:
+         """Materialize data directory with layout-aware copying.
+
+         Args:
+             input_dir: The input directory.
+             snapshot: Source snapshot.
+             run_ds: Run DataSpec with re-indexed items.
+
+         Returns:
+             Copy stats per dataset.
+         """
+         data_dir = input_dir / "data"
+         copy_stats: dict[str, dict[str, int]] = {}
+
+         for dataset in run_ds.datasets:
+             dataset_key = dataset.datasetKey
+             layout = dataset.layout
+
+             if layout == "frame_parquet_item_dirs":
+                 stats = self._materialize_parquet(
+                     data_dir, snapshot, dataset_key, dataset.items
+                 )
+                 copy_stats[dataset_key] = stats
+             elif layout == "object_ndjson_lines":
+                 self._materialize_ndjson(
+                     data_dir, snapshot, dataset_key, dataset.items
+                 )
+                 copy_stats[dataset_key] = {"ndjson_lines": len(dataset.items)}
+             else:
+                 raise MaterializeError(f"Unknown layout: {layout}")
+
+         return copy_stats
+
+     def _materialize_parquet(
+         self,
+         data_dir: Path,
+         snapshot: SnapshotHandle,
+         dataset_key: str,
+         items: list,
+     ) -> dict[str, int]:
+         """Materialize parquet item directories with re-indexing.
+
+         Source: snapshot/data/<datasetKey>/parquet/item-<snapshot_index:05d>/
+         Target: run/input/data/<datasetKey>/parquet/item-<run_index:05d>/
+
+         Args:
+             data_dir: Target data directory.
+             snapshot: Source snapshot.
+             dataset_key: Dataset key.
+             items: List of DataSpecItem (with sourceItemIndex).
+
+         Returns:
+             Copy stats {hardlink, symlink, copy}.
+         """
+         total_stats = {"hardlink": 0, "symlink": 0, "copy": 0}
+
+         for run_index, item in enumerate(items):
+             snapshot_index = item.sourceItemIndex
+             if snapshot_index is None:
+                 raise MaterializeError(
+                     f"Item at run index {run_index} missing sourceItemIndex"
+                 )
+
+             src_dir = snapshot.get_item_dir(dataset_key, snapshot_index)
+             dst_dir = data_dir / dataset_key / "parquet" / f"item-{run_index:05d}"
+
+             if not src_dir.exists():
+                 raise MaterializeError(f"Source item directory not found: {src_dir}")
+
+             stats = copy_directory_smart(src_dir, dst_dir)
+             for key in total_stats:
+                 total_stats[key] += stats[key]
+
+         return total_stats
+
+     def _materialize_ndjson(
+         self,
+         data_dir: Path,
+         snapshot: SnapshotHandle,
+         dataset_key: str,
+         items: list,
+     ) -> None:
+         """Materialize ndjson by extracting selected lines.
+
+         Extracts lines by snapshot index and writes contiguously.
+         Run item 0 = line 0, run item 1 = line 1, etc.
+
+         Args:
+             data_dir: Target data directory.
+             snapshot: Source snapshot.
+             dataset_key: Dataset key.
+             items: List of DataSpecItem (with sourceItemIndex).
+         """
+         # Get snapshot indices in order
+         snapshot_indices = []
+         for run_index, item in enumerate(items):
+             snapshot_index = item.sourceItemIndex
+             if snapshot_index is None:
+                 raise MaterializeError(
+                     f"Item at run index {run_index} missing sourceItemIndex"
+                 )
+             snapshot_indices.append(snapshot_index)
+
+         src_path = snapshot.get_ndjson_path(dataset_key)
+         dst_path = data_dir / dataset_key / "object.ndjson"
+
+         if not src_path.exists():
+             raise MaterializeError(f"Source ndjson not found: {src_path}")
+
+         extract_ndjson_lines(src_path, dst_path, snapshot_indices)
fraclab_sdk/models/__init__.py ADDED
@@ -0,0 +1,52 @@
+ """Data models for SDK."""
+
+ from fraclab_sdk.models.bundle_manifest import (
+     BundleManifest,
+     DatasetEntry,
+     DatasetEntryFile,
+     SpecFiles,
+ )
+ from fraclab_sdk.models.dataspec import DataSpec, DataSpecDataset, DataSpecItem
+ from fraclab_sdk.models.drs import DRS, DRSDataset
+ from fraclab_sdk.models.output_contract import (
+     BlobOutputSchema,
+     FrameOutputSchema,
+     ObjectOutputSchema,
+     OutputContract,
+     OutputDatasetContract,
+     OutputSchema,
+     ScalarOutputSchema,
+ )
+ from fraclab_sdk.models.run_output_manifest import (
+     ArtifactInfo,
+     OwnerRef,
+     RunInfo,
+     RunOutputDataset,
+     RunOutputItem,
+     RunOutputManifest,
+ )
+
+ __all__ = [
+     "BundleManifest",
+     "DatasetEntry",
+     "DatasetEntryFile",
+     "SpecFiles",
+     "DataSpec",
+     "DataSpecDataset",
+     "DataSpecItem",
+     "DRS",
+     "DRSDataset",
+     "OutputSchema",
+     "ScalarOutputSchema",
+     "FrameOutputSchema",
+     "ObjectOutputSchema",
+     "BlobOutputSchema",
+     "OutputContract",
+     "OutputDatasetContract",
+     "ArtifactInfo",
+     "OwnerRef",
+     "RunInfo",
+     "RunOutputDataset",
+     "RunOutputItem",
+     "RunOutputManifest",
+ ]
fraclab_sdk/models/bundle_manifest.py ADDED
@@ -0,0 +1,51 @@
+ """Bundle manifest model (Data Bundle Spec v1.0.0)."""
+
+ from typing import Literal
+
+ from pydantic import BaseModel, ConfigDict
+
+
+ class SpecFiles(BaseModel):
+     """Specification files metadata."""
+
+     model_config = ConfigDict(extra="ignore")
+
+     dsPath: str = "ds.json"
+     drsPath: str = "drs.json"
+     dsSha256: str
+     drsSha256: str
+
+
+ class DatasetEntryFile(BaseModel):
+     """Individual file entry in dataset files list."""
+
+     model_config = ConfigDict(extra="ignore")
+
+     path: str
+     sha256: str
+     bytes: int | None = None
+
+
+ class DatasetEntry(BaseModel):
+     """Dataset entry in manifest."""
+
+     model_config = ConfigDict(extra="ignore")
+
+     layout: Literal["object_ndjson_lines", "frame_parquet_item_dirs"]
+     count: int
+     files: list[DatasetEntryFile] | None = None
+
+
+ class BundleManifest(BaseModel):
+     """Manifest for a data bundle (Data Bundle Spec v1.0.0).
+
+     Contains metadata and integrity hashes for the bundle contents.
+     """
+
+     model_config = ConfigDict(extra="ignore")
+
+     bundleVersion: str
+     createdAtUs: int
+     specFiles: SpecFiles
+     dataRoot: str = "data"
+     datasets: dict[str, DatasetEntry]
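
An illustrative sketch of validating a manifest of the shape the Materializer writes; the dataset key and hash values below are placeholders, not real package data.

from fraclab_sdk.models import BundleManifest

manifest = BundleManifest.model_validate(
    {
        "bundleVersion": "1.0.0",
        "createdAtUs": 1_700_000_000_000_000,
        "specFiles": {"dsSha256": "0" * 64, "drsSha256": "1" * 64},
        "dataRoot": "data",
        "datasets": {
            "events": {"layout": "object_ndjson_lines", "count": 3},  # placeholder key
        },
    }
)
print(manifest.specFiles.dsPath)          # "ds.json" (default)
print(manifest.datasets["events"].count)  # 3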
fraclab_sdk/models/dataspec.py ADDED
@@ -0,0 +1,65 @@
+ """DataSpec model."""
+
+ from typing import Any
+
+ from pydantic import BaseModel, ConfigDict, Field, field_validator
+
+
+ class OwnerIds(BaseModel):
+     """Structured owner identifiers."""
+
+     model_config = ConfigDict(extra="ignore")
+
+     platformId: str | None = None
+     wellId: str | None = None
+     stageId: str | None = None
+
+
+ class DataSpecItem(BaseModel):
+     """An item within a dataset."""
+
+     model_config = ConfigDict(extra="ignore")
+
+     owner: OwnerIds | str | None = None
+     resolutionParams: dict[str, Any] | None = None
+     range: dict[str, Any] | None = None
+     sourceItemIndex: int | None = None  # For traceability in run ds
+
+
+ class DataSpecDataset(BaseModel):
+     """A dataset within a DataSpec."""
+
+     model_config = ConfigDict(extra="ignore", populate_by_name=True)
+
+     datasetKey: str = Field(alias="key")
+     resourceType: str | None = None
+     layout: str | None = None  # e.g., "frame_parquet_item_dirs" or "object_ndjson_lines"
+     items: list[DataSpecItem] = Field(default_factory=list)
+
+
+ class DataSpec(BaseModel):
+     """Data specification describing datasets and their items."""
+
+     model_config = ConfigDict(extra="ignore", populate_by_name=True)
+
+     schemaVersion: str | None = None
+     datasets: list[DataSpecDataset] = Field(default_factory=list)
+
+     @field_validator("datasets", mode="before")
+     @classmethod
+     def _coerce_datasets(cls, v):
+         """Accept mapping form {'key': {...}} by converting to list."""
+         if isinstance(v, dict):
+             return [{"key": k, **(val or {})} for k, val in v.items()]
+         return v
+
+     def get_dataset(self, dataset_key: str) -> DataSpecDataset | None:
+         """Get a dataset by key."""
+         for ds in self.datasets:
+             if ds.datasetKey == dataset_key:
+                 return ds
+         return None
+
+     def get_dataset_keys(self) -> list[str]:
+         """Get all dataset keys."""
+         return [ds.datasetKey for ds in self.datasets]
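
A short sketch, with a made-up dataset key, showing that datasets also accepts the mapping form handled by _coerce_datasets, and that "key" is accepted as an alias for datasetKey.

from fraclab_sdk.models import DataSpec

ds = DataSpec.model_validate(
    {
        "schemaVersion": "1.0",
        "datasets": {
            "pressure": {  # hypothetical dataset key
                "layout": "frame_parquet_item_dirs",
                "items": [{"sourceItemIndex": 4}, {"sourceItemIndex": 7}],
            }
        },
    }
)

print(ds.get_dataset_keys())              # ["pressure"]
print(ds.get_dataset("pressure").layout)  # "frame_parquet_item_dirs"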