jerry-thomas 0.3.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datapipeline/analysis/vector/collector.py +120 -17
- datapipeline/analysis/vector/matrix.py +33 -8
- datapipeline/analysis/vector/report.py +162 -32
- datapipeline/build/tasks/__init__.py +11 -0
- datapipeline/build/tasks/config.py +74 -0
- datapipeline/build/tasks/metadata.py +170 -0
- datapipeline/build/tasks/scaler.py +73 -0
- datapipeline/build/tasks/schema.py +60 -0
- datapipeline/build/tasks/utils.py +169 -0
- datapipeline/cli/app.py +304 -127
- datapipeline/cli/commands/build.py +240 -16
- datapipeline/cli/commands/contract.py +367 -0
- datapipeline/cli/commands/domain.py +8 -3
- datapipeline/cli/commands/inspect.py +401 -149
- datapipeline/cli/commands/list_.py +30 -7
- datapipeline/cli/commands/plugin.py +5 -1
- datapipeline/cli/commands/run.py +227 -241
- datapipeline/cli/commands/run_config.py +101 -0
- datapipeline/cli/commands/serve_pipeline.py +156 -0
- datapipeline/cli/commands/source.py +44 -8
- datapipeline/cli/visuals/__init__.py +4 -2
- datapipeline/cli/visuals/common.py +239 -0
- datapipeline/cli/visuals/labels.py +15 -15
- datapipeline/cli/visuals/runner.py +66 -0
- datapipeline/cli/visuals/sections.py +20 -0
- datapipeline/cli/visuals/sources.py +132 -119
- datapipeline/cli/visuals/sources_basic.py +260 -0
- datapipeline/cli/visuals/sources_off.py +76 -0
- datapipeline/cli/visuals/sources_rich.py +414 -0
- datapipeline/config/catalog.py +37 -3
- datapipeline/config/context.py +214 -0
- datapipeline/config/dataset/loader.py +21 -4
- datapipeline/config/dataset/normalize.py +4 -4
- datapipeline/config/metadata.py +43 -0
- datapipeline/config/postprocess.py +2 -2
- datapipeline/config/project.py +3 -2
- datapipeline/config/resolution.py +129 -0
- datapipeline/config/tasks.py +309 -0
- datapipeline/config/workspace.py +155 -0
- datapipeline/domain/__init__.py +12 -0
- datapipeline/domain/record.py +11 -0
- datapipeline/domain/sample.py +54 -0
- datapipeline/integrations/ml/adapter.py +34 -20
- datapipeline/integrations/ml/pandas_support.py +0 -2
- datapipeline/integrations/ml/rows.py +1 -6
- datapipeline/integrations/ml/torch_support.py +1 -3
- datapipeline/io/factory.py +112 -0
- datapipeline/io/output.py +132 -0
- datapipeline/io/protocols.py +21 -0
- datapipeline/io/serializers.py +219 -0
- datapipeline/io/sinks/__init__.py +23 -0
- datapipeline/io/sinks/base.py +2 -0
- datapipeline/io/sinks/files.py +79 -0
- datapipeline/io/sinks/rich.py +57 -0
- datapipeline/io/sinks/stdout.py +18 -0
- datapipeline/io/writers/__init__.py +14 -0
- datapipeline/io/writers/base.py +28 -0
- datapipeline/io/writers/csv_writer.py +25 -0
- datapipeline/io/writers/jsonl.py +52 -0
- datapipeline/io/writers/pickle_writer.py +30 -0
- datapipeline/pipeline/artifacts.py +58 -0
- datapipeline/pipeline/context.py +66 -7
- datapipeline/pipeline/observability.py +65 -0
- datapipeline/pipeline/pipelines.py +65 -13
- datapipeline/pipeline/split.py +11 -10
- datapipeline/pipeline/stages.py +127 -16
- datapipeline/pipeline/utils/keygen.py +20 -7
- datapipeline/pipeline/utils/memory_sort.py +22 -10
- datapipeline/pipeline/utils/transform_utils.py +22 -0
- datapipeline/runtime.py +5 -2
- datapipeline/services/artifacts.py +12 -6
- datapipeline/services/bootstrap/config.py +25 -0
- datapipeline/services/bootstrap/core.py +52 -37
- datapipeline/services/constants.py +6 -5
- datapipeline/services/factories.py +123 -1
- datapipeline/services/project_paths.py +43 -16
- datapipeline/services/runs.py +208 -0
- datapipeline/services/scaffold/domain.py +3 -2
- datapipeline/services/scaffold/filter.py +3 -2
- datapipeline/services/scaffold/mappers.py +9 -6
- datapipeline/services/scaffold/plugin.py +54 -10
- datapipeline/services/scaffold/source.py +93 -56
- datapipeline/sources/{composed_loader.py → data_loader.py} +9 -9
- datapipeline/sources/decoders.py +83 -18
- datapipeline/sources/factory.py +26 -16
- datapipeline/sources/models/__init__.py +2 -2
- datapipeline/sources/models/generator.py +0 -7
- datapipeline/sources/models/loader.py +3 -3
- datapipeline/sources/models/parsing_error.py +24 -0
- datapipeline/sources/models/source.py +6 -6
- datapipeline/sources/synthetic/time/loader.py +14 -2
- datapipeline/sources/transports.py +74 -37
- datapipeline/templates/plugin_skeleton/README.md +76 -30
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +31 -0
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +30 -0
- datapipeline/templates/plugin_skeleton/example/dataset.yaml +18 -0
- datapipeline/templates/plugin_skeleton/example/postprocess.yaml +29 -0
- datapipeline/templates/plugin_skeleton/{config/datasets/default → example}/project.yaml +11 -8
- datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +12 -0
- datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +3 -0
- datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +9 -0
- datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +2 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +4 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +28 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +4 -0
- datapipeline/templates/plugin_skeleton/jerry.yaml +34 -0
- datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +31 -0
- datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +30 -0
- datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +18 -0
- datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +29 -0
- datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +22 -0
- datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +12 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +3 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +9 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +2 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +4 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +28 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +4 -0
- datapipeline/templates/stubs/dto.py.j2 +2 -0
- datapipeline/templates/stubs/mapper.py.j2 +5 -4
- datapipeline/templates/stubs/parser.py.j2 +2 -0
- datapipeline/templates/stubs/record.py.j2 +2 -0
- datapipeline/templates/stubs/source.yaml.j2 +2 -3
- datapipeline/transforms/debug/lint.py +26 -41
- datapipeline/transforms/feature/scaler.py +89 -13
- datapipeline/transforms/record/floor_time.py +4 -4
- datapipeline/transforms/sequence.py +2 -35
- datapipeline/transforms/stream/dedupe.py +24 -0
- datapipeline/transforms/stream/ensure_ticks.py +7 -6
- datapipeline/transforms/vector/__init__.py +5 -0
- datapipeline/transforms/vector/common.py +98 -0
- datapipeline/transforms/vector/drop/__init__.py +4 -0
- datapipeline/transforms/vector/drop/horizontal.py +79 -0
- datapipeline/transforms/vector/drop/orchestrator.py +59 -0
- datapipeline/transforms/vector/drop/vertical.py +182 -0
- datapipeline/transforms/vector/ensure_schema.py +184 -0
- datapipeline/transforms/vector/fill.py +87 -0
- datapipeline/transforms/vector/replace.py +62 -0
- datapipeline/utils/load.py +24 -3
- datapipeline/utils/rich_compat.py +38 -0
- datapipeline/utils/window.py +76 -0
- jerry_thomas-1.0.1.dist-info/METADATA +825 -0
- jerry_thomas-1.0.1.dist-info/RECORD +199 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/entry_points.txt +9 -8
- datapipeline/build/tasks.py +0 -186
- datapipeline/cli/commands/link.py +0 -128
- datapipeline/cli/commands/writers.py +0 -138
- datapipeline/config/build.py +0 -64
- datapipeline/config/run.py +0 -116
- datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.synthetic.yaml +0 -24
- datapipeline/templates/plugin_skeleton/config/contracts/time_linear.synthetic.yaml +0 -23
- datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +0 -9
- datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +0 -14
- datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +0 -13
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml +0 -11
- datapipeline/transforms/vector.py +0 -210
- jerry_thomas-0.3.0.dist-info/METADATA +0 -502
- jerry_thomas-0.3.0.dist-info/RECORD +0 -139
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/WHEEL +0 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/licenses/LICENSE +0 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/top_level.txt +0 -0
datapipeline/runtime.py
CHANGED
@@ -1,8 +1,9 @@
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any, List, Mapping, Optional, Sequence, Union
+from datetime import datetime

-from datapipeline.config.
+from datapipeline.config.tasks import ServeTask
 from datapipeline.config.split import SplitConfig

 from datapipeline.registries.registry import Registry
@@ -66,7 +67,9 @@ class Runtime:
     registries: Registries = field(default_factory=Registries)
     split: Optional[SplitConfig] = None
     split_keep: Optional[str] = None
-    run: Optional[
+    run: Optional[ServeTask] = None
+    schema_required: bool = True
+    window_bounds: tuple[datetime | None, datetime | None] | None = None
     artifacts: ArtifactManager = field(init=False)

     def __post_init__(self) -> None:
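
Note on the new Runtime fields: window_bounds is typed as an optional (start, end) pair in which either bound may itself be None. A minimal sketch of how a consumer could test a timestamp against such a pair follows; the helper name and the inclusive-bound semantics are assumptions for illustration, not something this diff specifies.

from datetime import datetime

Bounds = tuple[datetime | None, datetime | None] | None

def within_window(ts: datetime, bounds: Bounds) -> bool:
    # Hypothetical helper: treat a missing tuple or a missing bound as open.
    if bounds is None:
        return True
    start, end = bounds
    if start is not None and ts < start:
        return False
    if end is not None and ts > end:
        return False
    return True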
datapipeline/services/artifacts.py
CHANGED
@@ -1,10 +1,11 @@
 from __future__ import annotations

 from dataclasses import dataclass
+import json
 from pathlib import Path
 from typing import Any, Callable, Dict, Generic, Mapping, Optional, TypeVar

-from datapipeline.services.constants import
+from datapipeline.services.constants import VECTOR_SCHEMA, VECTOR_SCHEMA_METADATA

 ArtifactValue = TypeVar("ArtifactValue")

@@ -85,12 +86,17 @@ class ArtifactManager:
             raise RuntimeError(message) from exc


-def
+def _read_schema(path: Path) -> dict:
     with path.open("r", encoding="utf-8") as fh:
-        return
+        return json.load(fh)


-
-    key=
-    loader=
+VECTOR_SCHEMA_SPEC = ArtifactSpec[dict](
+    key=VECTOR_SCHEMA,
+    loader=_read_schema,
+)
+
+VECTOR_METADATA_SPEC = ArtifactSpec[dict](
+    key=VECTOR_SCHEMA_METADATA,
+    loader=_read_schema,
 )
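
Both specs reuse _read_schema, which simply parses a JSON file into a dict and registers it under the keys added in services/constants.py. A small sketch of the same pattern with a stand-in ArtifactSpec follows; the stand-in class body and the "my_feature_stats" key are illustrative assumptions, and only the key=/loader= call shape is taken from the diff.

import json
from pathlib import Path

class ArtifactSpec:
    # Stand-in for datapipeline.services.artifacts.ArtifactSpec; the real class
    # is generic over the loaded value type but is constructed the same way.
    def __init__(self, *, key: str, loader):
        self.key = key
        self.loader = loader

def _read_json(path: Path) -> dict:
    with path.open("r", encoding="utf-8") as fh:
        return json.load(fh)

# Hypothetical extra artifact following the VECTOR_SCHEMA_SPEC pattern.
FEATURE_STATS_SPEC = ArtifactSpec(key="my_feature_stats", loader=_read_json)

if __name__ == "__main__":
    p = Path("feature_stats.json")
    p.write_text('{"mean": 0.0, "std": 1.0}', encoding="utf-8")
    print(FEATURE_STATS_SPEC.loader(p))  # {'mean': 0.0, 'std': 1.0}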
datapipeline/services/bootstrap/config.py
CHANGED
@@ -41,6 +41,12 @@ def _project_vars(data: dict) -> dict[str, Any]:
     if name:
         vars_["project"] = str(name)
         vars_["project_name"] = str(name)
+
+    version = data.get("version")
+    if version is not None:
+        vars_["version"] = str(version)
+        vars_["project_version"] = str(version)
+
     globals_ = data.get("globals") or {}
     for k, v in globals_.items():
         vars_[str(k)] = _serialize_global_value(v)
@@ -64,6 +70,24 @@ def artifacts_root(project_yaml: Path) -> Path:
     return (pj.parent / ap).resolve() if not ap.is_absolute() else ap


+def run_root(project_yaml: Path, run_id: str | None = None) -> Path:
+    """Return a per-run artifacts directory under the project artifacts root.
+
+    Example:
+        artifacts_root: /.../artifacts/my_dataset/v3
+        run_root: /.../artifacts/my_dataset/v3/runs/2025-11-29T14-15-23Z
+    """
+    base = artifacts_root(project_yaml)
+
+    if run_id is None:
+        ts = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H-%M-%SZ")
+        run_id = ts
+
+    root = (base / "runs" / run_id).resolve()
+    root.mkdir(parents=True, exist_ok=True)
+    return root
+
+
 def _load_by_key(
     project_yaml: Path,
     key: str,
@@ -131,6 +155,7 @@ def _interpolate(obj, vars_: dict[str, Any]):

 __all__ = [
     "artifacts_root",
+    "run_root",
     "_globals",
     "_interpolate",
     "_load_by_key",
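
Usage sketch for run_root: with no run_id it mints a filesystem-safe UTC timestamp and creates the directory, so repeated calls yield distinct run folders, while an explicit run_id is reused. make_run_dir below mirrors that logic against a plain base directory (run_root itself resolves the base from project.yaml via artifacts_root); the paths are hypothetical.

from datetime import datetime, timezone
from pathlib import Path

def make_run_dir(base: Path, run_id: str | None = None) -> Path:
    # Same behavior as run_root(), minus the project.yaml lookup.
    if run_id is None:
        run_id = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H-%M-%SZ")
    root = (base / "runs" / run_id).resolve()
    root.mkdir(parents=True, exist_ok=True)
    return root

print(make_run_dir(Path("artifacts/my_dataset/v3")))
# e.g. .../artifacts/my_dataset/v3/runs/2025-11-29T14-15-23Z
print(make_run_dir(Path("artifacts/my_dataset/v3"), run_id="baseline"))
# .../artifacts/my_dataset/v3/runs/baseline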
datapipeline/services/bootstrap/core.py
CHANGED
@@ -1,15 +1,14 @@
 from pathlib import Path
-from typing import Any
+from typing import Any

 from datapipeline.utils.load import load_yaml
 from datapipeline.config.catalog import StreamsConfig
-from datapipeline.config.
+from datapipeline.config.tasks import default_serve_task
 from datapipeline.services.project_paths import streams_dir, sources_dir
 from datapipeline.build.state import load_build_state
 from datapipeline.services.constants import (
     PARSER_KEY,
     LOADER_KEY,
-    SOURCE_KEY,
     SOURCE_ID_KEY,
     MAPPER_KEY,
     ENTRYPOINT_KEY,
@@ -19,6 +18,7 @@ from datapipeline.services.constants import (
 from datapipeline.services.factories import (
     build_source_from_spec,
     build_mapper_from_spec,
+    build_composed_source,
 )

 from datapipeline.runtime import Runtime
@@ -28,9 +28,7 @@ from .config import (
     _globals,
     _interpolate,
     _load_by_key,
-    _paths,
     _project,
-    _project_vars,
 )


@@ -41,26 +39,28 @@ SRC_LOADER_KEY = LOADER_KEY
 def _load_sources_from_dir(project_yaml: Path, vars_: dict[str, Any]) -> dict:
     """Aggregate per-source YAML files into a raw-sources mapping.

-
-
-
+    Scans for YAML files under the sources directory (recursing through
+    subfolders). Expects each file to define a single source with top-level
+    'parser' and 'loader' keys. The top-level 'id' inside the file becomes the
+    runtime alias.
     """
-    import os
     src_dir = sources_dir(project_yaml)
     if not src_dir.exists() or not src_dir.is_dir():
         return {}
     out: dict[str, dict] = {}
-
-
-
-
+    candidates = sorted(
+        (p for p in src_dir.rglob("*.y*ml") if p.is_file()),
+        key=lambda p: p.relative_to(src_dir).as_posix(),
+    )
+    for path in candidates:
+        data = load_yaml(path)
         if not isinstance(data, dict):
             continue
         if isinstance(data.get(SRC_PARSER_KEY), dict) and isinstance(data.get(SRC_LOADER_KEY), dict):
             alias = data.get(SOURCE_ID_KEY)
             if not alias:
                 raise ValueError(
-                    f"Missing '
+                    f"Missing 'id' in source file: {path.relative_to(src_dir)}")
             out[alias] = _interpolate(data, vars_)
             continue
     return out
@@ -81,13 +81,32 @@ def _load_canonical_streams(project_yaml: Path, vars_: dict[str, Any]) -> dict:
         if not p.is_file():
             continue
         data = load_yaml(p)
-        #
-        if isinstance(data, dict)
-
-
-
-
-
+        # Contracts must declare kind: 'ingest' | 'composed'
+        if not isinstance(data, dict):
+            continue
+        kind = data.get("kind")
+        if kind not in {"ingest", "composed"}:
+            continue
+        if (STREAM_ID_KEY not in data):
+            continue
+        if kind == "ingest" and ("source" not in data):
+            continue
+        if kind == "composed" and ("inputs" not in data):
+            continue
+        m = data.get(MAPPER_KEY)
+        if (not isinstance(m, dict)) or (ENTRYPOINT_KEY not in (m or {})):
+            data[MAPPER_KEY] = None
+        # Support simple per-contract variables like 'cadence' while keeping
+        # project-level globals as the single source of truth for shared values.
+        local_vars = dict(vars_)
+        cadence_expr = data.get("cadence")
+        if cadence_expr is not None:
+            # Allow cadence to reference globals (e.g. ${group_by}) while also
+            # making ${cadence} usable elsewhere in the same contract.
+            resolved_cadence = _interpolate(cadence_expr, vars_)
+            local_vars["cadence"] = resolved_cadence
+        alias = data.get(STREAM_ID_KEY)
+        out[alias] = _interpolate(data, local_vars)
     return out


@@ -101,16 +120,7 @@ def load_streams(project_yaml: Path) -> StreamsConfig:
 def init_streams(cfg: StreamsConfig, runtime: Runtime) -> None:
     """Compile typed streams config into runtime registries."""
     regs = runtime.registries
-    regs.
-    regs.debug_operations.clear()
-    regs.partition_by.clear()
-    regs.sort_batch_size.clear()
-    regs.record_operations.clear()
-    regs.feature_transforms.clear()
-    regs.postprocesses.clear()
-    regs.sources.clear()
-    regs.mappers.clear()
-    regs.stream_sources.clear()
+    regs.clear_all()

     # Register per-stream policies and record transforms for runtime lookups
     for alias, spec in (cfg.contracts or {}).items():
@@ -124,9 +134,16 @@ def init_streams(cfg: StreamsConfig, runtime: Runtime) -> None:
     for alias, spec in (cfg.raw or {}).items():
         regs.sources.register(alias, build_source_from_spec(spec))
     for alias, spec in (cfg.contracts or {}).items():
-
-
-
+        if getattr(spec, "kind", None) == "composed":
+            # Composed stream: register virtual source and identity mapper
+            regs.stream_sources.register(
+                alias, build_composed_source(alias, spec, runtime)
+            )
+            regs.mappers.register(alias, build_mapper_from_spec(None))
+        else:
+            mapper = build_mapper_from_spec(spec.mapper)
+            regs.mappers.register(alias, mapper)
+            regs.stream_sources.register(alias, regs.sources.get(spec.source))


 def bootstrap(project_yaml: Path) -> Runtime:
@@ -146,9 +163,7 @@ def bootstrap(project_yaml: Path) -> Runtime:
         runtime.split = None

     try:
-        runtime.run =
-    except FileNotFoundError:
-        runtime.run = None
+        runtime.run = default_serve_task(project_yaml)
     except Exception:
         runtime.run = None

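
The loaders above only inspect a handful of keys. Sketched as the dicts load_yaml would return, a raw source needs a top-level 'id' plus mapping-valued 'parser' and 'loader'; a contract needs kind: 'ingest' (with 'source') or kind: 'composed' (with 'inputs'), and a malformed 'mapper' is silently reset to None. The ids below are borrowed from the template file names in the listing above; entrypoint values are left as placeholders.

raw_source = {                       # accepted by _load_sources_from_dir
    "id": "synthetic.ticks",         # becomes the runtime alias
    "parser": {"entrypoint": "..."}, # must be a mapping
    "loader": {"entrypoint": "..."}, # must be a mapping
}

ingest_contract = {                  # accepted by _load_canonical_streams
    "kind": "ingest",
    "id": "time.ticks.linear",
    "source": "synthetic.ticks",     # required for kind: ingest
    "mapper": {"entrypoint": "..."}, # optional; reset to None if malformed
    "cadence": "${group_by}",        # optional per-contract variable
}

composed_contract = {
    "kind": "composed",
    "id": "time.ticks.hour_sin",
    "inputs": ["ticks=time.ticks.linear"],  # required for kind: composed
    "mapper": {"entrypoint": "..."},        # composer; required at runtime
}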
datapipeline/services/constants.py
CHANGED
@@ -1,19 +1,20 @@
 PARSER_KEY = "parser"
 LOADER_KEY = "loader"
 SOURCE_KEY = "source"
-SOURCE_ID_KEY = "
+SOURCE_ID_KEY = "id"
 MAPPER_KEY = "mapper"
 ENTRYPOINT_KEY = "entrypoint"
 ARGS_KEY = "args"
-STREAM_ID_KEY = "
+STREAM_ID_KEY = "id"

 PARSERS_GROUP = "parsers"
 LOADERS_GROUP = "loaders"
 MAPPERS_GROUP = "mappers"
 FILTERS_GROUP = "filters"
-
+DEFAULT_IO_LOADER_EP = "core.io"

-#POSTPROCESS_GLOBAL_KEY = "__global__"
+# POSTPROCESS_GLOBAL_KEY = "__global__"
 POSTPROCESS_TRANSFORMS = "transforms"
-PARTIONED_IDS = "partitioned_ids"
 SCALER_STATISTICS = "scaler_statistics"
+VECTOR_SCHEMA = "vector_schema"
+VECTOR_SCHEMA_METADATA = "vector_schema_metadata"
datapipeline/services/factories.py
CHANGED
@@ -1,9 +1,16 @@
 from datapipeline.utils.load import load_ep
 from datapipeline.plugins import PARSERS_EP, LOADERS_EP, MAPPERS_EP
 from datapipeline.sources.models.source import Source
-from datapipeline.config.catalog import SourceConfig, EPArgs
+from datapipeline.config.catalog import SourceConfig, EPArgs, ContractConfig
 from datapipeline.mappers.noop import identity
 from datapipeline.utils.placeholders import normalize_args
+from datapipeline.sources.models.base import SourceInterface
+from datapipeline.pipeline.context import PipelineContext
+from datapipeline.config.dataset.feature import FeatureRecordConfig
+from datapipeline.pipeline.pipelines import build_feature_pipeline
+from datapipeline.pipeline.utils.transform_utils import _supports_parameter
+from inspect import isclass
+from typing import Iterator, Any, Optional


 def build_source_from_spec(spec: SourceConfig) -> Source:
@@ -23,3 +30,118 @@ def build_mapper_from_spec(spec: EPArgs | None):
     if args:
         return lambda raw: fn(raw, **args)
     return fn
+
+
+class _ComposedSource(SourceInterface):
+    def __init__(self, *, runtime, stream_id: str, spec: ContractConfig):
+        self._runtime = runtime
+        self._stream_id = stream_id
+        self._spec = spec
+
+    def stream(self):
+        context = PipelineContext(self._runtime)
+        raw_inputs = self._spec.inputs
+        input_specs = list(raw_inputs or [])
+        if not input_specs:
+            return iter(())
+
+        # Resolve inputs: "[alias=]stream_id" (streams only)
+        resolved = self._resolve_inputs(context, input_specs)
+        aligned = {k: v for k, v in resolved.items() if v["aligned"]}
+        aux = {k: v for k, v in resolved.items() if not v["aligned"]}
+
+        # Build aligned/aux iterators (unwrap FeatureRecord -> record for aligned)
+        aligned_iters: dict[str, Iterator[Any]] = {
+            k: (fr.record for fr in v["iter"])  # stage>=3 yields FeatureRecord
+            for k, v in aligned.items()
+        }
+        aux_iters: dict[str, Iterator[Any]] = {
+            k: v["iter"] for k, v in aux.items()}
+
+        # Load mapper (composer) from contract
+        mapper = self._spec.mapper
+        if not mapper or not mapper.entrypoint:
+            raise ValueError(
+                f"Composed stream '{self._stream_id}' requires mapper.entrypoint composer"
+            )
+        ep = load_ep(MAPPERS_EP, mapper.entrypoint)
+        kwargs = normalize_args(mapper.args)
+
+        # Choose driver among aligned inputs
+        aligned_keys = list(aligned_iters.keys())
+        if not aligned_keys:
+            driver_key = None
+        else:
+            driver_key = kwargs.pop("driver", None) or aligned_keys[0]
+
+        # Mapper adapters: Simple vs Advanced
+        if not isclass(ep) and not _supports_parameter(ep, "inputs"):
+            # Simple: expect a single iterator when exactly one aligned input and no aux
+            if len(aligned_iters) == 1 and not aux_iters:
+                single_iter = next(iter(aligned_iters.values()))
+                for rec in ep(single_iter):
+                    yield getattr(rec, "record", rec)
+                return
+            raise TypeError(
+                "Mapper must accept inputs=... for multi-input or aux-enabled contracts"
+            )
+
+        # Advanced: pass inputs / aux / driver / context when supported
+        call_kwargs = dict(kwargs)
+        if _supports_parameter(ep, "context") and "context" not in call_kwargs:
+            call_kwargs["context"] = context
+        if _supports_parameter(ep, "aux"):
+            call_kwargs["aux"] = aux_iters
+        if driver_key and _supports_parameter(ep, "driver"):
+            call_kwargs["driver"] = driver_key
+
+        if isclass(ep):
+            inst = ep(**call_kwargs) if call_kwargs else ep()
+            binder = getattr(inst, "bind_context", None)
+            if callable(binder):
+                binder(context)
+            for rec in inst(inputs=aligned_iters):
+                yield getattr(rec, "record", rec)
+            return
+
+        for rec in ep(inputs=aligned_iters, **call_kwargs):
+            yield getattr(rec, "record", rec)
+
+    def _resolve_inputs(self, context: PipelineContext, specs: list[str]):
+        """Parse and resolve composed inputs into iterators.
+
+        Grammar: "[alias=]stream_id" only. All inputs are built to stage 4
+        and are alignable (FeatureRecord -> domain record unwrapped).
+        """
+        runtime = context.runtime
+        known_streams = set(runtime.registries.stream_sources.keys())
+
+        out: dict[str, dict] = {}
+        for spec in specs:
+            alias, ref = self._parse_input(spec)
+            if ref not in known_streams:
+                raise ValueError(
+                    f"Unknown input stream '{ref}'. Known streams: {sorted(known_streams)}"
+                )
+            cfg = FeatureRecordConfig(record_stream=ref, id=alias)
+            it = build_feature_pipeline(context, cfg, stage=4)
+            out[alias] = {"iter": it, "aligned": True}
+
+        return out
+
+    @staticmethod
+    def _parse_input(text: str) -> tuple[str, str]:
+        # alias=stream_id
+        if "@" in text:
+            raise ValueError(
+                "composed inputs may not include '@stage'; streams align by default")
+        alias: Optional[str] = None
+        if "=" in text:
+            alias, text = text.split("=", 1)
+        ref = text
+        alias = alias or ref
+        return alias, ref
+
+
+def build_composed_source(stream_id: str, spec: ContractConfig, runtime) -> SourceInterface:
+    return _ComposedSource(runtime=runtime, stream_id=stream_id, spec=spec)
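
_ComposedSource dispatches on the composer's signature: a plain callable that does not accept inputs= is handed a single aligned iterator, while anything that does accept inputs= may also receive aux, driver, and context keyword arguments when _supports_parameter reports them. A sketch of what an "advanced" composer entrypoint could look like under those rules; the name merge_streams and its record handling are illustrative, not part of the package.

from typing import Any, Iterator

def merge_streams(
    *,
    inputs: dict[str, Iterator[Any]],
    aux: dict[str, Iterator[Any]] | None = None,
    driver: str | None = None,
    context: Any | None = None,
) -> Iterator[Any]:
    # _ComposedSource passes aligned domain records (FeatureRecord already
    # unwrapped) and re-unwraps `.record` on whatever is yielded back.
    key = driver or next(iter(inputs))
    for record in inputs[key]:
        yield record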
datapipeline/services/project_paths.py
CHANGED
@@ -1,6 +1,7 @@
 from __future__ import annotations

 from pathlib import Path
+from typing import Optional

 from datapipeline.utils.load import load_yaml
 from datapipeline.config.project import ProjectConfig
@@ -35,21 +36,19 @@ def sources_dir(project_yaml: Path) -> Path:
     return p


-def
-    """Return the resolved path to
+def tasks_dir(project_yaml: Path) -> Path:
+    """Return the resolved path to the tasks directory (project.paths.tasks)."""

     cfg = read_project(project_yaml)
-
-    if not
-        raise FileNotFoundError(
-
-        )
-    p = Path(build_path)
+    tasks_path = getattr(cfg.paths, "tasks", None)
+    if not tasks_path:
+        raise FileNotFoundError("project.paths.tasks must point to a tasks directory.")
+    p = Path(tasks_path)
     if not p.is_absolute():
         p = _project_root(project_yaml) / p
-    if not p.exists():
-        raise FileNotFoundError(f"
-    return p
+    if not p.exists() or not p.is_dir():
+        raise FileNotFoundError(f"tasks directory not found: {p}")
+    return p.resolve()


 def ensure_project_scaffold(project_yaml: Path) -> None:
@@ -64,14 +63,14 @@ def ensure_project_scaffold(project_yaml: Path) -> None:
     project_yaml.parent.mkdir(parents=True, exist_ok=True)
     default = (
         "version: 1\n"
+        "name: default\n"
         "paths:\n"
-        " streams:
-        " sources:
+        " streams: ./contracts\n"
+        " sources: ./sources\n"
         " dataset: dataset.yaml\n"
         " postprocess: postprocess.yaml\n"
-        " artifacts:
-        "
-        " run: run.yaml\n"
+        " artifacts: ../artifacts/default\n"
+        " tasks: ./tasks\n"
        "globals:\n"
        " start_time: 2021-01-01T00:00:00Z\n"
        " end_time: 2021-12-31T23:00:00Z\n"
@@ -90,7 +89,35 @@ def ensure_project_scaffold(project_yaml: Path) -> None:
         if not sources.is_absolute():
             sources = _project_root(project_yaml) / sources
         sources.mkdir(parents=True, exist_ok=True)
+
+        tasks = getattr(cfg.paths, "tasks", None)
+        if tasks:
+            tasks_path = Path(tasks)
+            if not tasks_path.is_absolute():
+                tasks_path = _project_root(project_yaml) / tasks_path
+            tasks_path.mkdir(parents=True, exist_ok=True)
     except Exception:
         # If the file is malformed, leave it to callers to report; this helper
         # is best-effort to create a sensible starting point.
         pass
+
+
+def resolve_project_yaml_path(plugin_root: Path) -> Path:
+    """Return a best-effort project.yaml path for scaffolding.
+
+    Resolution order:
+    1) <plugin_root>/example/project.yaml
+    2) <plugin_root>/config/project.yaml
+    3) <plugin_root>/config/datasets/default/project.yaml
+    4) Fallback: <plugin_root>/example/project.yaml
+    """
+    candidates = [
+        plugin_root / "example" / "project.yaml",
+        plugin_root / "config" / "project.yaml",
+        plugin_root / "config" / "datasets" / "default" / "project.yaml",
+    ]
+    for candidate in candidates:
+        if candidate.exists():
+            return candidate
+    # Default to the first candidate; callers may scaffold a new project there.
+    return candidates[0]
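
Usage sketch for the two helpers above: tasks_dir resolves project.paths.tasks relative to the project root and raises FileNotFoundError when the key is missing or the directory does not exist, while resolve_project_yaml_path prefers example/project.yaml over the legacy config/ locations. The paths below are hypothetical.

from pathlib import Path

from datapipeline.services.project_paths import (
    resolve_project_yaml_path,
    tasks_dir,
)

plugin_root = Path(".")  # hypothetical plugin checkout
project_yaml = resolve_project_yaml_path(plugin_root)

try:
    print("tasks directory:", tasks_dir(project_yaml))
except FileNotFoundError as exc:
    print("no usable tasks directory:", exc)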