jerry-thomas 0.3.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datapipeline/analysis/vector/collector.py +120 -17
- datapipeline/analysis/vector/matrix.py +33 -8
- datapipeline/analysis/vector/report.py +162 -32
- datapipeline/build/tasks/__init__.py +11 -0
- datapipeline/build/tasks/config.py +74 -0
- datapipeline/build/tasks/metadata.py +170 -0
- datapipeline/build/tasks/scaler.py +73 -0
- datapipeline/build/tasks/schema.py +60 -0
- datapipeline/build/tasks/utils.py +169 -0
- datapipeline/cli/app.py +304 -127
- datapipeline/cli/commands/build.py +240 -16
- datapipeline/cli/commands/contract.py +367 -0
- datapipeline/cli/commands/domain.py +8 -3
- datapipeline/cli/commands/inspect.py +401 -149
- datapipeline/cli/commands/list_.py +30 -7
- datapipeline/cli/commands/plugin.py +5 -1
- datapipeline/cli/commands/run.py +227 -241
- datapipeline/cli/commands/run_config.py +101 -0
- datapipeline/cli/commands/serve_pipeline.py +156 -0
- datapipeline/cli/commands/source.py +44 -8
- datapipeline/cli/visuals/__init__.py +4 -2
- datapipeline/cli/visuals/common.py +239 -0
- datapipeline/cli/visuals/labels.py +15 -15
- datapipeline/cli/visuals/runner.py +66 -0
- datapipeline/cli/visuals/sections.py +20 -0
- datapipeline/cli/visuals/sources.py +132 -119
- datapipeline/cli/visuals/sources_basic.py +260 -0
- datapipeline/cli/visuals/sources_off.py +76 -0
- datapipeline/cli/visuals/sources_rich.py +414 -0
- datapipeline/config/catalog.py +37 -3
- datapipeline/config/context.py +214 -0
- datapipeline/config/dataset/loader.py +21 -4
- datapipeline/config/dataset/normalize.py +4 -4
- datapipeline/config/metadata.py +43 -0
- datapipeline/config/postprocess.py +2 -2
- datapipeline/config/project.py +3 -2
- datapipeline/config/resolution.py +129 -0
- datapipeline/config/tasks.py +309 -0
- datapipeline/config/workspace.py +155 -0
- datapipeline/domain/__init__.py +12 -0
- datapipeline/domain/record.py +11 -0
- datapipeline/domain/sample.py +54 -0
- datapipeline/integrations/ml/adapter.py +34 -20
- datapipeline/integrations/ml/pandas_support.py +0 -2
- datapipeline/integrations/ml/rows.py +1 -6
- datapipeline/integrations/ml/torch_support.py +1 -3
- datapipeline/io/factory.py +112 -0
- datapipeline/io/output.py +132 -0
- datapipeline/io/protocols.py +21 -0
- datapipeline/io/serializers.py +219 -0
- datapipeline/io/sinks/__init__.py +23 -0
- datapipeline/io/sinks/base.py +2 -0
- datapipeline/io/sinks/files.py +79 -0
- datapipeline/io/sinks/rich.py +57 -0
- datapipeline/io/sinks/stdout.py +18 -0
- datapipeline/io/writers/__init__.py +14 -0
- datapipeline/io/writers/base.py +28 -0
- datapipeline/io/writers/csv_writer.py +25 -0
- datapipeline/io/writers/jsonl.py +52 -0
- datapipeline/io/writers/pickle_writer.py +30 -0
- datapipeline/pipeline/artifacts.py +58 -0
- datapipeline/pipeline/context.py +66 -7
- datapipeline/pipeline/observability.py +65 -0
- datapipeline/pipeline/pipelines.py +65 -13
- datapipeline/pipeline/split.py +11 -10
- datapipeline/pipeline/stages.py +127 -16
- datapipeline/pipeline/utils/keygen.py +20 -7
- datapipeline/pipeline/utils/memory_sort.py +22 -10
- datapipeline/pipeline/utils/transform_utils.py +22 -0
- datapipeline/runtime.py +5 -2
- datapipeline/services/artifacts.py +12 -6
- datapipeline/services/bootstrap/config.py +25 -0
- datapipeline/services/bootstrap/core.py +52 -37
- datapipeline/services/constants.py +6 -5
- datapipeline/services/factories.py +123 -1
- datapipeline/services/project_paths.py +43 -16
- datapipeline/services/runs.py +208 -0
- datapipeline/services/scaffold/domain.py +3 -2
- datapipeline/services/scaffold/filter.py +3 -2
- datapipeline/services/scaffold/mappers.py +9 -6
- datapipeline/services/scaffold/plugin.py +54 -10
- datapipeline/services/scaffold/source.py +93 -56
- datapipeline/sources/{composed_loader.py → data_loader.py} +9 -9
- datapipeline/sources/decoders.py +83 -18
- datapipeline/sources/factory.py +26 -16
- datapipeline/sources/models/__init__.py +2 -2
- datapipeline/sources/models/generator.py +0 -7
- datapipeline/sources/models/loader.py +3 -3
- datapipeline/sources/models/parsing_error.py +24 -0
- datapipeline/sources/models/source.py +6 -6
- datapipeline/sources/synthetic/time/loader.py +14 -2
- datapipeline/sources/transports.py +74 -37
- datapipeline/templates/plugin_skeleton/README.md +76 -30
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +31 -0
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +30 -0
- datapipeline/templates/plugin_skeleton/example/dataset.yaml +18 -0
- datapipeline/templates/plugin_skeleton/example/postprocess.yaml +29 -0
- datapipeline/templates/plugin_skeleton/{config/datasets/default → example}/project.yaml +11 -8
- datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +12 -0
- datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +3 -0
- datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +9 -0
- datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +2 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +4 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +28 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +4 -0
- datapipeline/templates/plugin_skeleton/jerry.yaml +34 -0
- datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +31 -0
- datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +30 -0
- datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +18 -0
- datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +29 -0
- datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +22 -0
- datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +12 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +3 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +9 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +2 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +4 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +28 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +4 -0
- datapipeline/templates/stubs/dto.py.j2 +2 -0
- datapipeline/templates/stubs/mapper.py.j2 +5 -4
- datapipeline/templates/stubs/parser.py.j2 +2 -0
- datapipeline/templates/stubs/record.py.j2 +2 -0
- datapipeline/templates/stubs/source.yaml.j2 +2 -3
- datapipeline/transforms/debug/lint.py +26 -41
- datapipeline/transforms/feature/scaler.py +89 -13
- datapipeline/transforms/record/floor_time.py +4 -4
- datapipeline/transforms/sequence.py +2 -35
- datapipeline/transforms/stream/dedupe.py +24 -0
- datapipeline/transforms/stream/ensure_ticks.py +7 -6
- datapipeline/transforms/vector/__init__.py +5 -0
- datapipeline/transforms/vector/common.py +98 -0
- datapipeline/transforms/vector/drop/__init__.py +4 -0
- datapipeline/transforms/vector/drop/horizontal.py +79 -0
- datapipeline/transforms/vector/drop/orchestrator.py +59 -0
- datapipeline/transforms/vector/drop/vertical.py +182 -0
- datapipeline/transforms/vector/ensure_schema.py +184 -0
- datapipeline/transforms/vector/fill.py +87 -0
- datapipeline/transforms/vector/replace.py +62 -0
- datapipeline/utils/load.py +24 -3
- datapipeline/utils/rich_compat.py +38 -0
- datapipeline/utils/window.py +76 -0
- jerry_thomas-1.0.1.dist-info/METADATA +825 -0
- jerry_thomas-1.0.1.dist-info/RECORD +199 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/entry_points.txt +9 -8
- datapipeline/build/tasks.py +0 -186
- datapipeline/cli/commands/link.py +0 -128
- datapipeline/cli/commands/writers.py +0 -138
- datapipeline/config/build.py +0 -64
- datapipeline/config/run.py +0 -116
- datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.synthetic.yaml +0 -24
- datapipeline/templates/plugin_skeleton/config/contracts/time_linear.synthetic.yaml +0 -23
- datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +0 -9
- datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +0 -14
- datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +0 -13
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml +0 -11
- datapipeline/transforms/vector.py +0 -210
- jerry_thomas-0.3.0.dist-info/METADATA +0 -502
- jerry_thomas-0.3.0.dist-info/RECORD +0 -139
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/WHEEL +0 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/licenses/LICENSE +0 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/top_level.txt +0 -0
datapipeline/pipeline/context.py
CHANGED
@@ -1,18 +1,24 @@
 from __future__ import annotations
 
+import logging
 from contextlib import contextmanager
 from contextvars import ContextVar
 from dataclasses import dataclass, field
-from typing import Iterator, Mapping, Any
+from typing import Iterator, Mapping, Any, Callable, Optional
+from datetime import datetime
 
 from datapipeline.runtime import Runtime
+from datapipeline.pipeline.observability import ObserverRegistry
 from datapipeline.services.artifacts import (
+    ArtifactNotRegisteredError,
     ArtifactManager,
     ArtifactSpec,
     ArtifactValue,
-
+    VECTOR_SCHEMA_SPEC,
 )
+from datapipeline.utils.window import resolve_window_bounds
 
+logger = logging.getLogger(__name__)
 
 _current_context: ContextVar[PipelineContext | None] = ContextVar(
     "datapipeline_pipeline_context", default=None
@@ -24,6 +30,8 @@ class PipelineContext:
     """Lightweight runtime context shared across pipeline stages."""
 
     runtime: Runtime
+    transform_observer: Callable[..., None] | None = None
+    observer_registry: Optional[ObserverRegistry] = None
     _cache: dict[str, Any] = field(default_factory=dict)
 
     @property
@@ -42,13 +50,64 @@ class PipelineContext:
     def require_artifact(self, spec: ArtifactSpec[ArtifactValue]) -> ArtifactValue:
         return self.artifacts.load(spec)
 
-    def load_expected_ids(self) -> list[str]:
-
-
-
-
+    def load_expected_ids(self, *, payload: str = "features") -> list[str]:
+        key = f"expected_ids:{payload}"
+        cached = self._cache.get(key)
+        if cached is not None:
+            return list(cached)
+        entries = self.load_schema(payload=payload)
+        if not entries:
+            if payload == "targets":
+                logger.debug("Target schema entries missing; proceeding without target baseline.")
+                self._cache[key] = []
+                return []
+            raise RuntimeError("Vector schema artifact missing; run `jerry build` to materialize schema.json.")
+        ids = [entry["id"] for entry in entries if isinstance(entry.get("id"), str)]
+        self._cache[key] = ids
         return list(ids)
 
+    def load_schema(self, *, payload: str = "features") -> list[dict[str, Any]]:
+        key = f"schema:{payload}"
+        cached = self._cache.get(key)
+        if cached is None:
+            try:
+                doc = self.artifacts.load(VECTOR_SCHEMA_SPEC)
+            except ArtifactNotRegisteredError:
+                cached = []
+            else:
+                section = doc.get("targets" if payload == "targets" else "features")
+                if isinstance(section, list):
+                    cached = [entry for entry in section if isinstance(entry, dict)]
+                else:
+                    cached = []
+            self._cache[key] = cached
+        return [dict(entry) for entry in cached] if cached else []
+
+    @property
+    def schema_required(self) -> bool:
+        return bool(getattr(self.runtime, "schema_required", True))
+
+    def window_bounds(self, *, rectangular_required: bool = False) -> tuple[datetime | None, datetime | None]:
+        key = "window_bounds:required" if rectangular_required else "window_bounds:optional"
+        cached = self._cache.get(key)
+        if cached is not None:
+            return cached
+        bounds = resolve_window_bounds(self.runtime, rectangular_required)
+        if rectangular_required:
+            self.runtime.window_bounds = bounds
+        self._cache[key] = bounds
+        return bounds
+
+    @property
+    def start_time(self) -> datetime | None:
+        start, _ = self.window_bounds()
+        return start
+
+    @property
+    def end_time(self) -> datetime | None:
+        _, end = self.window_bounds()
+        return end
+
     @contextmanager
     def activate(self) -> Iterator[PipelineContext]:
         token = _current_context.set(self)
datapipeline/pipeline/observability.py
ADDED
@@ -0,0 +1,65 @@
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from typing import Callable, Mapping, Optional, Protocol, runtime_checkable
+
+
+@dataclass(frozen=True)
+class TransformEvent:
+    type: str
+    payload: Mapping[str, object]
+
+
+# Observer receives a structured event.
+Observer = Callable[[TransformEvent], None]
+# Factory builds an observer for a given logger (may return None if not active at current level).
+ObserverFactory = Callable[[logging.Logger], Optional[Observer]]
+
+
+@runtime_checkable
+class SupportsObserver(Protocol):
+    def set_observer(self, observer: Optional[Observer]) -> None:
+        ...
+
+
+class ObserverRegistry:
+    def __init__(self, factories: Optional[Mapping[str, ObserverFactory]] = None) -> None:
+        self._factories: dict[str, ObserverFactory] = dict(factories or {})
+
+    def register(self, name: str, factory: ObserverFactory) -> None:
+        self._factories[name] = factory
+
+    def get(self, name: str, logger: logging.Logger) -> Optional[Observer]:
+        factory = self._factories.get(name)
+        if not factory:
+            return None
+        return factory(logger)
+
+
+def _scaler_observer_factory(logger: logging.Logger) -> Optional[Observer]:
+    if not logger.isEnabledFor(logging.DEBUG):
+        return None
+
+    warned: set[str] = set()
+
+    def _observer(event: TransformEvent) -> None:
+        if event.type != "scaler_none":
+            return
+        fid = event.payload.get("feature_id")
+        if logger.isEnabledFor(logging.DEBUG):
+            if isinstance(fid, str) and fid not in warned:
+                warned.add(fid)
+                logger.warning(
+                    "Scaler encountered None value during scaling for feature=%s "
+                    "(further occurrences suppressed; consider fill/lint upstream).",
+                    fid,
+                )
+
+    return _observer
+
+
+def default_observer_registry() -> ObserverRegistry:
+    registry = ObserverRegistry()
+    registry.register("scale", _scaler_observer_factory)
+    return registry
datapipeline/pipeline/pipelines.py
CHANGED
@@ -1,7 +1,10 @@
 import heapq
 from collections.abc import Iterator, Sequence
 from typing import Any
+from itertools import tee
 
+from datapipeline.domain.sample import Sample
+from datapipeline.domain.vector import Vector
 from datapipeline.pipeline.utils.keygen import group_key_for
 from datapipeline.pipeline.utils.memory_sort import batch_sort
 from datapipeline.config.dataset.feature import FeatureRecordConfig
@@ -13,6 +16,9 @@ from datapipeline.pipeline.stages import (
     regularize_feature_stream,
     apply_feature_transforms,
     vector_assemble_stage,
+    sample_assemble_stage,
+    align_stream,
+    window_keys,
 )
 from datapipeline.pipeline.context import PipelineContext
 
@@ -72,20 +78,66 @@ def build_vector_pipeline(
     context: PipelineContext,
     configs: Sequence[FeatureRecordConfig],
     group_by_cadence: str,
-
+    target_configs: Sequence[FeatureRecordConfig] | None = None,
+    *,
+    rectangular: bool = True,
 ) -> Iterator[Any]:
-    """Build the vector assembly pipeline.
-
-
-
-
-
-
-
-
-
+    """Build the vector assembly pipeline for features and optionally attach targets."""
+    feature_cfgs = list(configs)
+    target_cfgs = list(target_configs or [])
+    if not feature_cfgs and not target_cfgs:
+        return iter(())
+
+    if rectangular:
+        start, end = context.window_bounds(rectangular_required=True)
+        keys = window_keys(start, end, group_by_cadence)
+    else:
+        keys = None
+
+    feature_vectors = _assemble_vectors(
+        context,
+        feature_cfgs,
+        group_by_cadence,
+    )
+    if keys is not None:
+        # share keys across feature/target alignment
+        if target_cfgs:
+            keys_feature, keys_target = tee(keys, 2)
+        else:
+            keys_feature = keys
+            keys_target = None
+        feature_vectors = align_stream(feature_vectors, keys=keys_feature)
+    else:
+        keys_target = None
+
+    if not target_cfgs:
+        return sample_assemble_stage(feature_vectors)
+
+    target_vectors = _assemble_vectors(
+        context,
+        target_cfgs,
+        group_by_cadence,
+    )
+    if keys is not None:
+        target_vectors = align_stream(target_vectors, keys=keys_target)
+    return sample_assemble_stage(feature_vectors, target_vectors)
+
+
+def _assemble_vectors(
+    context: PipelineContext,
+    configs: Sequence[FeatureRecordConfig],
+    group_by_cadence: str,
+) -> Iterator[tuple[tuple, Vector]]:
+    if not configs:
+        return iter(())
+    streams = [
+        build_feature_pipeline(
+            context,
+            cfg,
+        )
+        for cfg in configs
+    ]
     merged = heapq.merge(
         *streams, key=lambda fr: group_key_for(fr, group_by_cadence)
     )
-
-    return vectors
+    return vector_assemble_stage(merged, group_by_cadence)
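The new `observability` module wires transform-level diagnostics through `PipelineContext` and `apply_transforms`. For orientation, a minimal, hypothetical sketch (not taken from the package) of driving the registry directly; the logger name and the `"price"` feature id are invented, and the `"scale"` factory only yields an observer when DEBUG logging is enabled:

```python
# Hypothetical sketch: exercising the observer registry added in
# datapipeline/pipeline/observability.py. Logger name and "price" id are invented.
import logging

from datapipeline.pipeline.observability import TransformEvent, default_observer_registry

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("vector.scale")

registry = default_observer_registry()
observer = registry.get("scale", logger)  # None unless the logger is DEBUG-enabled

if observer is not None:
    # Only "scaler_none" events are acted on; repeats per feature id are suppressed.
    observer(TransformEvent(type="scaler_none", payload={"feature_id": "price"}))
```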
datapipeline/pipeline/split.py
CHANGED
@@ -1,12 +1,12 @@
 import hashlib
-from collections.abc import Iterator, Mapping,
+from collections.abc import Iterator, Mapping, Sequence
 from datetime import datetime
-from typing import Any, Literal
+from typing import Any, Literal
 
+from datapipeline.domain.sample import Sample
 from datapipeline.domain.vector import Vector
 from datapipeline.config.split import (
     SplitConfig,
-    HashSplitConfig,
     TimeSplitConfig,
 )
 
@@ -121,24 +121,25 @@ class VectorSplitApplicator:
             self._keep is None or self._keep_placeholder)
         )
 
-    def __call__(self, stream: Iterator[
+    def __call__(self, stream: Iterator[Sample]) -> Iterator[Sample]:
         return self.apply(stream)
 
-    def apply(self, stream: Iterator[
-        for
+    def apply(self, stream: Iterator[Sample]) -> Iterator[Sample]:
+        for sample in stream:
+            group_key, vector = sample.key, sample.features
             label = self._labeler.label(group_key, vector)
             if self._output == "filter":
                 if not self._filter_enabled:
-                    yield
+                    yield sample
                    continue
                 if label == self._keep:
-                    yield
+                    yield sample
                 else:
                     continue
             else:
                 data = clone(vector.values)
                 data[self._field] = label
-                yield
+                yield sample.with_features(Vector(values=data))
 
 
 def build_labeler(cfg: SplitConfig) -> BaseLabeler:
@@ -153,7 +154,7 @@ def build_applicator(cfg: SplitConfig, keep: str | None = None) -> VectorSplitAp
     return VectorSplitApplicator(labeler=labeler, output="filter", keep=selected)
 
 
-def apply_split_stage(runtime, stream: Iterator[
+def apply_split_stage(runtime, stream: Iterator[Sample]) -> Iterator[Sample]:
     """Apply project-configured split at the end of the vector pipeline.
 
     Reads `runtime.split` (set during bootstrap from project.globals.split) and,
datapipeline/pipeline/stages.py
CHANGED
@@ -1,20 +1,25 @@
 from collections import defaultdict
-from itertools import groupby
-from typing import Any, Iterable, Iterator,
+from itertools import chain, groupby
+from typing import Any, Iterable, Iterator, Mapping
+from datetime import datetime
+
 from datapipeline.pipeline.context import PipelineContext
-from datapipeline.services.artifacts import PARTITIONED_IDS_SPEC
 from datapipeline.services.constants import POSTPROCESS_TRANSFORMS, SCALER_STATISTICS
 
 from datapipeline.domain.feature import FeatureRecord, FeatureRecordSequence
 from datapipeline.domain.vector import Vector, vectorize_record_group
+from datapipeline.domain.sample import Sample
 from datapipeline.pipeline.utils.memory_sort import batch_sort
+
 from datapipeline.pipeline.utils.transform_utils import apply_transforms
 from datapipeline.plugins import FEATURE_TRANSFORMS_EP, VECTOR_TRANSFORMS_EP, RECORD_TRANSFORMS_EP, STREAM_TRANFORMS_EP, DEBUG_TRANSFORMS_EP
 
 from datapipeline.domain.record import TemporalRecord
 from datapipeline.pipeline.utils.keygen import FeatureIdGenerator, group_key_for
 from datapipeline.sources.models.source import Source
-from datapipeline.
+from datapipeline.transforms.vector import VectorEnsureSchemaTransform
+from datapipeline.config.dataset.normalize import floor_time_to_bucket
+from datapipeline.utils.time import parse_timecode
 
 
 def open_source_stream(context: PipelineContext, stream_alias: str) -> Source:
@@ -39,7 +44,8 @@ def apply_record_operations(
 ) -> Iterator[TemporalRecord]:
     """Apply record transforms defined in contract policies in order."""
     steps = context.runtime.registries.record_operations.get(stream_id)
-    records = apply_transforms(
+    records = apply_transforms(
+        record_stream, RECORD_TRANSFORMS_EP, steps, context)
     return records
 
 
@@ -65,7 +71,7 @@ def regularize_feature_stream(
     batch_size: int,
 ) -> Iterator[FeatureRecord]:
     """Apply feature transforms defined in contract policies in order."""
-    # Sort by (id, time) to satisfy stream transforms (
+    # Sort by (id, time) to satisfy stream transforms (ensure_cadence/fill)
     sorted = batch_sort(
         feature_stream,
         batch_size=batch_size,
@@ -121,11 +127,8 @@ def apply_feature_transforms(
 def vector_assemble_stage(
     merged: Iterator[FeatureRecord | FeatureRecordSequence],
     group_by_cadence: str,
-) -> Iterator[
-    """Group
-    Coalesce each partitioned feature_id into record buckets.
-    Yield (group_key, Vector) pairs ready for downstream consumption."""
-
+) -> Iterator[tuple[tuple, Vector]]:
+    """Group merged feature stream by key and emit raw vectors."""
     for group_key, group in groupby(
         merged, key=lambda fr: group_key_for(fr, group_by_cadence)
     ):
@@ -136,23 +139,131 @@ def vector_assemble_stage(
         else:
             records = [fr.record]
         feature_map[fr.id].extend(records)
-
+        vector = vectorize_record_group(feature_map)
+        yield group_key, vector
+
+
+def window_keys(start: datetime | None, end: datetime | None, cadence: str | None) -> Iterator[tuple] | None:
+    if start is None or end is None or cadence is None:
+        return None
+    try:
+        current = floor_time_to_bucket(start, cadence)
+        stop = floor_time_to_bucket(end, cadence)
+        step = parse_timecode(cadence)
+    except Exception:
+        return None
+    if stop < current:
+        return None
+
+    def _iter():
+        t = current
+        while t <= stop:
+            yield (t,)
+            t = t + step
+
+    return _iter()
+
+
+def align_stream(
+    stream: Iterator[tuple[tuple, Vector]] | None,
+    keys: Iterator[tuple] | None,
+) -> Iterator[tuple[tuple, Vector]]:
+    if keys is None:
+        return iter(stream or ())
+    it = iter(stream or ())
+    current = next(it, None)
+    for key in keys:
+        while current and current[0] < key:
+            current = next(it, None)
+        if current and current[0] == key:
+            yield current
+            current = next(it, None)
+        else:
+            yield (key, Vector(values={}))
+
+
+def sample_assemble_stage(
+    feature_vectors: Iterator[tuple[tuple, Vector]],
+    target_vectors: Iterator[tuple[tuple, Vector]] | None = None,
+) -> Iterator[Sample]:
+    """Combine feature/target vectors into Sample objects."""
+    feature_iter = iter(feature_vectors)
+    target_iter = iter(target_vectors or ())
+
+    def _advance(it):
+        try:
+            return next(it)
+        except StopIteration:
+            return None
+
+    current_feature = _advance(feature_iter)
+    current_target = _advance(target_iter)
+
+    while current_feature:
+        feature_key, feature_vector = current_feature
+        targets = None
+
+        while current_target and current_target[0] < feature_key:
+            current_target = _advance(target_iter)
+
+        if current_target and current_target[0] == feature_key:
+            targets = current_target[1]
+            current_target = _advance(target_iter)
+
+        yield Sample(key=feature_key, features=feature_vector, targets=targets)
+        current_feature = _advance(feature_iter)
 
 
 def post_process(
     context: PipelineContext,
-    stream: Iterator[
-) -> Iterator[
+    stream: Iterator[Sample],
+) -> Iterator[Sample]:
     """Apply project-scoped postprocess transforms (from registry).
 
     Explicit prereq artifact flow:
     - Read a precomputed expected feature-id list (full ids) from the build
       folder. If missing, instruct the user to generate it via CLI.
     """
+    stream = _apply_vector_schema(context, stream)
     runtime = context.runtime
     transforms = runtime.registries.postprocesses.get(POSTPROCESS_TRANSFORMS)
-
     if not transforms:
         return stream
-
     return apply_transforms(stream, VECTOR_TRANSFORMS_EP, transforms, context)
+
+
+def _apply_vector_schema(
    context: PipelineContext,
+    stream: Iterator[Sample],
+) -> Iterator[Sample]:
+    with context.activate():
+        feature_entries = context.load_schema(payload="features")
+        target_entries = context.load_schema(payload="targets")
+
+    if not feature_entries:
+        if context.schema_required:
+            raise RuntimeError("Schema missing for payload 'features'. Run `jerry build` to materialize schema.json.")
+        feature_stream = stream
+    else:
+        feature_schema = VectorEnsureSchemaTransform(on_missing="fill", on_extra="drop")
+        feature_schema.bind_context(context)
+        feature_stream = feature_schema(stream)
+
+    def _apply_targets(upstream: Iterator[Sample]) -> Iterator[Sample]:
+        if target_entries:
+            target_schema = VectorEnsureSchemaTransform(payload="targets", on_missing="fill", on_extra="drop")
+            target_schema.bind_context(context)
+            return target_schema(upstream)
+        if not context.schema_required:
+            return upstream
+        # schema required but missing: only raise if targets are present in stream
+        iterator = iter(upstream)
+        try:
+            first = next(iterator)
+        except StopIteration:
+            return iter(())
+        if first.targets is None:
+            return chain([first], iterator)
+        raise RuntimeError("Schema missing for payload 'targets'. Run `jerry build` to materialize schema.json.")
+
+    return _apply_targets(feature_stream)
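`sample_assemble_stage` merges two key-ordered vector streams into `Sample` objects, carrying `targets=None` where no target vector shares the feature key. A small, hypothetical sketch (inputs invented; the `Vector`/`Sample` constructors mirror the usages visible in this diff):

```python
# Hypothetical sketch: pairing key-ordered feature and target vector streams into
# Sample objects; keys without a matching target yield targets=None.
from datetime import datetime

from datapipeline.domain.vector import Vector
from datapipeline.pipeline.stages import sample_assemble_stage

t0, t1 = datetime(2024, 1, 1, 0), datetime(2024, 1, 1, 1)
features = iter([((t0,), Vector(values={"x": 1.0})), ((t1,), Vector(values={"x": 2.0}))])
targets = iter([((t1,), Vector(values={"y": 0.5}))])  # no target for t0

for sample in sample_assemble_stage(features, targets):
    print(sample.key, sample.features.values, sample.targets.values if sample.targets else None)
```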
datapipeline/pipeline/utils/keygen.py
CHANGED
@@ -1,7 +1,8 @@
 from typing import Union, List, Any
 from datetime import datetime
 
-from datapipeline.config.dataset.normalize import
+from datapipeline.config.dataset.normalize import floor_time_to_bucket
+from datapipeline.transforms.vector_utils import PARTITION_SEP
 
 
 class FeatureIdGenerator:
@@ -9,18 +10,30 @@ class FeatureIdGenerator:
     Generates unique feature keys by appending suffixes from expand_by fields.
     """
 
+    COMPONENT_PREFIX = "@"
+    COMPONENT_JOINER = "_"
+    VALUE_DELIMITER = ":"
+
     def __init__(self, partition_by: Union[str, List[str], None]):
         self.partition_by = partition_by
 
+    def _format_component(self, field: str, value: Any) -> str:
+        value_str = "" if value is None else str(value)
+        return f"{self.COMPONENT_PREFIX}{field}{self.VALUE_DELIMITER}{value_str}"
+
     def generate(self, base_id: str, record: Any) -> str:
         if not self.partition_by:
             return base_id
         if isinstance(self.partition_by, str):
-
+            value = getattr(record, self.partition_by)
+            suffix = self._format_component(self.partition_by, value)
         else:
-
-
-
+            parts = [
+                self._format_component(field, getattr(record, field))
+                for field in self.partition_by
+            ]
+            suffix = self.COMPONENT_JOINER.join(parts)
+        return f"{base_id}{PARTITION_SEP}{suffix}"
 
 
 def _anchor_time(item: Any) -> datetime | None:
@@ -36,7 +49,7 @@ def _anchor_time(item: Any) -> datetime | None:
     return getattr(recs[0], "time", None) if recs else None
 
 
-def group_key_for(item: Any,
+def group_key_for(item: Any, cadence: str) -> tuple:
     """Compute 1-tuple bucket key from a FeatureRecord or FeatureRecordSequence."""
     t = _anchor_time(item)
-    return (
+    return (floor_time_to_bucket(t, cadence),)
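The reworked `FeatureIdGenerator` builds partition suffixes from `@field:value` components joined with `_`, appended to the base id with `PARTITION_SEP` (defined in `datapipeline.transforms.vector_utils`; its value is not shown in this diff). A hypothetical sketch with an invented record type:

```python
# Hypothetical sketch: record type and values are invented. The suffix below is
# "@exchange:NYSE_@symbol:ABC"; the full id is f"price{PARTITION_SEP}{suffix}".
from dataclasses import dataclass

from datapipeline.pipeline.utils.keygen import FeatureIdGenerator


@dataclass
class TickRecord:
    exchange: str
    symbol: str


gen = FeatureIdGenerator(partition_by=["exchange", "symbol"])
print(gen.generate("price", TickRecord(exchange="NYSE", symbol="ABC")))
```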
datapipeline/pipeline/utils/memory_sort.py
CHANGED
@@ -1,17 +1,12 @@
 from typing import Iterable, Iterator, Callable, TypeVar
 import heapq
-
-
-def apply_pipeline(stream, stages):
-    for stage in stages:
-        stream = stage(stream)
-    return stream
+from itertools import count
 
 
 T = TypeVar("T")
 
 
-def read_batches(iterable: Iterable[T], batch_size: int, key: Callable[[T],
+def read_batches(iterable: Iterable[T], batch_size: int, key: Callable[[T], object]) -> Iterator[list[T]]:
     batch = []
     for item in iterable:
         batch.append(item)
@@ -22,6 +17,23 @@ def read_batches(iterable: Iterable[T], batch_size: int, key: Callable[[T], any]
         yield sorted(batch, key=key)
 
 
-def batch_sort(iterable: Iterable[T], batch_size: int, key: Callable[[T],
-
-
+def batch_sort(iterable: Iterable[T], batch_size: int, key: Callable[[T], object]) -> Iterator[T]:
+    """Sort an iterable by chunking then merging to reduce peak memory usage."""
+    batches = read_batches(iterable, batch_size, key)
+
+    heap: list[tuple[object, int, T, Iterator[T]]] = []
+    seq = count()
+
+    for batch in batches:
+        it = iter(batch)
+        first = next(it, None)
+        if first is None:
+            continue
+        heapq.heappush(heap, (key(first), next(seq), first, it))
+
+    while heap:
+        _, _, item, it = heapq.heappop(heap)
+        yield item
+        nxt = next(it, None)
+        if nxt is not None:
+            heapq.heappush(heap, (key(nxt), next(seq), nxt, it))
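`batch_sort` replaces the old helper with a chunk-then-merge sort: each batch is sorted in memory and the sorted batches are merged via a heap, so only about `batch_size` items are held for sorting at once. A minimal, hypothetical usage sketch (data invented):

```python
# Hypothetical sketch: sort a small stream in fixed-size chunks, then heap-merge.
from datapipeline.pipeline.utils.memory_sort import batch_sort

data = [5, 1, 4, 2, 3, 0]
print(list(batch_sort(data, batch_size=2, key=lambda x: x)))  # expected: [0, 1, 2, 3, 4, 5]
```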
datapipeline/pipeline/utils/transform_utils.py
CHANGED
@@ -1,9 +1,11 @@
+import logging
 from collections.abc import Callable, Iterator, Mapping, Sequence
 from typing import Any, Optional, Tuple
 from inspect import isclass, signature, Parameter
 from contextlib import nullcontext
 
 from datapipeline.pipeline.context import PipelineContext
+from datapipeline.pipeline.observability import ObserverRegistry, SupportsObserver, TransformEvent
 
 from datapipeline.utils.load import load_ep
 
@@ -79,9 +81,16 @@ def apply_transforms(
     group: str,
     transforms: Optional[Sequence[Mapping[str, Any]]],
     context: Optional[PipelineContext] = None,
+    observer: Callable[[TransformEvent], None] | None = None,
+    observer_registry: ObserverRegistry | None = None,
 ) -> Iterator[Any]:
     """Instantiate and apply configured transforms in order."""
 
+    observer = observer or (getattr(context, "transform_observer", None)
+                            if context is not None else None)
+    registry = observer_registry or (getattr(context, "observer_registry", None)
+                                     if context is not None else None)
+
     context_cm = context.activate() if context else nullcontext()
     with context_cm:
         for transform in transforms or ():
@@ -90,7 +99,20 @@ def apply_transforms(
             if isclass(ep):
                 inst = _instantiate_entry_point(ep, params, context)
                 _bind_context(inst, context)
+                eff_observer = observer
+                if eff_observer is None and registry:
+                    eff_observer = registry.get(
+                        name, logging.getLogger(f"{group}.{name}")
+                    )
+                _attach_observer(inst, eff_observer)
                 stream = inst(stream)
             else:
                 stream = _call_with_params(ep, stream, params, context)
     return stream
+
+
+def _attach_observer(transform: Any, observer: Callable[..., None] | None) -> None:
+    if observer is None:
+        return
+    if isinstance(transform, SupportsObserver):
+        transform.set_observer(observer)