jerry-thomas 0.3.0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as published in their public registries.
- datapipeline/analysis/vector/collector.py +120 -17
- datapipeline/analysis/vector/matrix.py +33 -8
- datapipeline/analysis/vector/report.py +162 -32
- datapipeline/build/tasks/__init__.py +11 -0
- datapipeline/build/tasks/config.py +74 -0
- datapipeline/build/tasks/metadata.py +170 -0
- datapipeline/build/tasks/scaler.py +73 -0
- datapipeline/build/tasks/schema.py +60 -0
- datapipeline/build/tasks/utils.py +169 -0
- datapipeline/cli/app.py +304 -127
- datapipeline/cli/commands/build.py +240 -16
- datapipeline/cli/commands/contract.py +367 -0
- datapipeline/cli/commands/domain.py +8 -3
- datapipeline/cli/commands/inspect.py +401 -149
- datapipeline/cli/commands/list_.py +30 -7
- datapipeline/cli/commands/plugin.py +1 -1
- datapipeline/cli/commands/run.py +227 -241
- datapipeline/cli/commands/run_config.py +101 -0
- datapipeline/cli/commands/serve_pipeline.py +156 -0
- datapipeline/cli/commands/source.py +44 -8
- datapipeline/cli/visuals/__init__.py +4 -2
- datapipeline/cli/visuals/common.py +239 -0
- datapipeline/cli/visuals/labels.py +15 -15
- datapipeline/cli/visuals/runner.py +66 -0
- datapipeline/cli/visuals/sections.py +20 -0
- datapipeline/cli/visuals/sources.py +132 -119
- datapipeline/cli/visuals/sources_basic.py +260 -0
- datapipeline/cli/visuals/sources_off.py +76 -0
- datapipeline/cli/visuals/sources_rich.py +414 -0
- datapipeline/config/catalog.py +37 -3
- datapipeline/config/context.py +214 -0
- datapipeline/config/dataset/loader.py +21 -4
- datapipeline/config/dataset/normalize.py +4 -4
- datapipeline/config/metadata.py +43 -0
- datapipeline/config/postprocess.py +2 -2
- datapipeline/config/project.py +3 -2
- datapipeline/config/resolution.py +129 -0
- datapipeline/config/tasks.py +309 -0
- datapipeline/config/workspace.py +155 -0
- datapipeline/domain/__init__.py +12 -0
- datapipeline/domain/record.py +11 -0
- datapipeline/domain/sample.py +54 -0
- datapipeline/integrations/ml/adapter.py +34 -20
- datapipeline/integrations/ml/pandas_support.py +0 -2
- datapipeline/integrations/ml/rows.py +1 -6
- datapipeline/integrations/ml/torch_support.py +1 -3
- datapipeline/io/factory.py +112 -0
- datapipeline/io/output.py +132 -0
- datapipeline/io/protocols.py +21 -0
- datapipeline/io/serializers.py +219 -0
- datapipeline/io/sinks/__init__.py +23 -0
- datapipeline/io/sinks/base.py +2 -0
- datapipeline/io/sinks/files.py +79 -0
- datapipeline/io/sinks/rich.py +57 -0
- datapipeline/io/sinks/stdout.py +18 -0
- datapipeline/io/writers/__init__.py +14 -0
- datapipeline/io/writers/base.py +28 -0
- datapipeline/io/writers/csv_writer.py +25 -0
- datapipeline/io/writers/jsonl.py +52 -0
- datapipeline/io/writers/pickle_writer.py +30 -0
- datapipeline/pipeline/artifacts.py +58 -0
- datapipeline/pipeline/context.py +66 -7
- datapipeline/pipeline/observability.py +65 -0
- datapipeline/pipeline/pipelines.py +65 -13
- datapipeline/pipeline/split.py +11 -10
- datapipeline/pipeline/stages.py +127 -16
- datapipeline/pipeline/utils/keygen.py +20 -7
- datapipeline/pipeline/utils/memory_sort.py +22 -10
- datapipeline/pipeline/utils/transform_utils.py +22 -0
- datapipeline/runtime.py +5 -2
- datapipeline/services/artifacts.py +12 -6
- datapipeline/services/bootstrap/config.py +25 -0
- datapipeline/services/bootstrap/core.py +52 -37
- datapipeline/services/constants.py +6 -5
- datapipeline/services/factories.py +123 -1
- datapipeline/services/project_paths.py +43 -16
- datapipeline/services/runs.py +208 -0
- datapipeline/services/scaffold/domain.py +3 -2
- datapipeline/services/scaffold/filter.py +3 -2
- datapipeline/services/scaffold/mappers.py +9 -6
- datapipeline/services/scaffold/plugin.py +3 -3
- datapipeline/services/scaffold/source.py +93 -56
- datapipeline/sources/{composed_loader.py → data_loader.py} +9 -9
- datapipeline/sources/decoders.py +83 -18
- datapipeline/sources/factory.py +26 -16
- datapipeline/sources/models/__init__.py +2 -2
- datapipeline/sources/models/generator.py +0 -7
- datapipeline/sources/models/loader.py +3 -3
- datapipeline/sources/models/parsing_error.py +24 -0
- datapipeline/sources/models/source.py +6 -6
- datapipeline/sources/synthetic/time/loader.py +14 -2
- datapipeline/sources/transports.py +74 -37
- datapipeline/templates/plugin_skeleton/README.md +74 -30
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +31 -0
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +30 -0
- datapipeline/templates/plugin_skeleton/example/dataset.yaml +18 -0
- datapipeline/templates/plugin_skeleton/example/postprocess.yaml +29 -0
- datapipeline/templates/plugin_skeleton/{config/datasets/default → example}/project.yaml +11 -8
- datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +12 -0
- datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +3 -0
- datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +9 -0
- datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +2 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +4 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +28 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +4 -0
- datapipeline/templates/plugin_skeleton/jerry.yaml +28 -0
- datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +31 -0
- datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +30 -0
- datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +18 -0
- datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +29 -0
- datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +22 -0
- datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +12 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +3 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +9 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +2 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +4 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +28 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +4 -0
- datapipeline/templates/stubs/dto.py.j2 +2 -0
- datapipeline/templates/stubs/mapper.py.j2 +5 -4
- datapipeline/templates/stubs/parser.py.j2 +2 -0
- datapipeline/templates/stubs/record.py.j2 +2 -0
- datapipeline/templates/stubs/source.yaml.j2 +2 -3
- datapipeline/transforms/debug/lint.py +26 -41
- datapipeline/transforms/feature/scaler.py +89 -13
- datapipeline/transforms/record/floor_time.py +4 -4
- datapipeline/transforms/sequence.py +2 -35
- datapipeline/transforms/stream/dedupe.py +24 -0
- datapipeline/transforms/stream/ensure_ticks.py +7 -6
- datapipeline/transforms/vector/__init__.py +5 -0
- datapipeline/transforms/vector/common.py +98 -0
- datapipeline/transforms/vector/drop/__init__.py +4 -0
- datapipeline/transforms/vector/drop/horizontal.py +79 -0
- datapipeline/transforms/vector/drop/orchestrator.py +59 -0
- datapipeline/transforms/vector/drop/vertical.py +182 -0
- datapipeline/transforms/vector/ensure_schema.py +184 -0
- datapipeline/transforms/vector/fill.py +87 -0
- datapipeline/transforms/vector/replace.py +62 -0
- datapipeline/utils/load.py +24 -3
- datapipeline/utils/rich_compat.py +38 -0
- datapipeline/utils/window.py +76 -0
- jerry_thomas-1.0.0.dist-info/METADATA +825 -0
- jerry_thomas-1.0.0.dist-info/RECORD +199 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/entry_points.txt +9 -8
- datapipeline/build/tasks.py +0 -186
- datapipeline/cli/commands/link.py +0 -128
- datapipeline/cli/commands/writers.py +0 -138
- datapipeline/config/build.py +0 -64
- datapipeline/config/run.py +0 -116
- datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.synthetic.yaml +0 -24
- datapipeline/templates/plugin_skeleton/config/contracts/time_linear.synthetic.yaml +0 -23
- datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +0 -9
- datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +0 -14
- datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +0 -13
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml +0 -11
- datapipeline/transforms/vector.py +0 -210
- jerry_thomas-0.3.0.dist-info/METADATA +0 -502
- jerry_thomas-0.3.0.dist-info/RECORD +0 -139
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/WHEEL +0 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/top_level.txt +0 -0
datapipeline/transforms/vector/drop/orchestrator.py
ADDED

```python
from __future__ import annotations

from collections.abc import Iterator
from typing import Literal

from datapipeline.domain.sample import Sample

from .horizontal import VectorDropHorizontalTransform
from .vertical import VectorDropVerticalTransform

Axis = Literal["horizontal", "vertical"]


class VectorDropTransform:
    """Drop vectors or features based on coverage thresholds.

    Thin orchestrator that delegates to horizontal or vertical strategies based
    on the configured axis.
    """

    def __init__(
        self,
        *,
        axis: Axis = "horizontal",
        threshold: float,
        payload: Literal["features", "targets", "both"] = "features",
        only: list[str] | None = None,
        exclude: list[str] | None = None,
    ) -> None:
        if axis not in {"horizontal", "vertical"}:
            raise ValueError("axis must be 'horizontal' or 'vertical'")
        if axis == "vertical" and payload == "both":
            raise ValueError("axis='vertical' does not support payload='both'")
        if axis == "horizontal":
            self._impl: object = VectorDropHorizontalTransform(
                threshold=threshold,
                payload=payload,
                only=only,
                exclude=exclude,
            )
        else:
            # Vertical drop is partition/feature-oriented and does not support
            # payload='both'. Payload is validated above.
            self._impl = VectorDropVerticalTransform(
                payload=payload if payload != "both" else "features",
                threshold=threshold,
            )

    def bind_context(self, context) -> None:
        binder = getattr(self._impl, "bind_context", None)
        if binder is not None:
            binder(context)

    def __call__(self, stream: Iterator[Sample]) -> Iterator[Sample]:
        return self.apply(stream)

    def apply(self, stream: Iterator[Sample]) -> Iterator[Sample]:
        return getattr(self._impl, "apply")(stream)
```
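For orientation, a minimal usage sketch of the orchestrator above. The constructor keywords and the `__call__` → `apply` delegation come from the file itself; the empty stream is a placeholder for a real `Iterator[Sample]` wired up by the pipeline.

```python
# Hypothetical wiring: drop rows whose feature vectors fall below 80% coverage.
from datapipeline.transforms.vector.drop.orchestrator import VectorDropTransform

drop = VectorDropTransform(axis="horizontal", threshold=0.8, payload="features")

samples: list = []                # stand-in for a real Iterator[Sample]
print(list(drop(iter(samples))))  # __call__ forwards to apply(), which delegates
```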
datapipeline/transforms/vector/drop/vertical.py
ADDED

```python
from __future__ import annotations

from collections.abc import Iterator
from typing import Literal

from datapipeline.config.metadata import (
    FEATURE_VECTORS_COUNT_KEY,
    TARGET_VECTORS_COUNT_KEY,
    VectorMetadata,
)
from datapipeline.domain.sample import Sample
from datapipeline.domain.vector import Vector
from datapipeline.services.artifacts import (
    ArtifactNotRegisteredError,
    VECTOR_METADATA_SPEC,
)

from ..common import (
    VectorContextMixin,
    replace_vector,
    select_vector,
    try_get_current_context,
)


class VectorDropVerticalTransform(VectorContextMixin):
    required_artifacts = {VECTOR_METADATA_SPEC.key}
    """Drop partitions/features when metadata coverage falls below configured thresholds.

    Requires the optional `metadata.json` artifact generated by the
    `metadata` build task. The transform evaluates coverage using the recorded
    `present_count`/`null_count` metrics and prunes the schema cache once so
    downstream coverage checks stop expecting bad partitions.
    """

    def __init__(
        self,
        *,
        payload: Literal["features", "targets"] = "features",
        threshold: float,
    ) -> None:
        super().__init__(payload=payload)
        if not 0.0 <= threshold <= 1.0:
            raise ValueError("threshold must be between 0 and 1.")
        self._threshold = threshold
        self._drop_ids: set[str] | None = None
        self._schema_pruned = False

    def __call__(self, stream: Iterator[Sample]) -> Iterator[Sample]:
        return self.apply(stream)

    def apply(self, stream: Iterator[Sample]) -> Iterator[Sample]:
        drop_ids = self._resolve_drop_ids()
        if not drop_ids:
            yield from stream
            return
        self._maybe_prune_schema(drop_ids)
        for sample in stream:
            if not self._schema_pruned:
                self._maybe_prune_schema(drop_ids)
            vector = select_vector(sample, self._payload)
            if vector is None or not vector.values:
                yield sample
                continue
            retained: dict[str, object] = {}
            changed = False
            for fid, value in vector.values.items():
                if fid in drop_ids:
                    changed = True
                    continue
                retained[fid] = value
            if not changed:
                yield sample
            else:
                yield replace_vector(sample, self._payload, Vector(values=retained))

    def _resolve_drop_ids(self) -> set[str]:
        if self._drop_ids is not None:
            return self._drop_ids
        context = self._context or try_get_current_context()
        if not context:
            raise RuntimeError("VectorDropVerticalTransform requires an active pipeline context.")
        try:
            raw = context.require_artifact(VECTOR_METADATA_SPEC)
        except ArtifactNotRegisteredError as exc:
            raise RuntimeError(
                "Vector metadata artifact missing. Enable the `metadata` build task "
                "and rerun `jerry build --project <project.yaml>`."
            ) from exc
        meta = VectorMetadata.model_validate(raw)
        section_key = "targets" if self._payload == "targets" else "features"
        counts_key = (
            TARGET_VECTORS_COUNT_KEY
            if self._payload == "targets"
            else FEATURE_VECTORS_COUNT_KEY
        )

        entries = getattr(meta, section_key) or []
        window_size = self._window_size(getattr(meta, "window", None))
        total = window_size if window_size is not None else meta.counts.get(counts_key)
        if not isinstance(total, (int, float)) or total <= 0:
            if self._payload == "targets":
                raise RuntimeError(
                    "Vector metadata artifact missing counts for targets; "
                    "ensure your dataset defines target streams and rebuild."
                )
            raise RuntimeError(
                "Vector metadata artifact missing counts for features; "
                "rerun `jerry build --project <project.yaml>` to refresh metadata."
            )
        expected_buckets = float(total)
        drop_ids: set[str] = set()
        for entry in entries:
            if not isinstance(entry, dict):
                continue
            fid = entry.get("id")
            if not isinstance(fid, str):
                continue
            coverage = self._coverage_for_entry(entry, expected_buckets)
            if coverage < self._threshold:
                drop_ids.add(fid)
        self._drop_ids = drop_ids
        return drop_ids

    @staticmethod
    def _window_size(window) -> float | None:
        if window is None:
            return None
        if isinstance(window, dict):
            return window.get("size")
        return getattr(window, "size", None)

    @staticmethod
    def _coverage_for_entry(entry: dict, expected_buckets: float) -> float:
        if expected_buckets <= 0:
            return 0.0
        present = float(entry.get("present_count") or 0.0)
        nulls = float(entry.get("null_count") or 0.0)
        cadence_doc = entry.get("cadence")
        cadence = cadence_doc.get("target") if isinstance(cadence_doc, dict) else None
        observed_elements = entry.get("observed_elements")

        if isinstance(observed_elements, (int, float)) and cadence:
            # Base expected elements on buckets where this feature actually appeared
            # to avoid over-crediting sparse sequences.
            expected_elements = float(max(present, 0.0)) * float(cadence)
            if expected_elements > 0:
                return max(
                    0.0,
                    min(1.0, float(observed_elements) / expected_elements),
                )

        coverage = (present - nulls) / expected_buckets
        return max(0.0, min(1.0, coverage))

    def _maybe_prune_schema(self, drop_ids: set[str]) -> None:
        if self._schema_pruned or not drop_ids:
            return
        context = self._context or try_get_current_context()
        if not context:
            self._schema_pruned = True
            return
        cache = getattr(context, "_cache", None)
        if cache is None:
            self._schema_pruned = True
            return
        schema_key = f"schema:{self._payload}"
        if schema_key not in cache:
            return
        entries = cache.get(schema_key)
        if not entries:
            self._schema_pruned = True
            return
        kept = [entry for entry in entries if entry.get("id") not in drop_ids]
        cache[schema_key] = kept
        ids_key = f"expected_ids:{self._payload}"
        cache[ids_key] = [
            entry.get("id")
            for entry in kept
            if isinstance(entry.get("id"), str)
        ]
        self._schema_pruned = True
```
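The coverage rule in `_coverage_for_entry` is easiest to see with plain numbers. Below is a standalone restatement of the same arithmetic; the field names match the metadata keys read above, while the sample entry itself is invented.

```python
def coverage(entry: dict, expected_buckets: float) -> float:
    # Same arithmetic as _coverage_for_entry above, without the class around it.
    if expected_buckets <= 0:
        return 0.0
    present = float(entry.get("present_count") or 0.0)
    nulls = float(entry.get("null_count") or 0.0)
    cadence_doc = entry.get("cadence")
    cadence = cadence_doc.get("target") if isinstance(cadence_doc, dict) else None
    observed = entry.get("observed_elements")
    if isinstance(observed, (int, float)) and cadence:
        # Element-level path: credit only buckets where the feature appeared.
        expected_elements = max(present, 0.0) * float(cadence)
        if expected_elements > 0:
            return max(0.0, min(1.0, float(observed) / expected_elements))
    # Bucket-level path: nulls count against presence.
    return max(0.0, min(1.0, (present - nulls) / expected_buckets))

# A feature present in 90 of 100 buckets with 5 null entries scores 0.85,
# so threshold=0.9 would mark it for dropping while threshold=0.8 keeps it.
print(coverage({"present_count": 90, "null_count": 5}, 100.0))  # 0.85
```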
datapipeline/transforms/vector/ensure_schema.py
ADDED

```python
from __future__ import annotations

from collections import OrderedDict
from collections.abc import Iterator
from typing import Any, Literal

from datapipeline.domain.sample import Sample
from datapipeline.domain.vector import Vector
from datapipeline.transforms.vector_utils import clone

from .common import VectorContextMixin, replace_vector, select_vector

MissingPolicy = Literal["error", "drop", "fill"]
ExtraPolicy = Literal["error", "drop", "keep"]


class VectorEnsureSchemaTransform(VectorContextMixin):
    """Ensure vectors conform to the vector schema (`schema.json`) artifact.

    Options allow filling or dropping rows with missing identifiers and
    pruning/raising on unexpected identifiers.
    """

    def __init__(
        self,
        *,
        payload: Literal["features", "targets"] = "features",
        on_missing: MissingPolicy = "error",
        fill_value: Any = None,
        on_extra: ExtraPolicy = "error",
    ) -> None:
        super().__init__(payload=payload)
        if on_missing not in {"error", "drop", "fill"}:
            raise ValueError("on_missing must be one of: 'error', 'drop', 'fill'")
        if on_extra not in {"error", "drop", "keep"}:
            raise ValueError("on_extra must be one of: 'error', 'drop', 'keep'")
        self._on_missing = on_missing
        self._fill_value = fill_value
        self._on_extra = on_extra
        self._baseline: list[str] | None = None
        self._schema_entries: list[dict[str, Any]] | None = None
        self._schema_meta: dict[str, dict[str, Any]] = {}

    def __call__(self, stream: Iterator[Sample]) -> Iterator[Sample]:
        return self.apply(stream)

    def apply(self, stream: Iterator[Sample]) -> Iterator[Sample]:
        baseline = self._schema_ids()
        baseline_set = set(baseline)

        for sample in stream:
            vector = select_vector(sample, self._payload)
            if vector is None:
                yield sample
                continue

            values = vector.values
            working = None

            missing = [fid for fid in baseline if fid not in values]
            if missing:
                decision = self._on_missing
                if decision == "error":
                    raise ValueError(
                        f"Vector missing required identifiers {missing} "
                        f"for payload '{self._payload}'."
                    )
                if decision == "drop":
                    continue
                working = clone(values)
                for fid in missing:
                    working[fid] = self._fill_value

            extras = [fid for fid in values if fid not in baseline_set]
            if extras:
                decision = self._on_extra
                if decision == "error":
                    raise ValueError(
                        f"Vector contains unexpected identifiers {extras} "
                        f"for payload '{self._payload}'."
                    )
                if decision == "drop":
                    working = working or clone(values)
                    for fid in extras:
                        working.pop(fid, None)

            current_values = working or values

            # Optionally enforce per-id cadence from schema metadata
            current_values = self._enforce_cadence(current_values)

            ordered = OrderedDict()
            for fid in baseline:
                ordered[fid] = current_values.get(fid)
            if self._on_extra == "keep":
                for fid, value in current_values.items():
                    if fid not in baseline_set:
                        ordered[fid] = value
            current_values = ordered

            if current_values is not values:
                updated_vector = Vector(values=dict(current_values))
                sample = replace_vector(sample, self._payload, updated_vector)

            yield sample

    def _schema_ids(self) -> list[str]:
        if self._baseline is None:
            entries = self._load_schema_entries()
            ordered = [entry["id"] for entry in entries if isinstance(entry.get("id"), str)]
            if not ordered:
                raise RuntimeError(
                    "Vector schema artifact is empty or unavailable; run `jerry build` "
                    "to materialize `schema.json` via the `vector_schema` task."
                )
            self._baseline = ordered
            self._schema_meta = {
                entry["id"]: entry for entry in entries if isinstance(entry.get("id"), str)
            }
        return list(self._baseline)

    def _load_schema_entries(self) -> list[dict[str, Any]]:
        if self._schema_entries is None:
            context = getattr(self, "_context", None)
            if not context:
                entries = []
            else:
                entries = context.load_schema(payload=self._payload)
            self._schema_entries = entries or []
        return self._schema_entries

    def _enforce_cadence(self, values: dict[str, Any]) -> dict[str, Any]:
        if not values or not self._schema_meta:
            return values
        adjusted = None
        for fid, value in values.items():
            meta = self._schema_meta.get(fid)
            if not meta or meta.get("kind") != "list":
                continue
            expected = self._expected_lengths(meta)
            if not expected:
                continue
            current_len = len(value) if isinstance(value, list) else (0 if value is None else 1)
            if current_len in expected:
                continue
            decision = self._on_missing
            if decision == "error":
                raise ValueError(
                    f"List feature '{fid}' length {current_len} violates schema cadence {sorted(expected)}"
                )
            if decision == "drop":
                return {}
            # fill: pad or truncate to the closest expected length
            target_len = expected[0]
            adjusted = adjusted or clone(values)
            if isinstance(value, list):
                seq = value[:target_len]
            elif value is None:
                seq = []
            else:
                seq = [value]
            if len(seq) < target_len:
                seq = seq + [self._fill_value] * (target_len - len(seq))
            adjusted[fid] = seq
        return adjusted or values

    def _expected_lengths(self, meta: dict[str, Any]) -> list[int]:
        cadence = meta.get("cadence")
        if isinstance(cadence, dict):
            target = cadence.get("target")
            if isinstance(target, (int, float)) and target > 0:
                return [int(target)]
        modes = meta.get("list_length", {}).get("modes")
        if isinstance(modes, (list, tuple)) and modes:
            ints = [int(m) for m in modes if isinstance(m, (int, float))]
            if ints:
                return sorted(ints)
        expected = meta.get("expected_length")
        if isinstance(expected, (int, float)):
            return [int(expected)]
        max_len = meta.get("list_length", {}).get("max")
        if isinstance(max_len, (int, float)) and max_len > 0:
            return [int(max_len)]
        return []
```
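A toy walk-through of the alignment step above, using plain dicts instead of `Sample`/`Vector` (the ids and values are invented): missing baseline ids are filled, extras are kept or dropped per `on_extra`, and the result is re-ordered to the schema baseline.

```python
from collections import OrderedDict

baseline = ["temp", "wind", "hum"]            # schema order
values = {"hum": 0.4, "temp": 21.0, "x9": 7}  # "wind" missing, "x9" unexpected

# on_missing="fill" with fill_value=None
working = dict(values)
for fid in (f for f in baseline if f not in working):
    working[fid] = None

# on_extra="keep": baseline ids first, extras appended in encounter order
ordered = OrderedDict((fid, working.get(fid)) for fid in baseline)
for fid, value in working.items():
    if fid not in set(baseline):
        ordered[fid] = value

print(dict(ordered))  # {'temp': 21.0, 'wind': None, 'hum': 0.4, 'x9': 7}
```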
datapipeline/transforms/vector/fill.py
ADDED

```python
from collections import deque
from collections.abc import Iterator
from statistics import mean, median
from typing import Any, Literal

from datapipeline.domain.sample import Sample
from datapipeline.domain.vector import Vector
from datapipeline.transforms.vector_utils import clone, is_missing

from .common import VectorPostprocessBase, replace_vector, select_vector


class VectorFillTransform(VectorPostprocessBase):
    """Fill missing entries using running statistics from prior buckets."""

    def __init__(
        self,
        *,
        statistic: Literal["mean", "median"] = "median",
        window: int | None = None,
        min_samples: int = 1,
        payload: Literal["features", "targets", "both"] = "features",
        only: list[str] | None = None,
        exclude: list[str] | None = None,
    ) -> None:
        super().__init__(payload=payload, only=only, exclude=exclude)
        if window is not None and window <= 0:
            raise ValueError("window must be positive when provided")
        if min_samples <= 0:
            raise ValueError("min_samples must be positive")
        self.statistic = statistic
        self.window = window
        self.min_samples = min_samples
        self.history: dict[str, deque[float]] = {}

    def _compute(self, feature_id: str) -> float | None:
        values = self.history.get(feature_id)
        if not values or len(values) < self.min_samples:
            return None
        if self.statistic == "mean":
            return float(mean(values))
        return float(median(values))

    def _push(self, feature_id: str, value: Any) -> None:
        if is_missing(value):
            return
        try:
            num = float(value)
        except (TypeError, ValueError):
            return
        bucket = self.history.setdefault(str(feature_id), deque(maxlen=self.window))
        bucket.append(num)

    def __call__(self, stream: Iterator[Sample]) -> Iterator[Sample]:
        return self.apply(stream)

    def apply(self, stream: Iterator[Sample]) -> Iterator[Sample]:
        for sample in stream:
            for kind in self._payload_kinds():
                ids = self._ids_for(kind)
                if ids:
                    sample = self._apply_to_payload(sample, kind, ids)
            yield sample

    def _apply_to_payload(
        self,
        sample: Sample,
        payload: Literal["features", "targets"],
        ids: list[str],
    ) -> Sample:
        vector = select_vector(sample, payload)
        if vector is None:
            return sample
        data = clone(vector.values)
        updated = False
        for feature in ids:
            if feature in data and not is_missing(data[feature]):
                continue
            fill = self._compute(feature)
            if fill is not None:
                data[feature] = fill
                updated = True
        for fid, value in data.items():
            self._push(fid, value)
        if not updated:
            return sample
        return replace_vector(sample, payload, Vector(values=data))
```
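The `_push`/`_compute` pair above amounts to a bounded per-feature history. A self-contained equivalent with `window=3` and the default median statistic (the observations are invented):

```python
from collections import deque
from statistics import median

history: deque[float] = deque(maxlen=3)    # mirrors window=3
for observed in (10.0, 12.0, 11.0, 14.0):
    history.append(observed)               # the oldest value (10.0) falls out

# With min_samples satisfied, a missing entry would be filled with:
print(float(median(history)))              # median of (12.0, 11.0, 14.0) -> 12.0
```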
datapipeline/transforms/vector/replace.py
ADDED

```python
from collections.abc import Iterator
from typing import Any, Literal

from datapipeline.domain.sample import Sample
from datapipeline.domain.vector import Vector
from datapipeline.transforms.vector_utils import clone, is_missing

from .common import VectorPostprocessBase, replace_vector, select_vector


class VectorReplaceTransform(VectorPostprocessBase):
    """Fill missing entries with a constant value."""

    def __init__(
        self,
        *,
        value: Any,
        payload: Literal["features", "targets", "both"] = "features",
        only: list[str] | None = None,
        exclude: list[str] | None = None,
        target: Any | None = None,
    ) -> None:
        super().__init__(payload=payload, only=only, exclude=exclude)
        self.value = value
        self._target = target

    def __call__(self, stream: Iterator[Sample]) -> Iterator[Sample]:
        return self.apply(stream)

    def apply(self, stream: Iterator[Sample]) -> Iterator[Sample]:
        for sample in stream:
            for kind in self._payload_kinds():
                ids = self._ids_for(kind)
                if ids:
                    sample = self._apply_to_payload(sample, kind, ids)
            yield sample

    def _should_replace(self, value: Any) -> bool:
        if self._target is None:
            return is_missing(value)
        return value == self._target

    def _apply_to_payload(
        self,
        sample: Sample,
        payload: Literal["features", "targets"],
        ids: list[str],
    ) -> Sample:
        vector = select_vector(sample, payload)
        if vector is None:
            return sample
        data = clone(vector.values)
        updated = False
        for feature in ids:
            current = data.get(feature)
            if not self._should_replace(current):
                continue
            data[feature] = self.value
            updated = True
        if not updated:
            return sample
        return replace_vector(sample, payload, Vector(values=data))
```
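A small sketch of the two modes `_should_replace` distinguishes: with `target=None` (the default) the transform fills missing entries with `value`; with a `target` it rewrites exact matches. The sentinel value here is invented.

```python
from datapipeline.transforms.vector.replace import VectorReplaceTransform

# Mode 1 (default): fill missing feature entries with 0.0.
fill_missing = VectorReplaceTransform(value=0.0, payload="features")

# Mode 2: rewrite an exact sentinel, e.g. -999 readings become None.
strip_sentinel = VectorReplaceTransform(value=None, payload="features", target=-999)

print(list(strip_sentinel(iter([]))))  # empty stream in, empty stream out
```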
datapipeline/utils/load.py
CHANGED

```diff
@@ -1,19 +1,40 @@
+import importlib
 import importlib.metadata as md
 from functools import lru_cache
-import yaml
 from pathlib import Path
 
+import yaml
+
+# Local fallback map so newly added entrypoints remain usable in editable installs
+# before package metadata is refreshed.
+_EP_OVERRIDES = {}
+
 
 @lru_cache
 def load_ep(group: str, name: str):
+    target = _EP_OVERRIDES.get((group, name))
+    if target:
+        module, attr = target.split(":")
+        return getattr(importlib.import_module(module), attr)
+
     eps = md.entry_points().select(group=group, name=name)
     if not eps:
         available = ", ".join(
-            sorted(ep.name for ep in md.entry_points().select(group=group)))
+            sorted(ep.name for ep in md.entry_points().select(group=group))
+        )
         raise ValueError(
             f"No entry point '{name}' in '{group}'. Available: {available or '(none)'}")
     if len(eps) > 1:
-
+        def describe(ep):
+            value = getattr(ep, "value", None)
+            if value:
+                return value
+            module = getattr(ep, "module", None)
+            attr = getattr(ep, "attr", None)
+            if module and attr:
+                return f"{module}:{attr}"
+            return repr(ep)
+        mods = ", ".join(describe(ep) for ep in eps)
         raise ValueError(
             f"Ambiguous entry point '{name}' in '{group}': {mods}")
     # EntryPoints in newer Python versions are mapping-like; avoid integer indexing
```
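A sketch of what the new `_EP_OVERRIDES` fallback enables. The group/name pair below is hypothetical, and the shipped mapping is empty, so the mutation is purely illustrative of the "module:attr" resolution path added above.

```python
from datapipeline.utils.load import _EP_OVERRIDES, load_ep

# Hypothetical override: resolve ("some.group", "json") without touching
# installed package metadata. The "module:attr" string is split, the module
# imported, and the attribute returned directly.
_EP_OVERRIDES[("some.group", "json")] = "json:loads"
decode = load_ep("some.group", "json")
print(decode('{"ok": true}'))  # {'ok': True}
```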
datapipeline/utils/rich_compat.py
ADDED

```python
from __future__ import annotations


def suppress_file_proxy_shutdown_errors() -> None:
    """Patch rich.file_proxy.FileProxy.flush to ignore shutdown ImportErrors.

    Rich leaves behind FileProxy instances that may flush while the interpreter
    is tearing down, which triggers `ImportError: sys.meta_path is None`.
    Swallow those benign errors so CLI commands exit cleanly.
    """
    try:
        from rich.file_proxy import FileProxy
    except Exception:
        return

    if getattr(FileProxy, "_datapipeline_safe_flush", False):
        return

    original_flush = FileProxy.flush

    def _safe_flush(self) -> None:  # type: ignore[override]
        try:
            original_flush(self)
        except ImportError as exc:
            if "sys.meta_path is None" in str(exc):
                return
            raise
        except RuntimeError as exc:
            message = str(exc)
            if "shutting down" in message.lower():
                return
            raise

    FileProxy.flush = _safe_flush  # type: ignore[assignment]
    setattr(FileProxy, "_datapipeline_safe_flush", True)


__all__ = ["suppress_file_proxy_shutdown_errors"]
```
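Presumably this is called once during CLI startup, before any Rich console is created; the guard attribute makes repeat calls harmless, and the patch is skipped entirely when Rich is not installed.

```python
from datapipeline.utils.rich_compat import suppress_file_proxy_shutdown_errors

suppress_file_proxy_shutdown_errors()
suppress_file_proxy_shutdown_errors()  # no-op: _datapipeline_safe_flush is already set
```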