jerry-thomas 0.3.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (164)
  1. datapipeline/analysis/vector/collector.py +120 -17
  2. datapipeline/analysis/vector/matrix.py +33 -8
  3. datapipeline/analysis/vector/report.py +162 -32
  4. datapipeline/build/tasks/__init__.py +11 -0
  5. datapipeline/build/tasks/config.py +74 -0
  6. datapipeline/build/tasks/metadata.py +170 -0
  7. datapipeline/build/tasks/scaler.py +73 -0
  8. datapipeline/build/tasks/schema.py +60 -0
  9. datapipeline/build/tasks/utils.py +169 -0
  10. datapipeline/cli/app.py +304 -127
  11. datapipeline/cli/commands/build.py +240 -16
  12. datapipeline/cli/commands/contract.py +367 -0
  13. datapipeline/cli/commands/domain.py +8 -3
  14. datapipeline/cli/commands/inspect.py +401 -149
  15. datapipeline/cli/commands/list_.py +30 -7
  16. datapipeline/cli/commands/plugin.py +5 -1
  17. datapipeline/cli/commands/run.py +227 -241
  18. datapipeline/cli/commands/run_config.py +101 -0
  19. datapipeline/cli/commands/serve_pipeline.py +156 -0
  20. datapipeline/cli/commands/source.py +44 -8
  21. datapipeline/cli/visuals/__init__.py +4 -2
  22. datapipeline/cli/visuals/common.py +239 -0
  23. datapipeline/cli/visuals/labels.py +15 -15
  24. datapipeline/cli/visuals/runner.py +66 -0
  25. datapipeline/cli/visuals/sections.py +20 -0
  26. datapipeline/cli/visuals/sources.py +132 -119
  27. datapipeline/cli/visuals/sources_basic.py +260 -0
  28. datapipeline/cli/visuals/sources_off.py +76 -0
  29. datapipeline/cli/visuals/sources_rich.py +414 -0
  30. datapipeline/config/catalog.py +37 -3
  31. datapipeline/config/context.py +214 -0
  32. datapipeline/config/dataset/loader.py +21 -4
  33. datapipeline/config/dataset/normalize.py +4 -4
  34. datapipeline/config/metadata.py +43 -0
  35. datapipeline/config/postprocess.py +2 -2
  36. datapipeline/config/project.py +3 -2
  37. datapipeline/config/resolution.py +129 -0
  38. datapipeline/config/tasks.py +309 -0
  39. datapipeline/config/workspace.py +155 -0
  40. datapipeline/domain/__init__.py +12 -0
  41. datapipeline/domain/record.py +11 -0
  42. datapipeline/domain/sample.py +54 -0
  43. datapipeline/integrations/ml/adapter.py +34 -20
  44. datapipeline/integrations/ml/pandas_support.py +0 -2
  45. datapipeline/integrations/ml/rows.py +1 -6
  46. datapipeline/integrations/ml/torch_support.py +1 -3
  47. datapipeline/io/factory.py +112 -0
  48. datapipeline/io/output.py +132 -0
  49. datapipeline/io/protocols.py +21 -0
  50. datapipeline/io/serializers.py +219 -0
  51. datapipeline/io/sinks/__init__.py +23 -0
  52. datapipeline/io/sinks/base.py +2 -0
  53. datapipeline/io/sinks/files.py +79 -0
  54. datapipeline/io/sinks/rich.py +57 -0
  55. datapipeline/io/sinks/stdout.py +18 -0
  56. datapipeline/io/writers/__init__.py +14 -0
  57. datapipeline/io/writers/base.py +28 -0
  58. datapipeline/io/writers/csv_writer.py +25 -0
  59. datapipeline/io/writers/jsonl.py +52 -0
  60. datapipeline/io/writers/pickle_writer.py +30 -0
  61. datapipeline/pipeline/artifacts.py +58 -0
  62. datapipeline/pipeline/context.py +66 -7
  63. datapipeline/pipeline/observability.py +65 -0
  64. datapipeline/pipeline/pipelines.py +65 -13
  65. datapipeline/pipeline/split.py +11 -10
  66. datapipeline/pipeline/stages.py +127 -16
  67. datapipeline/pipeline/utils/keygen.py +20 -7
  68. datapipeline/pipeline/utils/memory_sort.py +22 -10
  69. datapipeline/pipeline/utils/transform_utils.py +22 -0
  70. datapipeline/runtime.py +5 -2
  71. datapipeline/services/artifacts.py +12 -6
  72. datapipeline/services/bootstrap/config.py +25 -0
  73. datapipeline/services/bootstrap/core.py +52 -37
  74. datapipeline/services/constants.py +6 -5
  75. datapipeline/services/factories.py +123 -1
  76. datapipeline/services/project_paths.py +43 -16
  77. datapipeline/services/runs.py +208 -0
  78. datapipeline/services/scaffold/domain.py +3 -2
  79. datapipeline/services/scaffold/filter.py +3 -2
  80. datapipeline/services/scaffold/mappers.py +9 -6
  81. datapipeline/services/scaffold/plugin.py +54 -10
  82. datapipeline/services/scaffold/source.py +93 -56
  83. datapipeline/sources/{composed_loader.py → data_loader.py} +9 -9
  84. datapipeline/sources/decoders.py +83 -18
  85. datapipeline/sources/factory.py +26 -16
  86. datapipeline/sources/models/__init__.py +2 -2
  87. datapipeline/sources/models/generator.py +0 -7
  88. datapipeline/sources/models/loader.py +3 -3
  89. datapipeline/sources/models/parsing_error.py +24 -0
  90. datapipeline/sources/models/source.py +6 -6
  91. datapipeline/sources/synthetic/time/loader.py +14 -2
  92. datapipeline/sources/transports.py +74 -37
  93. datapipeline/templates/plugin_skeleton/README.md +76 -30
  94. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +31 -0
  95. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +30 -0
  96. datapipeline/templates/plugin_skeleton/example/dataset.yaml +18 -0
  97. datapipeline/templates/plugin_skeleton/example/postprocess.yaml +29 -0
  98. datapipeline/templates/plugin_skeleton/{config/datasets/default → example}/project.yaml +11 -8
  99. datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +12 -0
  100. datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +3 -0
  101. datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +9 -0
  102. datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +2 -0
  103. datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +4 -0
  104. datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +28 -0
  105. datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +4 -0
  106. datapipeline/templates/plugin_skeleton/jerry.yaml +34 -0
  107. datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +31 -0
  108. datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +30 -0
  109. datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +18 -0
  110. datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +29 -0
  111. datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +22 -0
  112. datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +12 -0
  113. datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +3 -0
  114. datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +9 -0
  115. datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +2 -0
  116. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +4 -0
  117. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +28 -0
  118. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +4 -0
  119. datapipeline/templates/stubs/dto.py.j2 +2 -0
  120. datapipeline/templates/stubs/mapper.py.j2 +5 -4
  121. datapipeline/templates/stubs/parser.py.j2 +2 -0
  122. datapipeline/templates/stubs/record.py.j2 +2 -0
  123. datapipeline/templates/stubs/source.yaml.j2 +2 -3
  124. datapipeline/transforms/debug/lint.py +26 -41
  125. datapipeline/transforms/feature/scaler.py +89 -13
  126. datapipeline/transforms/record/floor_time.py +4 -4
  127. datapipeline/transforms/sequence.py +2 -35
  128. datapipeline/transforms/stream/dedupe.py +24 -0
  129. datapipeline/transforms/stream/ensure_ticks.py +7 -6
  130. datapipeline/transforms/vector/__init__.py +5 -0
  131. datapipeline/transforms/vector/common.py +98 -0
  132. datapipeline/transforms/vector/drop/__init__.py +4 -0
  133. datapipeline/transforms/vector/drop/horizontal.py +79 -0
  134. datapipeline/transforms/vector/drop/orchestrator.py +59 -0
  135. datapipeline/transforms/vector/drop/vertical.py +182 -0
  136. datapipeline/transforms/vector/ensure_schema.py +184 -0
  137. datapipeline/transforms/vector/fill.py +87 -0
  138. datapipeline/transforms/vector/replace.py +62 -0
  139. datapipeline/utils/load.py +24 -3
  140. datapipeline/utils/rich_compat.py +38 -0
  141. datapipeline/utils/window.py +76 -0
  142. jerry_thomas-1.0.1.dist-info/METADATA +825 -0
  143. jerry_thomas-1.0.1.dist-info/RECORD +199 -0
  144. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/entry_points.txt +9 -8
  145. datapipeline/build/tasks.py +0 -186
  146. datapipeline/cli/commands/link.py +0 -128
  147. datapipeline/cli/commands/writers.py +0 -138
  148. datapipeline/config/build.py +0 -64
  149. datapipeline/config/run.py +0 -116
  150. datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.synthetic.yaml +0 -24
  151. datapipeline/templates/plugin_skeleton/config/contracts/time_linear.synthetic.yaml +0 -23
  152. datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +0 -9
  153. datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +0 -14
  154. datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +0 -13
  155. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +0 -10
  156. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +0 -10
  157. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +0 -10
  158. datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml +0 -11
  159. datapipeline/transforms/vector.py +0 -210
  160. jerry_thomas-0.3.0.dist-info/METADATA +0 -502
  161. jerry_thomas-0.3.0.dist-info/RECORD +0 -139
  162. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/WHEEL +0 -0
  163. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/licenses/LICENSE +0 -0
  164. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,59 @@
+ from __future__ import annotations
+
+ from collections.abc import Iterator
+ from typing import Literal
+
+ from datapipeline.domain.sample import Sample
+
+ from .horizontal import VectorDropHorizontalTransform
+ from .vertical import VectorDropVerticalTransform
+
+ Axis = Literal["horizontal", "vertical"]
+
+
+ class VectorDropTransform:
+     """Drop vectors or features based on coverage thresholds.
+
+     Thin orchestrator that delegates to horizontal or vertical strategies based
+     on the configured axis.
+     """
+
+     def __init__(
+         self,
+         *,
+         axis: Axis = "horizontal",
+         threshold: float,
+         payload: Literal["features", "targets", "both"] = "features",
+         only: list[str] | None = None,
+         exclude: list[str] | None = None,
+     ) -> None:
+         if axis not in {"horizontal", "vertical"}:
+             raise ValueError("axis must be 'horizontal' or 'vertical'")
+         if axis == "vertical" and payload == "both":
+             raise ValueError("axis='vertical' does not support payload='both'")
+         if axis == "horizontal":
+             self._impl: object = VectorDropHorizontalTransform(
+                 threshold=threshold,
+                 payload=payload,
+                 only=only,
+                 exclude=exclude,
+             )
+         else:
+             # Vertical drop is partition/feature-oriented and does not support
+             # payload='both'. Payload is validated above.
+             self._impl = VectorDropVerticalTransform(
+                 payload=payload if payload != "both" else "features",
+                 threshold=threshold,
+             )
+
+     def bind_context(self, context) -> None:
+         binder = getattr(self._impl, "bind_context", None)
+         if binder is not None:
+             binder(context)
+
+     def __call__(self, stream: Iterator[Sample]) -> Iterator[Sample]:
+         return self.apply(stream)
+
+     def apply(self, stream: Iterator[Sample]) -> Iterator[Sample]:
+         return getattr(self._impl, "apply")(stream)
+
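Note: a minimal configuration sketch for the new orchestrator, assuming samples arrive from an upstream pipeline stage; the import path follows the orchestrator module added above, and the threshold value is illustrative.

from datapipeline.transforms.vector.drop.orchestrator import VectorDropTransform

# Keyword-only parameters as defined in __init__ above; 0.8 is an illustrative threshold.
drop = VectorDropTransform(axis="horizontal", threshold=0.8, payload="features")
# drop.bind_context(context)     # optional; forwarded to the selected strategy
# cleaned = drop.apply(samples)  # equivalent to drop(samples)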
@@ -0,0 +1,182 @@
+ from __future__ import annotations
+
+ from collections.abc import Iterator
+ from typing import Literal
+
+ from datapipeline.config.metadata import (
+     FEATURE_VECTORS_COUNT_KEY,
+     TARGET_VECTORS_COUNT_KEY,
+     VectorMetadata,
+ )
+ from datapipeline.domain.sample import Sample
+ from datapipeline.domain.vector import Vector
+ from datapipeline.services.artifacts import (
+     ArtifactNotRegisteredError,
+     VECTOR_METADATA_SPEC,
+ )
+
+ from ..common import (
+     VectorContextMixin,
+     replace_vector,
+     select_vector,
+     try_get_current_context,
+ )
+
+
+ class VectorDropVerticalTransform(VectorContextMixin):
+     required_artifacts = {VECTOR_METADATA_SPEC.key}
+     """Drop partitions/features when metadata coverage falls below configured thresholds.
+
+     Requires the optional `metadata.json` artifact generated by the
+     `metadata` build task. The transform evaluates coverage using the recorded
+     `present_count`/`null_count` metrics and prunes the schema cache once so
+     downstream coverage checks stop expecting bad partitions.
+     """
+
+     def __init__(
+         self,
+         *,
+         payload: Literal["features", "targets"] = "features",
+         threshold: float,
+     ) -> None:
+         super().__init__(payload=payload)
+         if not 0.0 <= threshold <= 1.0:
+             raise ValueError("threshold must be between 0 and 1.")
+         self._threshold = threshold
+         self._drop_ids: set[str] | None = None
+         self._schema_pruned = False
+
+     def __call__(self, stream: Iterator[Sample]) -> Iterator[Sample]:
+         return self.apply(stream)
+
+     def apply(self, stream: Iterator[Sample]) -> Iterator[Sample]:
+         drop_ids = self._resolve_drop_ids()
+         if not drop_ids:
+             yield from stream
+             return
+         self._maybe_prune_schema(drop_ids)
+         for sample in stream:
+             if not self._schema_pruned:
+                 self._maybe_prune_schema(drop_ids)
+             vector = select_vector(sample, self._payload)
+             if vector is None or not vector.values:
+                 yield sample
+                 continue
+             retained: dict[str, object] = {}
+             changed = False
+             for fid, value in vector.values.items():
+                 if fid in drop_ids:
+                     changed = True
+                     continue
+                 retained[fid] = value
+             if not changed:
+                 yield sample
+             else:
+                 yield replace_vector(sample, self._payload, Vector(values=retained))
+
+     def _resolve_drop_ids(self) -> set[str]:
+         if self._drop_ids is not None:
+             return self._drop_ids
+         context = self._context or try_get_current_context()
+         if not context:
+             raise RuntimeError("VectorDropVerticalTransform requires an active pipeline context.")
+         try:
+             raw = context.require_artifact(VECTOR_METADATA_SPEC)
+         except ArtifactNotRegisteredError as exc:
+             raise RuntimeError(
+                 "Vector metadata artifact missing. Enable the `metadata` build task "
+                 "and rerun `jerry build --project <project.yaml>`."
+             ) from exc
+         meta = VectorMetadata.model_validate(raw)
+         section_key = "targets" if self._payload == "targets" else "features"
+         counts_key = (
+             TARGET_VECTORS_COUNT_KEY
+             if self._payload == "targets"
+             else FEATURE_VECTORS_COUNT_KEY
+         )
+
+         entries = getattr(meta, section_key) or []
+         window_size = self._window_size(getattr(meta, "window", None))
+         total = window_size if window_size is not None else meta.counts.get(counts_key)
+         if not isinstance(total, (int, float)) or total <= 0:
+             if self._payload == "targets":
+                 raise RuntimeError(
+                     "Vector metadata artifact missing counts for targets; "
+                     "ensure your dataset defines target streams and rebuild."
+                 )
+             raise RuntimeError(
+                 "Vector metadata artifact missing counts for features; "
+                 "rerun `jerry build --project <project.yaml>` to refresh metadata."
+             )
+         expected_buckets = float(total)
+         drop_ids: set[str] = set()
+         for entry in entries:
+             if not isinstance(entry, dict):
+                 continue
+             fid = entry.get("id")
+             if not isinstance(fid, str):
+                 continue
+             coverage = self._coverage_for_entry(entry, expected_buckets)
+             if coverage < self._threshold:
+                 drop_ids.add(fid)
+         self._drop_ids = drop_ids
+         return drop_ids
+
+     @staticmethod
+     def _window_size(window) -> float | None:
+         if window is None:
+             return None
+         if isinstance(window, dict):
+             return window.get("size")
+         return getattr(window, "size", None)
+
+     @staticmethod
+     def _coverage_for_entry(entry: dict, expected_buckets: float) -> float:
+         if expected_buckets <= 0:
+             return 0.0
+         present = float(entry.get("present_count") or 0.0)
+         nulls = float(entry.get("null_count") or 0.0)
+         cadence_doc = entry.get("cadence")
+         cadence = cadence_doc.get("target") if isinstance(cadence_doc, dict) else None
+         observed_elements = entry.get("observed_elements")
+
+         if isinstance(observed_elements, (int, float)) and cadence:
+             # Base expected elements on buckets where this feature actually appeared
+             # to avoid over-crediting sparse sequences.
+             expected_elements = float(max(present, 0.0)) * float(cadence)
+             if expected_elements > 0:
+                 return max(
+                     0.0,
+                     min(1.0, float(observed_elements) / expected_elements),
+                 )
+
+         coverage = (present - nulls) / expected_buckets
+         return max(0.0, min(1.0, coverage))
+
+     def _maybe_prune_schema(self, drop_ids: set[str]) -> None:
+         if self._schema_pruned or not drop_ids:
+             return
+         context = self._context or try_get_current_context()
+         if not context:
+             self._schema_pruned = True
+             return
+         cache = getattr(context, "_cache", None)
+         if cache is None:
+             self._schema_pruned = True
+             return
+         schema_key = f"schema:{self._payload}"
+         if schema_key not in cache:
+             return
+         entries = cache.get(schema_key)
+         if not entries:
+             self._schema_pruned = True
+             return
+         kept = [entry for entry in entries if entry.get("id") not in drop_ids]
+         cache[schema_key] = kept
+         ids_key = f"expected_ids:{self._payload}"
+         cache[ids_key] = [
+             entry.get("id")
+             for entry in kept
+             if isinstance(entry.get("id"), str)
+         ]
+         self._schema_pruned = True
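Note: a worked example of the coverage rule in _coverage_for_entry, using made-up metadata counts; with threshold=0.85 the scalar feature below would be dropped and the list feature kept.

# Scalar feature: 90 of 100 expected buckets present, 6 of them null.
present, nulls, expected_buckets = 90.0, 6.0, 100.0
scalar_coverage = max(0.0, min(1.0, (present - nulls) / expected_buckets))  # 0.84
# List feature with cadence target 4: 324 observed elements over 90 present buckets.
observed_elements, cadence = 324.0, 4.0
list_coverage = min(1.0, observed_elements / (present * cadence))  # 324 / 360 = 0.90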
@@ -0,0 +1,184 @@
+ from __future__ import annotations
+
+ from collections import OrderedDict
+ from collections.abc import Iterator
+ from typing import Any, Literal
+
+ from datapipeline.domain.sample import Sample
+ from datapipeline.domain.vector import Vector
+ from datapipeline.transforms.vector_utils import clone
+
+ from .common import VectorContextMixin, replace_vector, select_vector
+
+ MissingPolicy = Literal["error", "drop", "fill"]
+ ExtraPolicy = Literal["error", "drop", "keep"]
+
+
+ class VectorEnsureSchemaTransform(VectorContextMixin):
+     """Ensure vectors conform to the vector schema (`schema.json`) artifact.
+
+     Options allow filling or dropping rows with missing identifiers and
+     pruning/raising on unexpected identifiers.
+     """
+
+     def __init__(
+         self,
+         *,
+         payload: Literal["features", "targets"] = "features",
+         on_missing: MissingPolicy = "error",
+         fill_value: Any = None,
+         on_extra: ExtraPolicy = "error",
+     ) -> None:
+         super().__init__(payload=payload)
+         if on_missing not in {"error", "drop", "fill"}:
+             raise ValueError("on_missing must be one of: 'error', 'drop', 'fill'")
+         if on_extra not in {"error", "drop", "keep"}:
+             raise ValueError("on_extra must be one of: 'error', 'drop', 'keep'")
+         self._on_missing = on_missing
+         self._fill_value = fill_value
+         self._on_extra = on_extra
+         self._baseline: list[str] | None = None
+         self._schema_entries: list[dict[str, Any]] | None = None
+         self._schema_meta: dict[str, dict[str, Any]] = {}
+
+     def __call__(self, stream: Iterator[Sample]) -> Iterator[Sample]:
+         return self.apply(stream)
+
+     def apply(self, stream: Iterator[Sample]) -> Iterator[Sample]:
+         baseline = self._schema_ids()
+         baseline_set = set(baseline)
+
+         for sample in stream:
+             vector = select_vector(sample, self._payload)
+             if vector is None:
+                 yield sample
+                 continue
+
+             values = vector.values
+             working = None
+
+             missing = [fid for fid in baseline if fid not in values]
+             if missing:
+                 decision = self._on_missing
+                 if decision == "error":
+                     raise ValueError(
+                         f"Vector missing required identifiers {missing} "
+                         f"for payload '{self._payload}'."
+                     )
+                 if decision == "drop":
+                     continue
+                 working = clone(values)
+                 for fid in missing:
+                     working[fid] = self._fill_value
+
+             extras = [fid for fid in values if fid not in baseline_set]
+             if extras:
+                 decision = self._on_extra
+                 if decision == "error":
+                     raise ValueError(
+                         f"Vector contains unexpected identifiers {extras} "
+                         f"for payload '{self._payload}'."
+                     )
+                 if decision == "drop":
+                     working = working or clone(values)
+                     for fid in extras:
+                         working.pop(fid, None)
+
+             current_values = working or values
+
+             # Optionally enforce per-id cadence from schema metadata
+             current_values = self._enforce_cadence(current_values)
+
+             ordered = OrderedDict()
+             for fid in baseline:
+                 ordered[fid] = current_values.get(fid)
+             if self._on_extra == "keep":
+                 for fid, value in current_values.items():
+                     if fid not in baseline_set:
+                         ordered[fid] = value
+             current_values = ordered
+
+             if current_values is not values:
+                 updated_vector = Vector(values=dict(current_values))
+                 sample = replace_vector(sample, self._payload, updated_vector)
+
+             yield sample
+
+     def _schema_ids(self) -> list[str]:
+         if self._baseline is None:
+             entries = self._load_schema_entries()
+             ordered = [entry["id"] for entry in entries if isinstance(entry.get("id"), str)]
+             if not ordered:
+                 raise RuntimeError(
+                     "Vector schema artifact is empty or unavailable; run `jerry build` "
+                     "to materialize `schema.json` via the `vector_schema` task."
+                 )
+             self._baseline = ordered
+             self._schema_meta = {
+                 entry["id"]: entry for entry in entries if isinstance(entry.get("id"), str)
+             }
+         return list(self._baseline)
+
+     def _load_schema_entries(self) -> list[dict[str, Any]]:
+         if self._schema_entries is None:
+             context = getattr(self, "_context", None)
+             if not context:
+                 entries = []
+             else:
+                 entries = context.load_schema(payload=self._payload)
+             self._schema_entries = entries or []
+         return self._schema_entries
+
+     def _enforce_cadence(self, values: dict[str, Any]) -> dict[str, Any]:
+         if not values or not self._schema_meta:
+             return values
+         adjusted = None
+         for fid, value in values.items():
+             meta = self._schema_meta.get(fid)
+             if not meta or meta.get("kind") != "list":
+                 continue
+             expected = self._expected_lengths(meta)
+             if not expected:
+                 continue
+             current_len = len(value) if isinstance(value, list) else (0 if value is None else 1)
+             if current_len in expected:
+                 continue
+             decision = self._on_missing
+             if decision == "error":
+                 raise ValueError(
+                     f"List feature '{fid}' length {current_len} violates schema cadence {sorted(expected)}"
+                 )
+             if decision == "drop":
+                 return {}
+             # fill: pad or truncate to the closest expected length
+             target_len = expected[0]
+             adjusted = adjusted or clone(values)
+             if isinstance(value, list):
+                 seq = value[:target_len]
+             elif value is None:
+                 seq = []
+             else:
+                 seq = [value]
+             if len(seq) < target_len:
+                 seq = seq + [self._fill_value] * (target_len - len(seq))
+             adjusted[fid] = seq
+         return adjusted or values
+
+     def _expected_lengths(self, meta: dict[str, Any]) -> list[int]:
+         cadence = meta.get("cadence")
+         if isinstance(cadence, dict):
+             target = cadence.get("target")
+             if isinstance(target, (int, float)) and target > 0:
+                 return [int(target)]
+         modes = meta.get("list_length", {}).get("modes")
+         if isinstance(modes, (list, tuple)) and modes:
+             ints = [int(m) for m in modes if isinstance(m, (int, float))]
+             if ints:
+                 return sorted(ints)
+         expected = meta.get("expected_length")
+         if isinstance(expected, (int, float)):
+             return [int(expected)]
+         max_len = meta.get("list_length", {}).get("max")
+         if isinstance(max_len, (int, float)) and max_len > 0:
+             return [int(max_len)]
+         return []
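Note: a short sketch of the policy combinations accepted by the schema guard; it assumes a bound pipeline context that can serve load_schema(), otherwise _schema_ids raises once iteration starts.

from datapipeline.transforms.vector.ensure_schema import VectorEnsureSchemaTransform

guard = VectorEnsureSchemaTransform(
    payload="features",
    on_missing="fill",   # absent schema ids are inserted with fill_value
    fill_value=0.0,
    on_extra="drop",     # ids not present in schema.json are pruned
)
# guard.apply(samples) yields vectors reordered to the schema id order.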
@@ -0,0 +1,87 @@
+ from collections import deque
+ from collections.abc import Iterator
+ from statistics import mean, median
+ from typing import Any, Literal
+
+ from datapipeline.domain.sample import Sample
+ from datapipeline.domain.vector import Vector
+ from datapipeline.transforms.vector_utils import clone, is_missing
+
+ from .common import VectorPostprocessBase, replace_vector, select_vector
+
+
+ class VectorFillTransform(VectorPostprocessBase):
+     """Fill missing entries using running statistics from prior buckets."""
+
+     def __init__(
+         self,
+         *,
+         statistic: Literal["mean", "median"] = "median",
+         window: int | None = None,
+         min_samples: int = 1,
+         payload: Literal["features", "targets", "both"] = "features",
+         only: list[str] | None = None,
+         exclude: list[str] | None = None,
+     ) -> None:
+         super().__init__(payload=payload, only=only, exclude=exclude)
+         if window is not None and window <= 0:
+             raise ValueError("window must be positive when provided")
+         if min_samples <= 0:
+             raise ValueError("min_samples must be positive")
+         self.statistic = statistic
+         self.window = window
+         self.min_samples = min_samples
+         self.history: dict[str, deque[float]] = {}
+
+     def _compute(self, feature_id: str) -> float | None:
+         values = self.history.get(feature_id)
+         if not values or len(values) < self.min_samples:
+             return None
+         if self.statistic == "mean":
+             return float(mean(values))
+         return float(median(values))
+
+     def _push(self, feature_id: str, value: Any) -> None:
+         if is_missing(value):
+             return
+         try:
+             num = float(value)
+         except (TypeError, ValueError):
+             return
+         bucket = self.history.setdefault(str(feature_id), deque(maxlen=self.window))
+         bucket.append(num)
+
+     def __call__(self, stream: Iterator[Sample]) -> Iterator[Sample]:
+         return self.apply(stream)
+
+     def apply(self, stream: Iterator[Sample]) -> Iterator[Sample]:
+         for sample in stream:
+             for kind in self._payload_kinds():
+                 ids = self._ids_for(kind)
+                 if ids:
+                     sample = self._apply_to_payload(sample, kind, ids)
+             yield sample
+
+     def _apply_to_payload(
+         self,
+         sample: Sample,
+         payload: Literal["features", "targets"],
+         ids: list[str],
+     ) -> Sample:
+         vector = select_vector(sample, payload)
+         if vector is None:
+             return sample
+         data = clone(vector.values)
+         updated = False
+         for feature in ids:
+             if feature in data and not is_missing(data[feature]):
+                 continue
+             fill = self._compute(feature)
+             if fill is not None:
+                 data[feature] = fill
+                 updated = True
+         for fid, value in data.items():
+             self._push(fid, value)
+         if not updated:
+             return sample
+         return replace_vector(sample, payload, Vector(values=data))
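Note: the running-statistic fill keeps a per-feature deque capped at window and only fills once min_samples observations exist; a standalone sketch of that arithmetic with illustrative numbers:

from collections import deque
from statistics import mean, median

history = deque([1.0, 3.0, 10.0], maxlen=3)  # what _push() accumulates for one feature id
print(median(history))  # 3.0   -> fill value when statistic="median"
print(mean(history))    # ~4.67 -> fill value when statistic="mean"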
@@ -0,0 +1,62 @@
+ from collections.abc import Iterator
+ from typing import Any, Literal
+
+ from datapipeline.domain.sample import Sample
+ from datapipeline.domain.vector import Vector
+ from datapipeline.transforms.vector_utils import clone, is_missing
+
+ from .common import VectorPostprocessBase, replace_vector, select_vector
+
+
+ class VectorReplaceTransform(VectorPostprocessBase):
+     """Fill missing entries with a constant value."""
+
+     def __init__(
+         self,
+         *,
+         value: Any,
+         payload: Literal["features", "targets", "both"] = "features",
+         only: list[str] | None = None,
+         exclude: list[str] | None = None,
+         target: Any | None = None,
+     ) -> None:
+         super().__init__(payload=payload, only=only, exclude=exclude)
+         self.value = value
+         self._target = target
+
+     def __call__(self, stream: Iterator[Sample]) -> Iterator[Sample]:
+         return self.apply(stream)
+
+     def apply(self, stream: Iterator[Sample]) -> Iterator[Sample]:
+         for sample in stream:
+             for kind in self._payload_kinds():
+                 ids = self._ids_for(kind)
+                 if ids:
+                     sample = self._apply_to_payload(sample, kind, ids)
+             yield sample
+
+     def _should_replace(self, value: Any) -> bool:
+         if self._target is None:
+             return is_missing(value)
+         return value == self._target
+
+     def _apply_to_payload(
+         self,
+         sample: Sample,
+         payload: Literal["features", "targets"],
+         ids: list[str],
+     ) -> Sample:
+         vector = select_vector(sample, payload)
+         if vector is None:
+             return sample
+         data = clone(vector.values)
+         updated = False
+         for feature in ids:
+             current = data.get(feature)
+             if not self._should_replace(current):
+                 continue
+             data[feature] = self.value
+             updated = True
+         if not updated:
+             return sample
+         return replace_vector(sample, payload, Vector(values=data))
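Note: a sketch of the two replacement modes; with no target the is_missing() predicate decides, otherwise only exact matches of the sentinel are replaced. The sentinel value is illustrative.

from datapipeline.transforms.vector.replace import VectorReplaceTransform

fill_missing = VectorReplaceTransform(value=0.0)                 # replace entries is_missing() flags
swap_sentinel = VectorReplaceTransform(value=0.0, target=-999)   # replace literal -999 values only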
@@ -1,19 +1,40 @@
+ import importlib
  import importlib.metadata as md
  from functools import lru_cache
- import yaml
  from pathlib import Path

+ import yaml
+
+ # Local fallback map so newly added entrypoints remain usable in editable installs
+ # before package metadata is refreshed.
+ _EP_OVERRIDES = {}
+

  @lru_cache
  def load_ep(group: str, name: str):
+     target = _EP_OVERRIDES.get((group, name))
+     if target:
+         module, attr = target.split(":")
+         return getattr(importlib.import_module(module), attr)
+
      eps = md.entry_points().select(group=group, name=name)
      if not eps:
          available = ", ".join(
-             sorted(ep.name for ep in md.entry_points().select(group=group)))
+             sorted(ep.name for ep in md.entry_points().select(group=group))
+         )
          raise ValueError(
              f"No entry point '{name}' in '{group}'. Available: {available or '(none)'}")
      if len(eps) > 1:
-         mods = ", ".join(f"{ep.module}:{ep.attr}" for ep in eps)
+         def describe(ep):
+             value = getattr(ep, "value", None)
+             if value:
+                 return value
+             module = getattr(ep, "module", None)
+             attr = getattr(ep, "attr", None)
+             if module and attr:
+                 return f"{module}:{attr}"
+             return repr(ep)
+         mods = ", ".join(describe(ep) for ep in eps)
          raise ValueError(
              f"Ambiguous entry point '{name}' in '{group}': {mods}")
      # EntryPoints in newer Python versions are mapping-like; avoid integer indexing
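Note: the fallback map expects "module:attr" strings keyed by (group, name) pairs, mirroring entry-point value syntax; the entry below is hypothetical and not one shipped by the package.

_EP_OVERRIDES = {
    ("datapipeline.transforms", "vector_fill"):
        "datapipeline.transforms.vector.fill:VectorFillTransform",  # hypothetical mapping
}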
@@ -0,0 +1,38 @@
+ from __future__ import annotations
+
+
+ def suppress_file_proxy_shutdown_errors() -> None:
+     """Patch rich.file_proxy.FileProxy.flush to ignore shutdown ImportErrors.
+
+     Rich leaves behind FileProxy instances that may flush while the interpreter
+     is tearing down, which triggers `ImportError: sys.meta_path is None`.
+     Swallow those benign errors so CLI commands exit cleanly.
+     """
+     try:
+         from rich.file_proxy import FileProxy
+     except Exception:
+         return
+
+     if getattr(FileProxy, "_datapipeline_safe_flush", False):
+         return
+
+     original_flush = FileProxy.flush
+
+     def _safe_flush(self) -> None:  # type: ignore[override]
+         try:
+             original_flush(self)
+         except ImportError as exc:
+             if "sys.meta_path is None" in str(exc):
+                 return
+             raise
+         except RuntimeError as exc:
+             message = str(exc)
+             if "shutting down" in message.lower():
+                 return
+             raise
+
+     FileProxy.flush = _safe_flush  # type: ignore[assignment]
+     setattr(FileProxy, "_datapipeline_safe_flush", True)
+
+
+ __all__ = ["suppress_file_proxy_shutdown_errors"]
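Note: a usage sketch; the call site below is assumed, but the helper is safe to invoke unconditionally because the _datapipeline_safe_flush flag makes the patch idempotent.

from datapipeline.utils.rich_compat import suppress_file_proxy_shutdown_errors

def main() -> None:
    suppress_file_proxy_shutdown_errors()  # patch before rich consoles start writing
    # ... build and dispatch CLI commands that render with rich ...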