jerry-thomas 0.3.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (164)
  1. datapipeline/analysis/vector/collector.py +120 -17
  2. datapipeline/analysis/vector/matrix.py +33 -8
  3. datapipeline/analysis/vector/report.py +162 -32
  4. datapipeline/build/tasks/__init__.py +11 -0
  5. datapipeline/build/tasks/config.py +74 -0
  6. datapipeline/build/tasks/metadata.py +170 -0
  7. datapipeline/build/tasks/scaler.py +73 -0
  8. datapipeline/build/tasks/schema.py +60 -0
  9. datapipeline/build/tasks/utils.py +169 -0
  10. datapipeline/cli/app.py +304 -127
  11. datapipeline/cli/commands/build.py +240 -16
  12. datapipeline/cli/commands/contract.py +367 -0
  13. datapipeline/cli/commands/domain.py +8 -3
  14. datapipeline/cli/commands/inspect.py +401 -149
  15. datapipeline/cli/commands/list_.py +30 -7
  16. datapipeline/cli/commands/plugin.py +1 -1
  17. datapipeline/cli/commands/run.py +227 -241
  18. datapipeline/cli/commands/run_config.py +101 -0
  19. datapipeline/cli/commands/serve_pipeline.py +156 -0
  20. datapipeline/cli/commands/source.py +44 -8
  21. datapipeline/cli/visuals/__init__.py +4 -2
  22. datapipeline/cli/visuals/common.py +239 -0
  23. datapipeline/cli/visuals/labels.py +15 -15
  24. datapipeline/cli/visuals/runner.py +66 -0
  25. datapipeline/cli/visuals/sections.py +20 -0
  26. datapipeline/cli/visuals/sources.py +132 -119
  27. datapipeline/cli/visuals/sources_basic.py +260 -0
  28. datapipeline/cli/visuals/sources_off.py +76 -0
  29. datapipeline/cli/visuals/sources_rich.py +414 -0
  30. datapipeline/config/catalog.py +37 -3
  31. datapipeline/config/context.py +214 -0
  32. datapipeline/config/dataset/loader.py +21 -4
  33. datapipeline/config/dataset/normalize.py +4 -4
  34. datapipeline/config/metadata.py +43 -0
  35. datapipeline/config/postprocess.py +2 -2
  36. datapipeline/config/project.py +3 -2
  37. datapipeline/config/resolution.py +129 -0
  38. datapipeline/config/tasks.py +309 -0
  39. datapipeline/config/workspace.py +155 -0
  40. datapipeline/domain/__init__.py +12 -0
  41. datapipeline/domain/record.py +11 -0
  42. datapipeline/domain/sample.py +54 -0
  43. datapipeline/integrations/ml/adapter.py +34 -20
  44. datapipeline/integrations/ml/pandas_support.py +0 -2
  45. datapipeline/integrations/ml/rows.py +1 -6
  46. datapipeline/integrations/ml/torch_support.py +1 -3
  47. datapipeline/io/factory.py +112 -0
  48. datapipeline/io/output.py +132 -0
  49. datapipeline/io/protocols.py +21 -0
  50. datapipeline/io/serializers.py +219 -0
  51. datapipeline/io/sinks/__init__.py +23 -0
  52. datapipeline/io/sinks/base.py +2 -0
  53. datapipeline/io/sinks/files.py +79 -0
  54. datapipeline/io/sinks/rich.py +57 -0
  55. datapipeline/io/sinks/stdout.py +18 -0
  56. datapipeline/io/writers/__init__.py +14 -0
  57. datapipeline/io/writers/base.py +28 -0
  58. datapipeline/io/writers/csv_writer.py +25 -0
  59. datapipeline/io/writers/jsonl.py +52 -0
  60. datapipeline/io/writers/pickle_writer.py +30 -0
  61. datapipeline/pipeline/artifacts.py +58 -0
  62. datapipeline/pipeline/context.py +66 -7
  63. datapipeline/pipeline/observability.py +65 -0
  64. datapipeline/pipeline/pipelines.py +65 -13
  65. datapipeline/pipeline/split.py +11 -10
  66. datapipeline/pipeline/stages.py +127 -16
  67. datapipeline/pipeline/utils/keygen.py +20 -7
  68. datapipeline/pipeline/utils/memory_sort.py +22 -10
  69. datapipeline/pipeline/utils/transform_utils.py +22 -0
  70. datapipeline/runtime.py +5 -2
  71. datapipeline/services/artifacts.py +12 -6
  72. datapipeline/services/bootstrap/config.py +25 -0
  73. datapipeline/services/bootstrap/core.py +52 -37
  74. datapipeline/services/constants.py +6 -5
  75. datapipeline/services/factories.py +123 -1
  76. datapipeline/services/project_paths.py +43 -16
  77. datapipeline/services/runs.py +208 -0
  78. datapipeline/services/scaffold/domain.py +3 -2
  79. datapipeline/services/scaffold/filter.py +3 -2
  80. datapipeline/services/scaffold/mappers.py +9 -6
  81. datapipeline/services/scaffold/plugin.py +3 -3
  82. datapipeline/services/scaffold/source.py +93 -56
  83. datapipeline/sources/{composed_loader.py → data_loader.py} +9 -9
  84. datapipeline/sources/decoders.py +83 -18
  85. datapipeline/sources/factory.py +26 -16
  86. datapipeline/sources/models/__init__.py +2 -2
  87. datapipeline/sources/models/generator.py +0 -7
  88. datapipeline/sources/models/loader.py +3 -3
  89. datapipeline/sources/models/parsing_error.py +24 -0
  90. datapipeline/sources/models/source.py +6 -6
  91. datapipeline/sources/synthetic/time/loader.py +14 -2
  92. datapipeline/sources/transports.py +74 -37
  93. datapipeline/templates/plugin_skeleton/README.md +74 -30
  94. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +31 -0
  95. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +30 -0
  96. datapipeline/templates/plugin_skeleton/example/dataset.yaml +18 -0
  97. datapipeline/templates/plugin_skeleton/example/postprocess.yaml +29 -0
  98. datapipeline/templates/plugin_skeleton/{config/datasets/default → example}/project.yaml +11 -8
  99. datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +12 -0
  100. datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +3 -0
  101. datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +9 -0
  102. datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +2 -0
  103. datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +4 -0
  104. datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +28 -0
  105. datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +4 -0
  106. datapipeline/templates/plugin_skeleton/jerry.yaml +28 -0
  107. datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +31 -0
  108. datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +30 -0
  109. datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +18 -0
  110. datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +29 -0
  111. datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +22 -0
  112. datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +12 -0
  113. datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +3 -0
  114. datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +9 -0
  115. datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +2 -0
  116. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +4 -0
  117. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +28 -0
  118. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +4 -0
  119. datapipeline/templates/stubs/dto.py.j2 +2 -0
  120. datapipeline/templates/stubs/mapper.py.j2 +5 -4
  121. datapipeline/templates/stubs/parser.py.j2 +2 -0
  122. datapipeline/templates/stubs/record.py.j2 +2 -0
  123. datapipeline/templates/stubs/source.yaml.j2 +2 -3
  124. datapipeline/transforms/debug/lint.py +26 -41
  125. datapipeline/transforms/feature/scaler.py +89 -13
  126. datapipeline/transforms/record/floor_time.py +4 -4
  127. datapipeline/transforms/sequence.py +2 -35
  128. datapipeline/transforms/stream/dedupe.py +24 -0
  129. datapipeline/transforms/stream/ensure_ticks.py +7 -6
  130. datapipeline/transforms/vector/__init__.py +5 -0
  131. datapipeline/transforms/vector/common.py +98 -0
  132. datapipeline/transforms/vector/drop/__init__.py +4 -0
  133. datapipeline/transforms/vector/drop/horizontal.py +79 -0
  134. datapipeline/transforms/vector/drop/orchestrator.py +59 -0
  135. datapipeline/transforms/vector/drop/vertical.py +182 -0
  136. datapipeline/transforms/vector/ensure_schema.py +184 -0
  137. datapipeline/transforms/vector/fill.py +87 -0
  138. datapipeline/transforms/vector/replace.py +62 -0
  139. datapipeline/utils/load.py +24 -3
  140. datapipeline/utils/rich_compat.py +38 -0
  141. datapipeline/utils/window.py +76 -0
  142. jerry_thomas-1.0.0.dist-info/METADATA +825 -0
  143. jerry_thomas-1.0.0.dist-info/RECORD +199 -0
  144. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/entry_points.txt +9 -8
  145. datapipeline/build/tasks.py +0 -186
  146. datapipeline/cli/commands/link.py +0 -128
  147. datapipeline/cli/commands/writers.py +0 -138
  148. datapipeline/config/build.py +0 -64
  149. datapipeline/config/run.py +0 -116
  150. datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.synthetic.yaml +0 -24
  151. datapipeline/templates/plugin_skeleton/config/contracts/time_linear.synthetic.yaml +0 -23
  152. datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +0 -9
  153. datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +0 -14
  154. datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +0 -13
  155. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +0 -10
  156. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +0 -10
  157. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +0 -10
  158. datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml +0 -11
  159. datapipeline/transforms/vector.py +0 -210
  160. jerry_thomas-0.3.0.dist-info/METADATA +0 -502
  161. jerry_thomas-0.3.0.dist-info/RECORD +0 -139
  162. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/WHEEL +0 -0
  163. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/licenses/LICENSE +0 -0
  164. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,7 @@
 from typing import Any
+
 from datapipeline.sources.models.parser import DataParser
+
 from .dto import {{DTO_CLASS}}
 
 
@@ -1,6 +1,8 @@
 from dataclasses import dataclass
+
 from datapipeline.domain.record import {{PARENT_CLASS}}
 
+
 @dataclass
 class {{CLASS_NAME}}({{PARENT_CLASS}}):
     """
@@ -1,6 +1,5 @@
-# Required identifier for this raw source. This value is referenced by
-# contracts under `source_id:`.
-source_id: "{{ source_id }}"
+# Required identifier for this raw source. Contracts reference it under `source:`.
+id: "{{ id }}" # format: provider.dataset
 
 # parser.entrypoint: registered parser name (not a file path)
 parser:
@@ -1,24 +1,21 @@
 import logging
-import math
+from datetime import timedelta
 from itertools import groupby
 from typing import Iterator
 
 from datapipeline.domain.feature import FeatureRecord
-from datapipeline.transforms.utils import is_missing
+from datapipeline.utils.time import parse_timecode
+
+
 logger = logging.getLogger(__name__)
 
 
 class StreamLint:
-    """Validate a feature stream and emit actionable hints.
+    """Validate structural properties of a feature stream (order, cadence, duplicates).
 
     Parameters
     - mode: 'warn' (default) logs warnings; 'error' raises on first violation
     - tick: optional cadence (e.g. '1h', '10m'); when set, check regularity
-    - check_missing: flag missing values (value is None/NaN)
-    - check_regular: flag gaps vs. expected tick
-    - check_duplicates: flag multiple records with same timestamp
-    - check_order: flag out-of-order timestamps within a feature stream
-    - check_finite: flag non-finite values (NaN/Inf)
     """
 
     def __init__(
@@ -26,19 +23,20 @@ class StreamLint:
         *,
         mode: str = "warn",
         tick: str | None = None,
-        check_missing: bool = True,
-        check_regular: bool = True,
-        check_duplicates: bool = True,
-        check_order: bool = True,
-        check_finite: bool = True,
     ) -> None:
         self.mode = mode
         self.tick = tick
-        self.check_missing = check_missing
-        self.check_regular = check_regular
-        self.check_duplicates = check_duplicates
-        self.check_order = check_order
-        self.check_finite = check_finite
+
+        # Pre-compute tick step in seconds when provided to avoid repeated parsing.
+        self._tick_seconds: int | None = None
+        if self.tick:
+            try:
+                self._tick_seconds = int(parse_timecode(self.tick).total_seconds())
+            except Exception:
+                logger.warning(
+                    "StreamLint: invalid tick %r (cadence checks disabled)", self.tick
+                )
+                self._tick_seconds = None
 
     def __call__(self, stream: Iterator[FeatureRecord]) -> Iterator[FeatureRecord]:
         return self.apply(stream)
@@ -55,46 +53,33 @@ class StreamLint:
             seen_times: set = set()
             for fr in records:
                 t = getattr(fr.record, "time", None)
-                v = getattr(fr.record, "value", None)
 
                 # Check ordering
-                if self.check_order and last_time is not None and t is not None and t < last_time:
+                if last_time is not None and t is not None and t < last_time:
                     self._violation(
                         f"out-of-order timestamp for feature '{fid}': {t} < {last_time}. "
                         f"Consider sorting upstream or fixing loader."
                     )
 
                 # Check duplicates
-                if self.check_duplicates and t in seen_times:
+                if t in seen_times:
                     self._violation(
                         f"duplicate timestamp for feature '{fid}' at {t}. "
                         f"Consider a granularity transform (first/last/mean/median)."
                     )
                 seen_times.add(t)
 
-                # Check missing / non-finite
-                if self.check_missing and is_missing(v):
-                    self._violation(
-                        f"missing value for feature '{fid}' at {t}. "
-                        f"Consider using a fill transform."
-                    )
-                if self.check_finite and isinstance(v, float) and not math.isfinite(v):
-                    self._violation(
-                        f"non-finite value for feature '{fid}' at {t}: {v}. "
-                        f"Consider filtering or scaling."
-                    )
-
-                # Regularity check requires explicit tick; done at stream layer via ensure_ticks normally
-                if self.check_regular and self.tick and last_time is not None and t is not None:
-                    # Lazy import to avoid cycle
-                    from datapipeline.utils.time import parse_timecode
-
-                    step = parse_timecode(self.tick)
-                    expect = last_time + step
+                # Regularity check requires explicit tick; done at stream layer via ensure_cadence normally
+                if (
+                    self._tick_seconds
+                    and last_time is not None
+                    and t is not None
+                ):
+                    expect = last_time + timedelta(seconds=self._tick_seconds)
                     if t != expect and t > expect:
                         self._violation(
                             f"skipped tick(s) for feature '{fid}': expected {expect}, got {t}. "
-                            f"Consider using ensure_ticks."
+                            f"Consider using ensure_cadence."
                         )
 
                 last_time = t
@@ -3,13 +3,14 @@ from collections import defaultdict
 from itertools import groupby
 from numbers import Real
 from pathlib import Path
-from typing import Any, Iterator
+from typing import Any, Callable, Iterator, Literal, Mapping
 
 from datapipeline.domain.feature import FeatureRecord
-from datapipeline.domain.record import TemporalRecord
+from datapipeline.domain.sample import Sample
 from datapipeline.transforms.feature.model import FeatureTransform
 from datapipeline.transforms.utils import clone_record_with_value
 from datapipeline.utils.pickle_model import PicklePersistanceMixin
+from datapipeline.pipeline.observability import TransformEvent
 
 
 def _iter_numeric_values(value: Any) -> Iterator[float]:
@@ -38,12 +39,14 @@ class StandardScaler(PicklePersistanceMixin):
         self.with_std = with_std
         self.epsilon = epsilon
         self.statistics: dict[str, dict[str, float | int]] = {}
+        self.missing_counts: dict[str, int] = {}
 
-    def fit(self, vectors: Iterator[tuple[Any, Any]]) -> int:
+    def fit(self, vectors: Iterator[Sample]) -> int:
         trackers: dict[str, StandardScaler._RunningStats] = defaultdict(
             self._RunningStats)
         total = 0
-        for _, vector in vectors:
+        for sample in vectors:
+            vector = sample.features
             values = getattr(vector, "values", {})
             for fid, raw in values.items():
                 for value in _iter_numeric_values(raw):
@@ -61,11 +64,19 @@ class StandardScaler(PicklePersistanceMixin):
             }
         return total
 
-    def transform(self, stream: Iterator[FeatureRecord]) -> Iterator[FeatureRecord]:
+    def transform(
+        self,
+        stream: Iterator[FeatureRecord],
+        *,
+        on_none: Literal["error", "skip"] = "skip",
+        observer: Callable[[TransformEvent], None] | None = None,
+    ) -> Iterator[FeatureRecord]:
         if not self.statistics:
             raise RuntimeError(
                 "StandardScaler must be fitted before calling transform().")
 
+        self.missing_counts = {}
+
         grouped = groupby(stream, key=lambda fr: fr.id)
         for feature_id, records in grouped:
             stats = self.statistics.get(feature_id)
@@ -75,7 +86,29 @@ class StandardScaler(PicklePersistanceMixin):
             mean = float(stats.get("mean", 0.0))
             std = float(stats.get("std", 1.0))
             for fr in records:
-                raw = self._extract_value(fr.record)
+                value = fr.record.value
+                if not isinstance(value, Real):
+                    if value is None and on_none == "skip":
+                        self.missing_counts[feature_id] = (
+                            self.missing_counts.get(feature_id, 0) + 1
+                        )
+                        if observer is not None:
+                            observer(
+                                TransformEvent(
+                                    type="scaler_none",
+                                    payload={
+                                        "feature_id": feature_id,
+                                        "record": fr.record,
+                                        "count": self.missing_counts[feature_id],
+                                    },
+                                )
+                            )
+                        yield fr
+                        continue
+                    raise TypeError(
+                        f"Record value must be numeric, got {value!r}")
+
+                raw = float(value)
                 normalized = raw
                 if self.with_mean:
                     normalized -= mean
@@ -86,12 +119,36 @@ class StandardScaler(PicklePersistanceMixin):
                     id=fr.id,
                 )
 
-    @staticmethod
-    def _extract_value(record: TemporalRecord) -> float:
-        value = record.value
-        if isinstance(value, Real):
-            return float(value)
-        raise TypeError(f"Record value must be numeric, got {value!r}")
+    def inverse_transform(
+        self,
+        stream: Iterator[FeatureRecord],
+    ) -> Iterator[FeatureRecord]:
+        if not self.statistics:
+            raise RuntimeError(
+                "StandardScaler must be fitted before calling inverse_transform().")
+
+        grouped = groupby(stream, key=lambda fr: fr.id)
+        for feature_id, records in grouped:
+            stats = self.statistics.get(feature_id)
+            if not stats:
+                raise KeyError(
+                    f"Missing scaler statistics for feature '{feature_id}'.")
+            mean = float(stats.get("mean", 0.0))
+            std = float(stats.get("std", 1.0))
+            for fr in records:
+                value = fr.record.value
+                if not isinstance(value, Real):
+                    raise TypeError(
+                        f"Record value must be numeric, got {value!r}")
+                restored = float(value)
+                if self.with_std:
+                    restored *= std
+                if self.with_mean:
+                    restored += mean
+                yield FeatureRecord(
+                    record=clone_record_with_value(fr.record, restored),
+                    id=fr.id,
+                )
 
     class _RunningStats:
         __slots__ = ("count", "mean", "m2")
@@ -132,6 +189,8 @@ class StandardScalerTransform(FeatureTransform):
         with_mean: bool = True,
         with_std: bool = True,
         epsilon: float = 1e-12,
+        on_none: Literal["error", "skip"] = "skip",
+        observer: Callable[[TransformEvent], None] | None = None,
     ) -> None:
         base: StandardScaler
         if scaler is not None:
@@ -152,6 +211,23 @@ class StandardScalerTransform(FeatureTransform):
                 epsilon=epsilon,
             )
             self._scaler.statistics = dict(base.statistics)
+        self._on_none = on_none
+        self._observer = observer
+
+    @property
+    def missing_counts(self) -> dict[str, int]:
+        return dict(self._scaler.missing_counts)
+
+    def set_observer(self, observer: Callable[[TransformEvent], None] | None) -> None:
+        self._observer = observer
 
     def apply(self, stream: Iterator[FeatureRecord]) -> Iterator[FeatureRecord]:
-        yield from self._scaler.transform(stream)
+        yield from self._scaler.transform(
+            stream,
+            on_none=self._on_none,
+            observer=self._observer,
+        )
+
+    def inverse(self, stream: Iterator[FeatureRecord]) -> Iterator[FeatureRecord]:
+        """Undo scaling using the fitted statistics."""
+        yield from self._scaler.inverse_transform(stream)
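Note on the hunks above: the `_RunningStats` slots (`count`, `mean`, `m2`) point at Welford-style running statistics, and `transform`/`inverse_transform` form a standard-score round trip over the fitted mean/std. Below is a minimal, self-contained sketch of that arithmetic in plain Python; it deliberately avoids the package's classes, and the epsilon guard and population-variance convention are assumptions for illustration, not the package's exact implementation.

# Illustrative sketch only: plain-Python analogue of the fit / transform /
# inverse_transform round trip shown above (names here are hypothetical).
class RunningStats:
    def __init__(self) -> None:
        self.count, self.mean, self.m2 = 0, 0.0, 0.0

    def update(self, x: float) -> None:
        # Welford's online update: running mean and sum of squared deviations.
        self.count += 1
        delta = x - self.mean
        self.mean += delta / self.count
        self.m2 += delta * (x - self.mean)

    def std(self) -> float:
        return (self.m2 / self.count) ** 0.5 if self.count else 1.0


values = [1.0, 2.0, 3.0, 4.0]
stats = RunningStats()
for x in values:
    stats.update(x)

epsilon = 1e-12  # assumed guard against a near-zero std
scaled = [(x - stats.mean) / max(stats.std(), epsilon) for x in values]
restored = [z * max(stats.std(), epsilon) + stats.mean for z in scaled]
assert all(abs(a - b) < 1e-9 for a, b in zip(restored, values))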
@@ -3,15 +3,15 @@ from __future__ import annotations
 from typing import Iterator
 
 from datapipeline.domain.record import TemporalRecord
-from datapipeline.config.dataset.normalize import floor_time_to_resolution
+from datapipeline.config.dataset.normalize import floor_time_to_bucket
 
 
-def floor_time(stream: Iterator[TemporalRecord], resolution: str) -> Iterator[TemporalRecord]:
-    """Floor record timestamps to the given resolution (e.g., '1h', '10min').
+def floor_time(stream: Iterator[TemporalRecord], cadence: str) -> Iterator[TemporalRecord]:
+    """Floor record timestamps to the given cadence bucket (e.g., '1h', '10min').
 
     Useful before granularity aggregation to downsample within bins by making
     all intra-bin records share the same timestamp.
     """
     for record in stream:
-        record.time = floor_time_to_resolution(record.time, resolution)
+        record.time = floor_time_to_bucket(record.time, cadence)
         yield record
@@ -1,14 +1,10 @@
 from __future__ import annotations
 
 from collections import deque
-import logging
 from itertools import groupby
 from typing import Iterator
 
 from datapipeline.domain.feature import FeatureRecord, FeatureRecordSequence
-from datapipeline.utils.time import parse_timecode
-
-logger = logging.getLogger(__name__)
 
 
 class WindowTransformer:
@@ -16,25 +12,15 @@ class WindowTransformer:
         self,
         size: int,
         stride: int = 1,
-        *,
-        tick: str | None = None,
     ) -> None:
         """Sliding windows over time-ordered feature streams.
 
         Parameters
         - size: window length in steps (int).
        - stride: step between windows (int number of steps).
-        - tick: duration string denoting the expected cadence of the stream.
-          Supports 's', 'm', 'h', 'd'. When provided, enforce completeness: only emit windows if
-          consecutive records are exactly one tick apart; gaps reset the
-          window. Examples: "1h", "10m". Optional.
         """
 
         self.size = int(size)
-        self._tick_seconds: int | None = (
-            int(parse_timecode(tick).total_seconds()) if tick else None
-        )
-
         self.stride = int(stride)
 
         if self.size <= 0 or self.stride <= 0:
@@ -52,33 +38,14 @@
 
         grouped = groupby(stream, key=lambda fr: fr.id)
 
-        for id, records in grouped:
+        for fid, records in grouped:
             window = deque(maxlen=self.size)
             step = 0
-            last_time = None
             for fr in records:
-                # Enforce completeness when configured and tick is known
-                if self._tick_seconds is not None:
-                    t = getattr(fr.record, "time", None)
-                    if t is not None and last_time is not None:
-                        delta = int((t - last_time).total_seconds())
-                        if delta != self._tick_seconds:
-                            logger.debug(
-                                "sequence gap: feature_id=%s expected=%ss delta=%ss last=%s now=%s",
-                                id,
-                                self._tick_seconds,
-                                delta,
-                                last_time,
-                                t,
-                            )
-                            window.clear()
-                            step = 0
-                    last_time = t
-
                 window.append(fr)
                 if len(window) == self.size and step % self.stride == 0:
                     yield FeatureRecordSequence(
                         records=[r.record for r in window],
-                        id=id,
+                        id=fid,
                     )
                 step += 1
@@ -0,0 +1,24 @@
+from __future__ import annotations
+
+from collections.abc import Iterator
+
+from datapipeline.domain.feature import FeatureRecord
+
+
+class FeatureDeduplicateTransform:
+    """Drop consecutive identical feature records (id + timestamp + payload)."""
+
+    def __init__(self, **_: object) -> None:
+        # Accept arbitrary config mapping for consistency with other transforms.
+        pass
+
+    def __call__(self, stream: Iterator[FeatureRecord]) -> Iterator[FeatureRecord]:
+        return self.apply(stream)
+
+    def apply(self, stream: Iterator[FeatureRecord]) -> Iterator[FeatureRecord]:
+        last: FeatureRecord | None = None
+        for record in stream:
+            if last is not None and record == last:
+                continue
+            last = record
+            yield record
@@ -1,20 +1,21 @@
 from typing import Iterator
 
-from datapipeline.domain.record import TemporalRecord
+from dataclasses import replace
+
 from datapipeline.domain.feature import FeatureRecord
+from datapipeline.domain.record import TemporalRecord
 from datapipeline.utils.time import parse_timecode
-from dataclasses import replace
 
 
-def ensure_ticks(stream: Iterator[FeatureRecord], tick: str) -> Iterator[FeatureRecord]:
-    """Insert placeholder FeatureRecords so timestamps are exactly one tick apart per feature id.
+def ensure_cadence(stream: Iterator[FeatureRecord], cadence: str) -> Iterator[FeatureRecord]:
+    """Insert placeholder FeatureRecords so timestamps are exactly one cadence apart per feature id.
 
-    - tick: duration string (e.g., "10m", "1h", "30s").
+    - cadence: duration string (e.g., "10m", "1h", "30s").
     - Placeholders carry value=None and inherit the feature id; group bucketing
       is applied later at vector assembly from record.time.
     - Assumes input sorted by (feature_id, record.time).
     """
-    step = parse_timecode(tick)
+    step = parse_timecode(cadence)
     last: FeatureRecord | None = None
     for fr in stream:
         if (last is None) or (last.id != fr.id):
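The `ensure_cadence` docstring above describes gap filling at a fixed cadence with `value=None` placeholders. A minimal plain-`datetime` sketch of that behaviour follows; it is illustrative only and does not use `FeatureRecord` or `parse_timecode`.

# Illustrative sketch only: insert None placeholders wherever a tick is missing.
from datetime import datetime, timedelta


def fill_gaps(series, step):
    last_t = None
    for t, v in series:
        if last_t is not None:
            expected = last_t + step
            while expected < t:
                yield expected, None  # placeholder for a missing tick
                expected += step
        yield t, v
        last_t = t


series = [(datetime(2024, 1, 1, 0), 1.0), (datetime(2024, 1, 1, 3), 2.0)]
print(list(fill_gaps(series, timedelta(hours=1))))
# emits placeholders at 01:00 and 02:00 with value None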
@@ -0,0 +1,5 @@
+from .common import VectorContextMixin, replace_vector, select_vector
+from .drop import VectorDropTransform
+from .ensure_schema import VectorEnsureSchemaTransform
+from .fill import VectorFillTransform
+from .replace import VectorReplaceTransform
@@ -0,0 +1,98 @@
+from typing import Literal
+
+from datapipeline.domain.sample import Sample
+from datapipeline.domain.vector import Vector
+from datapipeline.pipeline.context import (
+    PipelineContext,
+    try_get_current_context,
+)
+
+
+def select_vector(sample: Sample, payload: Literal["features", "targets"]) -> Vector | None:
+    if payload == "targets":
+        return sample.targets
+    return sample.features
+
+
+def replace_vector(sample: Sample, payload: Literal["features", "targets"], vector: Vector) -> Sample:
+    if payload == "targets":
+        return sample.with_targets(vector)
+    return sample.with_features(vector)
+
+
+class VectorContextMixin:
+    def __init__(self, payload: Literal["features", "targets"] = "features") -> None:
+        if payload not in {"features", "targets"}:
+            raise ValueError("payload must be 'features' or 'targets'")
+        self._context: PipelineContext | None = None
+        self._payload = payload
+
+    def bind_context(self, context: PipelineContext) -> None:
+        self._context = context
+
+    def _expected_ids(self, payload: str | None = None) -> list[str]:
+        """Return expected feature/target ids for the given payload.
+
+        When `payload` is omitted, the instance default is used.
+        """
+        ctx = self._context or try_get_current_context()
+        if not ctx:
+            return []
+        kind = payload or self._payload
+        if kind not in {"features", "targets"}:
+            return []
+        schema = ctx.load_schema(payload=kind) or []
+        ids = [
+            entry.get("id")
+            for entry in schema
+            if isinstance(entry, dict) and isinstance(entry.get("id"), str)
+        ]
+        return ids or []
+
+
+class VectorPostprocessBase(VectorContextMixin):
+    """Shared envelope for vector postprocess transforms.
+
+    Provides a consistent contract for payload selection and id filtering:
+    - payload: features | targets | both
+    - only: optional allow-list of ids
+    - exclude: optional deny-list of ids
+    """
+
+    def __init__(
+        self,
+        *,
+        payload: Literal["features", "targets", "both"] = "features",
+        only: list[str] | None = None,
+        exclude: list[str] | None = None,
+    ) -> None:
+        if payload not in {"features", "targets", "both"}:
+            raise ValueError(
+                "payload must be 'features', 'targets', or 'both'")
+        base_payload = "features" if payload == "both" else payload
+        super().__init__(payload=base_payload)
+        self._payload_mode: Literal["features", "targets", "both"] = payload
+        self._only = {str(fid) for fid in (only or [])} or None
+        self._exclude = {str(fid) for fid in (exclude or [])} or None
+        self._baseline_cache: dict[str, list[str]] = {}
+
+    def _payload_kinds(self) -> list[Literal["features", "targets"]]:
+        mode = self._payload_mode
+        kinds: list[Literal["features", "targets"]] = []
+        if mode in {"features", "both"}:
+            kinds.append("features")
+        if mode in {"targets", "both"}:
+            kinds.append("targets")
+        return kinds
+
+    def _ids_for(self, payload: Literal["features", "targets"]) -> list[str]:
+        cached = self._baseline_cache.get(payload)
+        if cached is not None:
+            return list(cached)
+        ids = self._expected_ids(payload=payload)
+        if self._only is not None:
+            ids = [fid for fid in ids if fid in self._only]
+        if self._exclude is not None:
+            ids = [fid for fid in ids if fid not in self._exclude]
+        self._baseline_cache[payload] = list(ids)
+        return list(ids)
@@ -0,0 +1,4 @@
+from .horizontal import VectorDropHorizontalTransform
+from .vertical import VectorDropVerticalTransform
+from .orchestrator import VectorDropTransform
+
@@ -0,0 +1,79 @@
+from __future__ import annotations
+
+from collections.abc import Iterator
+from typing import Literal
+
+from datapipeline.domain.sample import Sample
+from datapipeline.domain.vector import Vector
+from datapipeline.transforms.vector_utils import is_missing
+
+from ..common import VectorPostprocessBase, select_vector
+
+
+def cell_coverage(value) -> float:
+    """Return coverage for a single feature value.
+
+    Scalars: 1.0 when not missing, 0.0 when missing.
+    Lists: fraction of non-missing elements (0.0 for empty lists).
+    """
+    if isinstance(value, list):
+        if not value:
+            return 0.0
+        total = len(value)
+        ok = sum(1 for item in value if not is_missing(item))
+        return ok / total if total > 0 else 0.0
+    if is_missing(value):
+        return 0.0
+    return 1.0
+
+
+class VectorDropHorizontalTransform(VectorPostprocessBase):
+    """Horizontal (row-wise) drop based on coverage thresholds."""
+
+    def __init__(
+        self,
+        *,
+        threshold: float,
+        payload: Literal["features", "targets", "both"] = "features",
+        only: list[str] | None = None,
+        exclude: list[str] | None = None,
+    ) -> None:
+        if not 0.0 <= threshold <= 1.0:
+            raise ValueError("threshold must be between 0 and 1.")
+        super().__init__(payload=payload, only=only, exclude=exclude)
+        self._threshold = threshold
+
+    def __call__(self, stream: Iterator[Sample]) -> Iterator[Sample]:
+        return self.apply(stream)
+
+    def apply(self, stream: Iterator[Sample]) -> Iterator[Sample]:
+        for sample in stream:
+            total = 0.0
+            count = 0
+            for kind in self._payload_kinds():
+                baseline = self._ids_for(kind)
+                if not baseline:
+                    continue
+                vector = select_vector(sample, kind)
+                if vector is None:
+                    continue
+                total += self._horizontal_coverage(vector, baseline) * len(baseline)
+                count += len(baseline)
+            if count == 0:
+                yield sample
+                continue
+            coverage = total / float(count)
+            if coverage < self._threshold:
+                continue
+            yield sample
+
+    @staticmethod
+    def _horizontal_coverage(vector: Vector, baseline: list[str]) -> float:
+        if not baseline:
+            return 1.0
+        total = 0.0
+        for fid in baseline:
+            value = vector.values.get(fid)
+            total += cell_coverage(value)
+        return total / float(len(baseline))
+
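For reference, the row-coverage rule in `VectorDropHorizontalTransform` reduces to a weighted mean of per-cell coverage over the expected ids. A small self-contained sketch of that arithmetic follows; the `is_missing` helper here is a simplified stand-in for `datapipeline.transforms.vector_utils.is_missing`, and the names are illustrative.

# Illustrative sketch only: coverage arithmetic behind the horizontal drop.
import math


def is_missing(v) -> bool:
    return v is None or (isinstance(v, float) and math.isnan(v))


def cell_coverage(value) -> float:
    if isinstance(value, list):
        return sum(1 for x in value if not is_missing(x)) / len(value) if value else 0.0
    return 0.0 if is_missing(value) else 1.0


row = {"a": 1.0, "b": None, "c": [1.0, None, 3.0, 4.0]}
baseline = ["a", "b", "c"]
coverage = sum(cell_coverage(row.get(fid)) for fid in baseline) / len(baseline)
print(round(coverage, 3))  # (1.0 + 0.0 + 0.75) / 3 ≈ 0.583; dropped when coverage < threshold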