jerry-thomas 1.0.3__py3-none-any.whl → 2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194)
  1. datapipeline/analysis/vector/collector.py +0 -1
  2. datapipeline/build/tasks/config.py +0 -2
  3. datapipeline/build/tasks/metadata.py +0 -2
  4. datapipeline/build/tasks/scaler.py +0 -2
  5. datapipeline/build/tasks/schema.py +0 -2
  6. datapipeline/build/tasks/utils.py +0 -2
  7. datapipeline/cli/app.py +201 -81
  8. datapipeline/cli/commands/contract.py +145 -283
  9. datapipeline/cli/commands/demo.py +13 -0
  10. datapipeline/cli/commands/domain.py +4 -4
  11. datapipeline/cli/commands/dto.py +11 -0
  12. datapipeline/cli/commands/filter.py +2 -2
  13. datapipeline/cli/commands/inspect.py +0 -68
  14. datapipeline/cli/commands/list_.py +30 -13
  15. datapipeline/cli/commands/loader.py +11 -0
  16. datapipeline/cli/commands/mapper.py +82 -0
  17. datapipeline/cli/commands/parser.py +45 -0
  18. datapipeline/cli/commands/run_config.py +1 -3
  19. datapipeline/cli/commands/serve_pipeline.py +5 -7
  20. datapipeline/cli/commands/source.py +106 -18
  21. datapipeline/cli/commands/stream.py +292 -0
  22. datapipeline/cli/visuals/common.py +0 -2
  23. datapipeline/cli/visuals/sections.py +0 -2
  24. datapipeline/cli/workspace_utils.py +0 -3
  25. datapipeline/config/context.py +0 -2
  26. datapipeline/config/dataset/feature.py +1 -0
  27. datapipeline/config/metadata.py +0 -2
  28. datapipeline/config/project.py +0 -2
  29. datapipeline/config/resolution.py +10 -2
  30. datapipeline/config/tasks.py +9 -9
  31. datapipeline/domain/feature.py +3 -0
  32. datapipeline/domain/record.py +7 -7
  33. datapipeline/domain/sample.py +0 -2
  34. datapipeline/domain/vector.py +6 -8
  35. datapipeline/integrations/ml/adapter.py +0 -2
  36. datapipeline/integrations/ml/pandas_support.py +0 -2
  37. datapipeline/integrations/ml/rows.py +0 -2
  38. datapipeline/integrations/ml/torch_support.py +0 -2
  39. datapipeline/io/output.py +0 -2
  40. datapipeline/io/serializers.py +26 -16
  41. datapipeline/mappers/synthetic/time.py +9 -2
  42. datapipeline/pipeline/artifacts.py +3 -5
  43. datapipeline/pipeline/observability.py +0 -2
  44. datapipeline/pipeline/pipelines.py +118 -34
  45. datapipeline/pipeline/stages.py +54 -18
  46. datapipeline/pipeline/utils/spool_cache.py +142 -0
  47. datapipeline/pipeline/utils/transform_utils.py +27 -2
  48. datapipeline/services/artifacts.py +1 -4
  49. datapipeline/services/constants.py +1 -0
  50. datapipeline/services/factories.py +4 -6
  51. datapipeline/services/paths.py +10 -1
  52. datapipeline/services/project_paths.py +0 -2
  53. datapipeline/services/runs.py +0 -2
  54. datapipeline/services/scaffold/contract_yaml.py +76 -0
  55. datapipeline/services/scaffold/demo.py +141 -0
  56. datapipeline/services/scaffold/discovery.py +115 -0
  57. datapipeline/services/scaffold/domain.py +21 -13
  58. datapipeline/services/scaffold/dto.py +31 -0
  59. datapipeline/services/scaffold/filter.py +2 -1
  60. datapipeline/services/scaffold/layout.py +96 -0
  61. datapipeline/services/scaffold/loader.py +61 -0
  62. datapipeline/services/scaffold/mapper.py +116 -0
  63. datapipeline/services/scaffold/parser.py +56 -0
  64. datapipeline/services/scaffold/plugin.py +14 -2
  65. datapipeline/services/scaffold/source_yaml.py +91 -0
  66. datapipeline/services/scaffold/stream_plan.py +129 -0
  67. datapipeline/services/scaffold/utils.py +187 -0
  68. datapipeline/sources/data_loader.py +0 -2
  69. datapipeline/sources/decoders.py +49 -8
  70. datapipeline/sources/factory.py +9 -6
  71. datapipeline/sources/foreach.py +18 -3
  72. datapipeline/sources/synthetic/time/parser.py +1 -1
  73. datapipeline/sources/transports.py +10 -4
  74. datapipeline/templates/demo_skeleton/demo/contracts/equity.ohlcv.yaml +33 -0
  75. datapipeline/templates/demo_skeleton/demo/contracts/time.ticks.hour_sin.yaml +22 -0
  76. datapipeline/templates/demo_skeleton/demo/contracts/time.ticks.linear.yaml +22 -0
  77. datapipeline/templates/demo_skeleton/demo/data/APPL.jsonl +19 -0
  78. datapipeline/templates/demo_skeleton/demo/data/MSFT.jsonl +19 -0
  79. datapipeline/templates/demo_skeleton/demo/dataset.yaml +19 -0
  80. datapipeline/templates/demo_skeleton/demo/postprocess.yaml +19 -0
  81. datapipeline/templates/demo_skeleton/demo/project.yaml +19 -0
  82. datapipeline/templates/demo_skeleton/demo/sources/sandbox.ohlcv.yaml +17 -0
  83. datapipeline/templates/{plugin_skeleton/example → demo_skeleton/demo}/sources/synthetic.ticks.yaml +1 -1
  84. datapipeline/templates/demo_skeleton/demo/tasks/metadata.yaml +2 -0
  85. datapipeline/templates/demo_skeleton/demo/tasks/scaler.yaml +3 -0
  86. datapipeline/templates/demo_skeleton/demo/tasks/schema.yaml +2 -0
  87. datapipeline/templates/demo_skeleton/demo/tasks/serve.test.yaml +4 -0
  88. datapipeline/templates/demo_skeleton/demo/tasks/serve.train.yaml +4 -0
  89. datapipeline/templates/demo_skeleton/demo/tasks/serve.val.yaml +4 -0
  90. datapipeline/templates/demo_skeleton/scripts/run_dataframe.py +20 -0
  91. datapipeline/templates/demo_skeleton/scripts/run_torch.py +23 -0
  92. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/__init__.py +0 -0
  93. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/domains/equity/__init__.py +0 -0
  94. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/domains/equity/model.py +18 -0
  95. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/dtos/__init__.py +0 -0
  96. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/dtos/sandbox_ohlcv_dto.py +14 -0
  97. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/mappers/__init__.py +0 -0
  98. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/mappers/map_sandbox_ohlcv_dto_to_equity.py +26 -0
  99. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/parsers/__init__.py +0 -0
  100. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/parsers/sandbox_ohlcv_dto_parser.py +46 -0
  101. datapipeline/templates/plugin_skeleton/README.md +57 -136
  102. datapipeline/templates/plugin_skeleton/jerry.yaml +12 -24
  103. datapipeline/templates/plugin_skeleton/reference/jerry.yaml +28 -0
  104. datapipeline/templates/plugin_skeleton/reference/reference/contracts/composed.reference.yaml +29 -0
  105. datapipeline/templates/plugin_skeleton/reference/reference/contracts/ingest.reference.yaml +31 -0
  106. datapipeline/templates/plugin_skeleton/reference/reference/contracts/overview.reference.yaml +34 -0
  107. datapipeline/templates/plugin_skeleton/reference/reference/dataset.yaml +29 -0
  108. datapipeline/templates/plugin_skeleton/reference/reference/postprocess.yaml +25 -0
  109. datapipeline/templates/plugin_skeleton/reference/reference/project.yaml +32 -0
  110. datapipeline/templates/plugin_skeleton/reference/reference/sources/foreach.http.reference.yaml +24 -0
  111. datapipeline/templates/plugin_skeleton/reference/reference/sources/foreach.reference.yaml +21 -0
  112. datapipeline/templates/plugin_skeleton/reference/reference/sources/fs.reference.yaml +16 -0
  113. datapipeline/templates/plugin_skeleton/reference/reference/sources/http.reference.yaml +17 -0
  114. datapipeline/templates/plugin_skeleton/reference/reference/sources/overview.reference.yaml +18 -0
  115. datapipeline/templates/plugin_skeleton/reference/reference/sources/synthetic.reference.yaml +15 -0
  116. datapipeline/templates/plugin_skeleton/reference/reference/tasks/metadata.reference.yaml +11 -0
  117. datapipeline/templates/plugin_skeleton/reference/reference/tasks/scaler.reference.yaml +10 -0
  118. datapipeline/templates/plugin_skeleton/reference/reference/tasks/schema.reference.yaml +10 -0
  119. datapipeline/templates/plugin_skeleton/reference/reference/tasks/serve.reference.yaml +28 -0
  120. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/domains/__init__.py +2 -0
  121. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/dtos/__init__.py +0 -0
  122. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/loaders/__init__.py +0 -0
  123. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/mappers/__init__.py +1 -0
  124. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/parsers/__init__.py +0 -0
  125. datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +12 -11
  126. datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +4 -13
  127. datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +9 -11
  128. datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +1 -2
  129. datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +1 -7
  130. datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +1 -1
  131. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +1 -1
  132. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +1 -25
  133. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +1 -1
  134. datapipeline/templates/plugin_skeleton/your-interim-data-builder/dataset.yaml +9 -0
  135. datapipeline/templates/plugin_skeleton/your-interim-data-builder/postprocess.yaml +1 -0
  136. datapipeline/templates/plugin_skeleton/your-interim-data-builder/project.yaml +15 -0
  137. datapipeline/templates/plugin_skeleton/your-interim-data-builder/tasks/serve.all.yaml +8 -0
  138. datapipeline/templates/stubs/contracts/composed.yaml.j2 +10 -0
  139. datapipeline/templates/stubs/contracts/ingest.yaml.j2 +25 -0
  140. datapipeline/templates/stubs/dto.py.j2 +2 -2
  141. datapipeline/templates/stubs/filter.py.j2 +1 -1
  142. datapipeline/templates/stubs/loaders/basic.py.j2 +11 -0
  143. datapipeline/templates/stubs/mappers/composed.py.j2 +13 -0
  144. datapipeline/templates/stubs/mappers/ingest.py.j2 +20 -0
  145. datapipeline/templates/stubs/parser.py.j2 +5 -1
  146. datapipeline/templates/stubs/record.py.j2 +1 -1
  147. datapipeline/templates/stubs/source.yaml.j2 +1 -1
  148. datapipeline/transforms/debug/identity.py +34 -16
  149. datapipeline/transforms/debug/lint.py +14 -11
  150. datapipeline/transforms/feature/scaler.py +5 -12
  151. datapipeline/transforms/filter.py +73 -17
  152. datapipeline/transforms/interfaces.py +58 -0
  153. datapipeline/transforms/record/floor_time.py +10 -7
  154. datapipeline/transforms/record/lag.py +8 -10
  155. datapipeline/transforms/sequence.py +2 -3
  156. datapipeline/transforms/stream/dedupe.py +5 -7
  157. datapipeline/transforms/stream/ensure_ticks.py +39 -24
  158. datapipeline/transforms/stream/fill.py +34 -25
  159. datapipeline/transforms/stream/filter.py +25 -0
  160. datapipeline/transforms/stream/floor_time.py +16 -0
  161. datapipeline/transforms/stream/granularity.py +52 -30
  162. datapipeline/transforms/stream/lag.py +17 -0
  163. datapipeline/transforms/stream/rolling.py +72 -0
  164. datapipeline/transforms/utils.py +42 -10
  165. datapipeline/transforms/vector/drop/horizontal.py +0 -3
  166. datapipeline/transforms/vector/drop/orchestrator.py +0 -3
  167. datapipeline/transforms/vector/drop/vertical.py +0 -2
  168. datapipeline/transforms/vector/ensure_schema.py +0 -2
  169. datapipeline/utils/paths.py +0 -2
  170. datapipeline/utils/placeholders.py +0 -2
  171. datapipeline/utils/rich_compat.py +0 -3
  172. datapipeline/utils/window.py +0 -2
  173. jerry_thomas-2.0.1.dist-info/METADATA +269 -0
  174. jerry_thomas-2.0.1.dist-info/RECORD +264 -0
  175. {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.1.dist-info}/WHEEL +1 -1
  176. {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.1.dist-info}/entry_points.txt +7 -3
  177. datapipeline/services/scaffold/mappers.py +0 -55
  178. datapipeline/services/scaffold/source.py +0 -191
  179. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +0 -31
  180. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +0 -30
  181. datapipeline/templates/plugin_skeleton/example/dataset.yaml +0 -18
  182. datapipeline/templates/plugin_skeleton/example/postprocess.yaml +0 -29
  183. datapipeline/templates/plugin_skeleton/example/project.yaml +0 -23
  184. datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +0 -3
  185. datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +0 -9
  186. datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +0 -2
  187. datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +0 -4
  188. datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +0 -28
  189. datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +0 -4
  190. datapipeline/templates/stubs/mapper.py.j2 +0 -22
  191. jerry_thomas-1.0.3.dist-info/METADATA +0 -827
  192. jerry_thomas-1.0.3.dist-info/RECORD +0 -198
  193. {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.1.dist-info}/licenses/LICENSE +0 -0
  194. {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.1.dist-info}/top_level.txt +0 -0
datapipeline/transforms/filter.py
@@ -1,5 +1,5 @@
  from collections.abc import Iterator
- from typing import Any
+ from typing import Any, Callable

  from datapipeline.filters import filters as _filters
  from datapipeline.plugins import FILTERS_EP
@@ -23,31 +23,24 @@ _ALIAS = {
  }


- def _normalize_op(op: str) -> str:
+ def normalize_operator(op: str) -> str:
      op = (op or "").strip()
      return _ALIAS.get(op, op)


- def filter(
-     stream: Iterator[Any],
-     *,
+ def resolve_filter(
      operator: str,
-     field: str,
+     *,
      comparand: Any,
- ) -> Iterator[Any]:
-     """Generic filter transform.
+ ) -> tuple[str, Any | None]:
+     """Resolve a normalized operator and callable filter function.

-     Parameters
-     - operator: one of eq, ne, lt, le, gt, ge, in, nin (case-sensitive), or a common alias
-     - field: record attribute/key to compare
-     - comparand: scalar for unary operators; list/tuple/set for membership (in/nin)
+     Returns (op, fn) where fn may be None if comparand is missing.
      """
-
      if is_missing(comparand):
-         # Skip filter when comparand is an unresolved placeholder.
-         return stream
+         return "", None

-     op = _normalize_op(operator)
+     op = normalize_operator(operator)
      fn = None
      try:
          fn = load_ep(FILTERS_EP, op)
@@ -57,4 +50,67 @@ def filter(
          raise ValueError(
              f"Unsupported filter operator: {operator!r} (normalized: {op!r})"
          )
-     return fn(stream, field, comparand)
+     return op, fn
+
+
+ def apply_filter(
+     stream: Iterator[Any],
+     *,
+     field_getter: Callable[[Any, str], Any],
+     operator: str,
+     field: str,
+     comparand: Any,
+ ) -> Iterator[Any]:
+     op, fn = resolve_filter(operator, comparand=comparand)
+     if fn is None:
+         return stream
+     if getattr(fn, "__module__", None) != _filters.__name__:
+         return fn(stream, field, comparand)
+
+     if op in {"in_", "nin"}:
+         bag = _filters._as_set(comparand)
+
+         def apply_in() -> Iterator[Any]:
+             for record in stream:
+                 left = field_getter(record, field)
+                 if (left in bag) == (op == "in_"):
+                     yield record
+
+         return apply_in()
+
+     cmp = getattr(_filters._op, op, None)
+     if cmp is None:
+         raise ValueError(
+             f"Unsupported filter operator: {operator!r} (normalized: {op!r})"
+         )
+
+     def apply_cmp() -> Iterator[Any]:
+         for record in stream:
+             left = field_getter(record, field)
+             if _filters.compare_values(left, comparand, cmp):
+                 yield record
+
+     return apply_cmp()
+
+
+ def filter(
+     stream: Iterator[Any],
+     *,
+     operator: str,
+     field: str,
+     comparand: Any,
+ ) -> Iterator[Any]:
+     """Generic filter transform.
+
+     Parameters
+     - operator: one of eq, ne, lt, le, gt, ge, in, nin (case-sensitive), or a common alias
+     - field: record attribute/key to compare
+     - comparand: scalar for unary operators; list/tuple/set for membership (in/nin)
+     """
+     return apply_filter(
+         stream,
+         field_getter=_filters.get_field,
+         operator=operator,
+         field=field,
+         comparand=comparand,
+     )
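Taken together, the hunks above split the old filter transform into resolve_filter (operator lookup) and apply_filter (streaming application with a pluggable field_getter), keeping filter as a thin wrapper. A minimal usage sketch of the wrapper, assuming the built-in comparison operators are registered under FILTERS_EP; the record shape and values are invented, only the signatures come from this diff:

# Hypothetical usage of the refactored wrapper; record fields are examples.
from types import SimpleNamespace

from datapipeline.transforms.filter import filter as filter_transform

records = [
    SimpleNamespace(symbol="APPL", close=101.0),
    SimpleNamespace(symbol="MSFT", close=99.5),
]

# Keep records whose `close` is greater than 100; common operator aliases are
# normalized by normalize_operator before the entry-point lookup.
kept = list(filter_transform(iter(records), operator="gt", field="close", comparand=100))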
datapipeline/transforms/interfaces.py
@@ -0,0 +1,58 @@
+ from abc import ABC, abstractmethod
+ from collections.abc import Iterator
+ from typing import Any, TypeVar
+
+ from datapipeline.domain.record import TemporalRecord
+
+
+ class StreamTransformBase(ABC):
+     """Base interface for stream transforms over TemporalRecord."""
+
+     def __call__(self, stream: Iterator[TemporalRecord]) -> Iterator[TemporalRecord]:
+         return self.apply(stream)
+
+     @abstractmethod
+     def apply(self, stream: Iterator[TemporalRecord]) -> Iterator[TemporalRecord]:
+         ...
+
+
+ class FieldStreamTransformBase(StreamTransformBase):
+     """Base for stream transforms that read/write a record field."""
+
+     def __init__(
+         self,
+         field: str,
+         to: str | None = None,
+         partition_by: str | list[str] | None = None,
+     ) -> None:
+         if not field:
+             raise ValueError("field is required")
+         self.field = field
+         self.to = to or field
+         self.partition_by = partition_by
+
+     def _ensure_output_field(
+         self,
+         record: TemporalRecord,
+         value: Any = None,
+     ) -> TemporalRecord:
+         if self.to is None:
+             return record
+         if hasattr(record, self.to):
+             return record
+         setattr(record, self.to, value)
+         return record
+
+
+ TRecord = TypeVar("TRecord", bound=TemporalRecord)
+
+
+ class RecordTransformBase(ABC):
+     """Base interface for record transforms over TemporalRecord."""
+
+     def __call__(self, stream: Iterator[TRecord]) -> Iterator[TRecord]:
+         return self.apply(stream)
+
+     @abstractmethod
+     def apply(self, stream: Iterator[TRecord]) -> Iterator[TRecord]:
+         ...
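The new interfaces module gives stream and record transforms a shared call/apply contract. As an illustration only (this class is not part of the package), a field-oriented transform could subclass FieldStreamTransformBase like this:

# Illustrative subclass; ScaleFieldTransform is hypothetical.
from collections.abc import Iterator

from datapipeline.domain.record import TemporalRecord
from datapipeline.transforms.interfaces import FieldStreamTransformBase


class ScaleFieldTransform(FieldStreamTransformBase):
    """Multiply `field` by a constant factor and write the result to `to`."""

    def __init__(self, *, field: str, factor: float, to: str | None = None) -> None:
        super().__init__(field=field, to=to)
        self.factor = factor

    def apply(self, stream: Iterator[TemporalRecord]) -> Iterator[TemporalRecord]:
        for record in stream:
            value = getattr(record, self.field, None)
            # Create the output attribute if the record does not have it yet.
            record = self._ensure_output_field(record, value)
            if value is not None:
                setattr(record, self.to, float(value) * self.factor)
            yield record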
datapipeline/transforms/record/floor_time.py
@@ -1,17 +1,20 @@
- from __future__ import annotations
-
  from typing import Iterator

  from datapipeline.domain.record import TemporalRecord
- from datapipeline.config.dataset.normalize import floor_time_to_bucket
+ from datapipeline.transforms.interfaces import RecordTransformBase
+ from datapipeline.transforms.utils import floor_record_time


- def floor_time(stream: Iterator[TemporalRecord], cadence: str) -> Iterator[TemporalRecord]:
+ class FloorTimeRecordTransform(RecordTransformBase):
      """Floor record timestamps to the given cadence bucket (e.g., '1h', '10min').

      Useful before granularity aggregation to downsample within bins by making
      all intra-bin records share the same timestamp.
      """
-     for record in stream:
-         record.time = floor_time_to_bucket(record.time, cadence)
-         yield record
+
+     def __init__(self, cadence: str) -> None:
+         self.cadence = cadence
+
+     def apply(self, stream: Iterator[TemporalRecord]) -> Iterator[TemporalRecord]:
+         for record in stream:
+             yield floor_record_time(record, self.cadence)
datapipeline/transforms/record/lag.py
@@ -1,18 +1,16 @@
- from __future__ import annotations
-
  from datetime import timedelta
  from typing import Iterator

  from datapipeline.domain.record import TemporalRecord
  from datapipeline.utils.time import parse_timecode
+ from datapipeline.transforms.interfaces import RecordTransformBase


- def _shift_record_time(record: TemporalRecord, lag: timedelta) -> TemporalRecord:
-     record.time = record.time - lag
-     return record
-
+ class LagRecordTransform(RecordTransformBase):
+     def __init__(self, lag: str) -> None:
+         self.lag = parse_timecode(lag)

- def apply_lag(stream: Iterator[TemporalRecord], lag: str) -> Iterator[TemporalRecord]:
-     lag_td = parse_timecode(lag)
-     for record in stream:
-         yield _shift_record_time(record, lag_td)
+     def apply(self, stream: Iterator[TemporalRecord]) -> Iterator[TemporalRecord]:
+         for record in stream:
+             record.time = record.time - self.lag
+             yield record
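Both record transforms now follow the RecordTransformBase protocol, so instances are callable and composable. A small chaining sketch; the timecode strings are assumed to be accepted by parse_timecode:

# Composition sketch; cadence/lag strings are assumed valid timecodes.
from datapipeline.transforms.record.floor_time import FloorTimeRecordTransform
from datapipeline.transforms.record.lag import LagRecordTransform

floor_hourly = FloorTimeRecordTransform("1h")   # bucket timestamps to the hour
lag_one_hour = LagRecordTransform("1h")         # then shift them back by one hour

# __call__ delegates to apply(), so instances chain like plain generators:
# shifted = lag_one_hour(floor_hourly(records))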
datapipeline/transforms/sequence.py
@@ -1,5 +1,3 @@
- from __future__ import annotations
-
  from collections import deque
  from itertools import groupby
  from typing import Iterator
@@ -33,7 +31,7 @@ class WindowTransformer:
          """Assumes input is pre-sorted by (feature_id, record.time).

          Produces sliding windows per feature_id. Each output carries a
-         list[Record] in ``records``.
+         list[Record] in ``records`` and the selected values in ``values``.
          """

          grouped = groupby(stream, key=lambda fr: fr.id)
@@ -46,6 +44,7 @@ class WindowTransformer:
              if len(window) == self.size and step % self.stride == 0:
                  yield FeatureRecordSequence(
                      records=[r.record for r in window],
+                     values=[r.value for r in window],
                      id=fid,
                  )
              step += 1
datapipeline/transforms/stream/dedupe.py
@@ -1,22 +1,20 @@
- from __future__ import annotations
-
  from collections.abc import Iterator

- from datapipeline.domain.feature import FeatureRecord
+ from datapipeline.domain.record import TemporalRecord


  class FeatureDeduplicateTransform:
-     """Drop consecutive identical feature records (id + timestamp + payload)."""
+     """Drop consecutive identical records (timestamp + payload)."""

      def __init__(self, **_: object) -> None:
          # Accept arbitrary config mapping for consistency with other transforms.
          pass

-     def __call__(self, stream: Iterator[FeatureRecord]) -> Iterator[FeatureRecord]:
+     def __call__(self, stream: Iterator[TemporalRecord]) -> Iterator[TemporalRecord]:
          return self.apply(stream)

-     def apply(self, stream: Iterator[FeatureRecord]) -> Iterator[FeatureRecord]:
-         last: FeatureRecord | None = None
+     def apply(self, stream: Iterator[TemporalRecord]) -> Iterator[TemporalRecord]:
+         last: TemporalRecord | None = None
          for record in stream:
              if last is not None and record == last:
                  continue
datapipeline/transforms/stream/ensure_ticks.py
@@ -1,34 +1,49 @@
  from typing import Iterator

- from dataclasses import replace
-
- from datapipeline.domain.feature import FeatureRecord
  from datapipeline.domain.record import TemporalRecord
+ from datapipeline.transforms.interfaces import FieldStreamTransformBase
+ from datapipeline.transforms.utils import clone_record, get_field, partition_key
  from datapipeline.utils.time import parse_timecode


- def ensure_cadence(stream: Iterator[FeatureRecord], cadence: str) -> Iterator[FeatureRecord]:
-     """Insert placeholder FeatureRecords so timestamps are exactly one cadence apart per feature id.
+ class EnsureCadenceTransform(FieldStreamTransformBase):
+     """Insert placeholder records so timestamps are exactly one cadence apart per partition.

      - cadence: duration string (e.g., "10m", "1h", "30s").
-     - Placeholders carry value=None and inherit the feature id; group bucketing
-       is applied later at vector assembly from record.time.
-     - Assumes input sorted by (feature_id, record.time).
+     - Placeholders carry field=None and inherit partition metadata.
+     - Assumes input sorted by (partition_key, record.time).
      """
-     step = parse_timecode(cadence)
-     last: FeatureRecord | None = None
-     for fr in stream:
-         if (last is None) or (last.id != fr.id):
-             yield fr
-             last = fr
-             continue

-         expect = last.record.time + step
-         while expect < fr.record.time:
-             yield FeatureRecord(
-                 record=replace(last.record, time=expect, value=None),
-                 id=fr.id,
-             )
-             expect = expect + step
-         yield fr
-         last = fr
+     def __init__(
+         self,
+         *,
+         cadence: str,
+         field: str,
+         to: str | None = None,
+         partition_by: str | list[str] | None = None,
+     ) -> None:
+         super().__init__(field=field, to=to, partition_by=partition_by)
+         self.cadence = cadence
+
+     def apply(self, stream: Iterator[TemporalRecord]) -> Iterator[TemporalRecord]:
+         step = parse_timecode(self.cadence)
+         last: TemporalRecord | None = None
+         last_key: tuple | None = None
+         for record in stream:
+             if self.to != self.field:
+                 record = self._ensure_output_field(
+                     record, get_field(record, self.field)
+                 )
+             key = partition_key(record, self.partition_by)
+             if last is None or last_key != key:
+                 yield record
+                 last = record
+                 last_key = key
+                 continue
+
+             expect = last.time + step
+             while expect < record.time:
+                 yield clone_record(last, time=expect, **{self.to: None})
+                 expect = expect + step
+             yield record
+             last = record
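EnsureCadenceTransform replaces the old ensure_cadence function and is configured per field and partition. A constructor sketch mirroring the keyword-only signature above; the field and partition names are examples, not values from the package:

# Example field/partition names; only the signature comes from this diff.
from datapipeline.transforms.stream.ensure_ticks import EnsureCadenceTransform

ensure_hourly = EnsureCadenceTransform(
    cadence="1h",           # expected spacing between consecutive records
    field="close",          # inserted placeholders carry close=None
    partition_by="symbol",  # gaps are detected independently per symbol
)
# dense = ensure_hourly(sparse_stream)  # __call__ comes from StreamTransformBase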
datapipeline/transforms/stream/fill.py
@@ -1,17 +1,19 @@
+ from collections import deque
  from itertools import groupby
  from statistics import mean, median
- from typing import Any, Iterator
- from collections import deque
-
- from datapipeline.domain.feature import FeatureRecord, FeatureRecordSequence
- from datapipeline.transforms.utils import is_missing, clone_record_with_value
+ from typing import Iterator

+ from datapipeline.domain.record import TemporalRecord
+ from datapipeline.transforms.interfaces import FieldStreamTransformBase
+ from datapipeline.transforms.utils import (
+     get_field,
+     is_missing,
+     clone_record_with_field,
+     partition_key,
+ )

- def _extract_value(record: Any) -> Any:
-     return getattr(record, "value", None)

-
- class FillTransformer:
+ class FillTransformer(FieldStreamTransformBase):
      """Time-aware imputer using a strict rolling tick window.

      - window: number of recent ticks to consider (including missing ticks). A
@@ -23,7 +25,17 @@ class FillTransformer:
        window.
      """

-     def __init__(self, statistic: str = "median", window: int | None = None, min_samples: int = 1) -> None:
+     def __init__(
+         self,
+         *,
+         field: str,
+         to: str | None = None,
+         statistic: str = "median",
+         window: int | None = None,
+         min_samples: int = 1,
+         partition_by: str | list[str] | None = None,
+     ) -> None:
+         super().__init__(field=field, to=to, partition_by=partition_by)
          if window is None or window <= 0:
              raise ValueError("window must be a positive integer")
          if min_samples <= 0:
@@ -43,21 +55,19 @@ class FillTransformer:
              return None
          return float(self.statistic(values))

-     def __call__(self, stream: Iterator[FeatureRecord]) -> Iterator[FeatureRecordSequence]:
-         return self.apply(stream)
-
-     def apply(self, stream: Iterator[FeatureRecord]) -> Iterator[FeatureRecordSequence]:
-         grouped = groupby(stream, key=lambda fr: fr.id)
+     def apply(self, stream: Iterator[TemporalRecord]) -> Iterator[TemporalRecord]:
+         grouped = groupby(stream, key=lambda rec: partition_key(rec, self.partition_by))

-         for id, feature_records in grouped:
+         for _, records in grouped:
              # Store the last `window` ticks with a flag marking whether the tick
              # had an original (non-filled) valid value, and its numeric value.
              tick_window: deque[tuple[bool, float | None]] = deque(maxlen=self.window)

-             for fr in feature_records:
-                 if isinstance(fr.record, FeatureRecordSequence):
-                     raise TypeError("Fills should run before windowing transforms")
-                 value = _extract_value(fr.record)
+             for record in records:
+                 value = get_field(record, self.field)
+                 record = self._ensure_output_field(
+                     record, None if is_missing(value) else value
+                 )

                  if is_missing(value):
                      # Count valid values in the current window
@@ -67,15 +77,14 @@
                      if fill is not None:
                          # Do NOT treat filled value as original valid; append a missing marker
                          tick_window.append((False, None))
-                         yield FeatureRecord(
-                             record=clone_record_with_value(fr.record, fill),
-                             id=id,
+                         yield clone_record_with_field(
+                             record, self.to, fill
                          )
                          continue
                      # Not enough valid samples in window: pass through missing
                      tick_window.append((False, None))
-                     yield fr
+                     yield record
                  else:
                      as_float = float(value)
                      tick_window.append((True, as_float))
-                     yield fr
+                     yield record
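FillTransformer keeps its rolling-window imputation but now operates on a named field per partition. A constructor sketch with example values; only the parameter names come from the diff:

# Example configuration; field/partition names are invented.
from datapipeline.transforms.stream.fill import FillTransformer

fill_close = FillTransformer(
    field="close",          # read (possibly missing) values from `close`
    to="close_filled",      # optionally write imputed values to another field
    statistic="median",     # aggregate over the rolling tick window
    window=24,              # last 24 ticks, missing ticks included
    min_samples=3,          # require 3 observed values before filling
    partition_by="symbol",  # impute each symbol independently
)
# filled = fill_close(stream)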
datapipeline/transforms/stream/filter.py
@@ -0,0 +1,25 @@
+ from collections.abc import Iterator
+ from typing import Any
+
+ from datapipeline.domain.record import TemporalRecord
+ from datapipeline.filters import filters as _filters
+ from datapipeline.transforms.filter import apply_filter
+ from datapipeline.transforms.interfaces import StreamTransformBase
+
+
+ class FilterTransform(StreamTransformBase):
+     """Filter records by comparing a field on record payloads."""
+
+     def __init__(self, operator: str, field: str, comparand: Any) -> None:
+         self.operator = operator
+         self.field = field
+         self.comparand = comparand
+
+     def apply(self, stream: Iterator[TemporalRecord]) -> Iterator[TemporalRecord]:
+         return apply_filter(
+             stream,
+             field_getter=_filters.get_field,
+             operator=self.operator,
+             field=self.field,
+             comparand=self.comparand,
+         )
datapipeline/transforms/stream/floor_time.py
@@ -0,0 +1,16 @@
+ from typing import Iterator
+
+ from datapipeline.domain.record import TemporalRecord
+ from datapipeline.transforms.interfaces import StreamTransformBase
+ from datapipeline.transforms.utils import floor_record_time
+
+
+ class FloorTimeTransform(StreamTransformBase):
+     """Floor record timestamps to the given cadence bucket."""
+
+     def __init__(self, cadence: str) -> None:
+         self.cadence = cadence
+
+     def apply(self, stream: Iterator[TemporalRecord]) -> Iterator[TemporalRecord]:
+         for record in stream:
+             yield floor_record_time(record, self.cadence)
datapipeline/transforms/stream/granularity.py
@@ -1,79 +1,101 @@
- from __future__ import annotations
-
  from statistics import mean, median
  from typing import Iterator

- from datapipeline.domain.feature import FeatureRecord
+ from datapipeline.domain.record import TemporalRecord
+ from datapipeline.transforms.interfaces import FieldStreamTransformBase
+ from datapipeline.transforms.utils import (
+     get_field,
+     clone_record_with_field,
+     partition_key,
+ )


- class FeatureGranularityTransform:
-     """Normalize same-timestamp duplicates for non-sequence features.
+ class FeatureGranularityTransform(FieldStreamTransformBase):
+     """Normalize same-timestamp duplicates for non-sequence streams.

      Single-argument API (preferred for concise YAML):
      - "first" | "last" | "mean" | "median" => aggregate duplicates within a timestamp.
      """

-     def __init__(self, mode: str = "first") -> None:
+     def __init__(
+         self,
+         *,
+         field: str,
+         to: str | None = None,
+         mode: str = "first",
+         partition_by: str | list[str] | None = None,
+     ) -> None:
+         super().__init__(field=field, to=to, partition_by=partition_by)
          if mode not in {"first", "last", "mean", "median"}:
              raise ValueError(f"Unsupported granularity mode: {mode!r}")
          self.mode = mode

-     def _aggregate(self, items: list[FeatureRecord]) -> FeatureRecord:
+     def _aggregate(self, items: list[TemporalRecord]) -> TemporalRecord:
          vals: list[float] = []
-         for fr in items:
-             vals.append(float(fr.record.value))
+         for rec in items:
+             vals.append(float(get_field(rec, self.field)))
          if self.mode == "mean":
              agg_val = mean(vals)
          elif self.mode == "median":
              agg_val = median(vals)
          new = items[-1]
-         new.record.value = agg_val
-         return new
-
-     def __call__(self, stream: Iterator[FeatureRecord]) -> Iterator[FeatureRecord]:
-         return self.apply(stream)
+         return clone_record_with_field(new, self.to, agg_val)

-     def apply(self, stream: Iterator[FeatureRecord]) -> Iterator[FeatureRecord]:
+     def apply(self, stream: Iterator[TemporalRecord]) -> Iterator[TemporalRecord]:
          """Aggregate duplicates per timestamp while preserving order.

-         Precondition: input is sorted by (feature_id, record.time).
+         Precondition: input is sorted by (partition_key, record.time).

-         We process one base feature stream at a time (feature_id),
+         We process one base stream at a time (partition_key),
          bucket its records by timestamp, then aggregate each bucket according to
          the selected mode (first/last/mean/median), emitting in increasing timestamp
          order.
          """

-         # State for the current base stream: id
-         current_key: str | None = None
+         # State for the current base stream: partition key
+         current_key: tuple | None = None
          # Buckets of same-timestamp duplicates for the current base stream
          # Maintain insertion order of timestamps as encountered
-         time_buckets: dict[object, list[FeatureRecord]] = {}
+         time_buckets: dict[object, list[TemporalRecord]] = {}

-         def flush_current() -> Iterator[FeatureRecord]:
+         def flush_current() -> Iterator[TemporalRecord]:
              if current_key is None or not time_buckets:
                  return iter(())

              # Ordered list of timestamps as they appeared in the input
              ordered_times = list(time_buckets.keys())

-             out: list[FeatureRecord] = []
+             out: list[TemporalRecord] = []
              for t in ordered_times:
                  bucket = time_buckets.get(t, [])
                  if not bucket:
                      continue
                  if self.mode == "last":
-                     out.append(bucket[-1])
+                     last = bucket[-1]
+                     out.append(
+                         clone_record_with_field(
+                             last,
+                             self.to,
+                             get_field(last, self.field),
+                         )
+                     )
                  elif self.mode == "first":
-                     out.append(bucket[0])
+                     first = bucket[0]
+                     out.append(
+                         clone_record_with_field(
+                             first,
+                             self.to,
+                             get_field(first, self.field),
+                         )
+                     )
                  else:
                      out.append(self._aggregate(bucket))
              return iter(out)

-         for fr in stream:
-             base_key = fr.id
-             t = getattr(fr.record, "time", None)
-             # Start new base stream when feature_id changes
+         for record in stream:
+             base_key = partition_key(record, self.partition_by)
+             t = getattr(record, "time", None)
+             # Start new base stream when partition key changes
              if current_key is not None and base_key != current_key:
                  for out in flush_current():
                      yield out
@@ -82,9 +104,9 @@ class FeatureGranularityTransform:
              # Append to the bucket for this timestamp
              bucket = time_buckets.get(t)
              if bucket is None:
-                 time_buckets[t] = [fr]
+                 time_buckets[t] = [record]
              else:
-                 bucket.append(fr)
+                 bucket.append(record)

          # Flush any remaining base stream
          if current_key is not None:
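FeatureGranularityTransform now reads a named field and clones records via clone_record_with_field instead of mutating FeatureRecord payloads. A constructor sketch with example values; field and partition names are illustrative:

# Example configuration; field/partition names are invented.
from datapipeline.transforms.stream.granularity import FeatureGranularityTransform

collapse_duplicates = FeatureGranularityTransform(
    field="close",
    mode="mean",            # one of "first", "last", "mean", "median"
    partition_by="symbol",
)
# deduped = collapse_duplicates(sorted_stream)  # input sorted by (partition, time)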
datapipeline/transforms/stream/lag.py
@@ -0,0 +1,17 @@
+ from typing import Iterator
+
+ from datapipeline.domain.record import TemporalRecord
+ from datapipeline.transforms.interfaces import StreamTransformBase
+ from datapipeline.utils.time import parse_timecode
+
+
+ class LagTransform(StreamTransformBase):
+     """Shift record timestamps backwards by the given lag."""
+
+     def __init__(self, lag: str) -> None:
+         self.lag = parse_timecode(lag)
+
+     def apply(self, stream: Iterator[TemporalRecord]) -> Iterator[TemporalRecord]:
+         for record in stream:
+             record.time = record.time - self.lag
+             yield record
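The new stream-level FilterTransform, FloorTimeTransform, and LagTransform all derive from StreamTransformBase, so they compose the same way as the record transforms. A chaining sketch with made-up parameter values; only the class names and signatures come from this diff:

# Composition sketch; operator/field/cadence values are examples.
from datapipeline.transforms.stream.filter import FilterTransform
from datapipeline.transforms.stream.floor_time import FloorTimeTransform
from datapipeline.transforms.stream.lag import LagTransform

keep_liquid = FilterTransform(operator="gt", field="volume", comparand=0)
floor_hourly = FloorTimeTransform("1h")
lag_one_hour = LagTransform("1h")

# stage = lag_one_hour(floor_hourly(keep_liquid(stream)))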