jerry-thomas 1.0.3__py3-none-any.whl → 2.0.1__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- datapipeline/analysis/vector/collector.py +0 -1
- datapipeline/build/tasks/config.py +0 -2
- datapipeline/build/tasks/metadata.py +0 -2
- datapipeline/build/tasks/scaler.py +0 -2
- datapipeline/build/tasks/schema.py +0 -2
- datapipeline/build/tasks/utils.py +0 -2
- datapipeline/cli/app.py +201 -81
- datapipeline/cli/commands/contract.py +145 -283
- datapipeline/cli/commands/demo.py +13 -0
- datapipeline/cli/commands/domain.py +4 -4
- datapipeline/cli/commands/dto.py +11 -0
- datapipeline/cli/commands/filter.py +2 -2
- datapipeline/cli/commands/inspect.py +0 -68
- datapipeline/cli/commands/list_.py +30 -13
- datapipeline/cli/commands/loader.py +11 -0
- datapipeline/cli/commands/mapper.py +82 -0
- datapipeline/cli/commands/parser.py +45 -0
- datapipeline/cli/commands/run_config.py +1 -3
- datapipeline/cli/commands/serve_pipeline.py +5 -7
- datapipeline/cli/commands/source.py +106 -18
- datapipeline/cli/commands/stream.py +292 -0
- datapipeline/cli/visuals/common.py +0 -2
- datapipeline/cli/visuals/sections.py +0 -2
- datapipeline/cli/workspace_utils.py +0 -3
- datapipeline/config/context.py +0 -2
- datapipeline/config/dataset/feature.py +1 -0
- datapipeline/config/metadata.py +0 -2
- datapipeline/config/project.py +0 -2
- datapipeline/config/resolution.py +10 -2
- datapipeline/config/tasks.py +9 -9
- datapipeline/domain/feature.py +3 -0
- datapipeline/domain/record.py +7 -7
- datapipeline/domain/sample.py +0 -2
- datapipeline/domain/vector.py +6 -8
- datapipeline/integrations/ml/adapter.py +0 -2
- datapipeline/integrations/ml/pandas_support.py +0 -2
- datapipeline/integrations/ml/rows.py +0 -2
- datapipeline/integrations/ml/torch_support.py +0 -2
- datapipeline/io/output.py +0 -2
- datapipeline/io/serializers.py +26 -16
- datapipeline/mappers/synthetic/time.py +9 -2
- datapipeline/pipeline/artifacts.py +3 -5
- datapipeline/pipeline/observability.py +0 -2
- datapipeline/pipeline/pipelines.py +118 -34
- datapipeline/pipeline/stages.py +54 -18
- datapipeline/pipeline/utils/spool_cache.py +142 -0
- datapipeline/pipeline/utils/transform_utils.py +27 -2
- datapipeline/services/artifacts.py +1 -4
- datapipeline/services/constants.py +1 -0
- datapipeline/services/factories.py +4 -6
- datapipeline/services/paths.py +10 -1
- datapipeline/services/project_paths.py +0 -2
- datapipeline/services/runs.py +0 -2
- datapipeline/services/scaffold/contract_yaml.py +76 -0
- datapipeline/services/scaffold/demo.py +141 -0
- datapipeline/services/scaffold/discovery.py +115 -0
- datapipeline/services/scaffold/domain.py +21 -13
- datapipeline/services/scaffold/dto.py +31 -0
- datapipeline/services/scaffold/filter.py +2 -1
- datapipeline/services/scaffold/layout.py +96 -0
- datapipeline/services/scaffold/loader.py +61 -0
- datapipeline/services/scaffold/mapper.py +116 -0
- datapipeline/services/scaffold/parser.py +56 -0
- datapipeline/services/scaffold/plugin.py +14 -2
- datapipeline/services/scaffold/source_yaml.py +91 -0
- datapipeline/services/scaffold/stream_plan.py +129 -0
- datapipeline/services/scaffold/utils.py +187 -0
- datapipeline/sources/data_loader.py +0 -2
- datapipeline/sources/decoders.py +49 -8
- datapipeline/sources/factory.py +9 -6
- datapipeline/sources/foreach.py +18 -3
- datapipeline/sources/synthetic/time/parser.py +1 -1
- datapipeline/sources/transports.py +10 -4
- datapipeline/templates/demo_skeleton/demo/contracts/equity.ohlcv.yaml +33 -0
- datapipeline/templates/demo_skeleton/demo/contracts/time.ticks.hour_sin.yaml +22 -0
- datapipeline/templates/demo_skeleton/demo/contracts/time.ticks.linear.yaml +22 -0
- datapipeline/templates/demo_skeleton/demo/data/APPL.jsonl +19 -0
- datapipeline/templates/demo_skeleton/demo/data/MSFT.jsonl +19 -0
- datapipeline/templates/demo_skeleton/demo/dataset.yaml +19 -0
- datapipeline/templates/demo_skeleton/demo/postprocess.yaml +19 -0
- datapipeline/templates/demo_skeleton/demo/project.yaml +19 -0
- datapipeline/templates/demo_skeleton/demo/sources/sandbox.ohlcv.yaml +17 -0
- datapipeline/templates/{plugin_skeleton/example → demo_skeleton/demo}/sources/synthetic.ticks.yaml +1 -1
- datapipeline/templates/demo_skeleton/demo/tasks/metadata.yaml +2 -0
- datapipeline/templates/demo_skeleton/demo/tasks/scaler.yaml +3 -0
- datapipeline/templates/demo_skeleton/demo/tasks/schema.yaml +2 -0
- datapipeline/templates/demo_skeleton/demo/tasks/serve.test.yaml +4 -0
- datapipeline/templates/demo_skeleton/demo/tasks/serve.train.yaml +4 -0
- datapipeline/templates/demo_skeleton/demo/tasks/serve.val.yaml +4 -0
- datapipeline/templates/demo_skeleton/scripts/run_dataframe.py +20 -0
- datapipeline/templates/demo_skeleton/scripts/run_torch.py +23 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/__init__.py +0 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/domains/equity/__init__.py +0 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/domains/equity/model.py +18 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/dtos/__init__.py +0 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/dtos/sandbox_ohlcv_dto.py +14 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/mappers/__init__.py +0 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/mappers/map_sandbox_ohlcv_dto_to_equity.py +26 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/parsers/__init__.py +0 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/parsers/sandbox_ohlcv_dto_parser.py +46 -0
- datapipeline/templates/plugin_skeleton/README.md +57 -136
- datapipeline/templates/plugin_skeleton/jerry.yaml +12 -24
- datapipeline/templates/plugin_skeleton/reference/jerry.yaml +28 -0
- datapipeline/templates/plugin_skeleton/reference/reference/contracts/composed.reference.yaml +29 -0
- datapipeline/templates/plugin_skeleton/reference/reference/contracts/ingest.reference.yaml +31 -0
- datapipeline/templates/plugin_skeleton/reference/reference/contracts/overview.reference.yaml +34 -0
- datapipeline/templates/plugin_skeleton/reference/reference/dataset.yaml +29 -0
- datapipeline/templates/plugin_skeleton/reference/reference/postprocess.yaml +25 -0
- datapipeline/templates/plugin_skeleton/reference/reference/project.yaml +32 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/foreach.http.reference.yaml +24 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/foreach.reference.yaml +21 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/fs.reference.yaml +16 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/http.reference.yaml +17 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/overview.reference.yaml +18 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/synthetic.reference.yaml +15 -0
- datapipeline/templates/plugin_skeleton/reference/reference/tasks/metadata.reference.yaml +11 -0
- datapipeline/templates/plugin_skeleton/reference/reference/tasks/scaler.reference.yaml +10 -0
- datapipeline/templates/plugin_skeleton/reference/reference/tasks/schema.reference.yaml +10 -0
- datapipeline/templates/plugin_skeleton/reference/reference/tasks/serve.reference.yaml +28 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/domains/__init__.py +2 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/dtos/__init__.py +0 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/loaders/__init__.py +0 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/mappers/__init__.py +1 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/parsers/__init__.py +0 -0
- datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +12 -11
- datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +4 -13
- datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +9 -11
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +1 -2
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +1 -7
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +1 -1
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +1 -1
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +1 -25
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +1 -1
- datapipeline/templates/plugin_skeleton/your-interim-data-builder/dataset.yaml +9 -0
- datapipeline/templates/plugin_skeleton/your-interim-data-builder/postprocess.yaml +1 -0
- datapipeline/templates/plugin_skeleton/your-interim-data-builder/project.yaml +15 -0
- datapipeline/templates/plugin_skeleton/your-interim-data-builder/tasks/serve.all.yaml +8 -0
- datapipeline/templates/stubs/contracts/composed.yaml.j2 +10 -0
- datapipeline/templates/stubs/contracts/ingest.yaml.j2 +25 -0
- datapipeline/templates/stubs/dto.py.j2 +2 -2
- datapipeline/templates/stubs/filter.py.j2 +1 -1
- datapipeline/templates/stubs/loaders/basic.py.j2 +11 -0
- datapipeline/templates/stubs/mappers/composed.py.j2 +13 -0
- datapipeline/templates/stubs/mappers/ingest.py.j2 +20 -0
- datapipeline/templates/stubs/parser.py.j2 +5 -1
- datapipeline/templates/stubs/record.py.j2 +1 -1
- datapipeline/templates/stubs/source.yaml.j2 +1 -1
- datapipeline/transforms/debug/identity.py +34 -16
- datapipeline/transforms/debug/lint.py +14 -11
- datapipeline/transforms/feature/scaler.py +5 -12
- datapipeline/transforms/filter.py +73 -17
- datapipeline/transforms/interfaces.py +58 -0
- datapipeline/transforms/record/floor_time.py +10 -7
- datapipeline/transforms/record/lag.py +8 -10
- datapipeline/transforms/sequence.py +2 -3
- datapipeline/transforms/stream/dedupe.py +5 -7
- datapipeline/transforms/stream/ensure_ticks.py +39 -24
- datapipeline/transforms/stream/fill.py +34 -25
- datapipeline/transforms/stream/filter.py +25 -0
- datapipeline/transforms/stream/floor_time.py +16 -0
- datapipeline/transforms/stream/granularity.py +52 -30
- datapipeline/transforms/stream/lag.py +17 -0
- datapipeline/transforms/stream/rolling.py +72 -0
- datapipeline/transforms/utils.py +42 -10
- datapipeline/transforms/vector/drop/horizontal.py +0 -3
- datapipeline/transforms/vector/drop/orchestrator.py +0 -3
- datapipeline/transforms/vector/drop/vertical.py +0 -2
- datapipeline/transforms/vector/ensure_schema.py +0 -2
- datapipeline/utils/paths.py +0 -2
- datapipeline/utils/placeholders.py +0 -2
- datapipeline/utils/rich_compat.py +0 -3
- datapipeline/utils/window.py +0 -2
- jerry_thomas-2.0.1.dist-info/METADATA +269 -0
- jerry_thomas-2.0.1.dist-info/RECORD +264 -0
- {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.1.dist-info}/WHEEL +1 -1
- {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.1.dist-info}/entry_points.txt +7 -3
- datapipeline/services/scaffold/mappers.py +0 -55
- datapipeline/services/scaffold/source.py +0 -191
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +0 -31
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +0 -30
- datapipeline/templates/plugin_skeleton/example/dataset.yaml +0 -18
- datapipeline/templates/plugin_skeleton/example/postprocess.yaml +0 -29
- datapipeline/templates/plugin_skeleton/example/project.yaml +0 -23
- datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +0 -3
- datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +0 -9
- datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +0 -2
- datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +0 -4
- datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +0 -28
- datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +0 -4
- datapipeline/templates/stubs/mapper.py.j2 +0 -22
- jerry_thomas-1.0.3.dist-info/METADATA +0 -827
- jerry_thomas-1.0.3.dist-info/RECORD +0 -198
- {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.1.dist-info}/licenses/LICENSE +0 -0
- {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.1.dist-info}/top_level.txt +0 -0
--- a/datapipeline/transforms/filter.py
+++ b/datapipeline/transforms/filter.py
@@ -1,5 +1,5 @@
 from collections.abc import Iterator
-from typing import Any
+from typing import Any, Callable
 
 from datapipeline.filters import filters as _filters
 from datapipeline.plugins import FILTERS_EP
@@ -23,31 +23,24 @@ _ALIAS = {
 }
 
 
-def
+def normalize_operator(op: str) -> str:
     op = (op or "").strip()
     return _ALIAS.get(op, op)
 
 
-def
-    stream: Iterator[Any],
-    *,
+def resolve_filter(
     operator: str,
-
+    *,
     comparand: Any,
-) ->
-    """
+) -> tuple[str, Any | None]:
+    """Resolve a normalized operator and callable filter function.
 
-
-    - operator: one of eq, ne, lt, le, gt, ge, in, nin (case-sensitive), or a common alias
-    - field: record attribute/key to compare
-    - comparand: scalar for unary operators; list/tuple/set for membership (in/nin)
+    Returns (op, fn) where fn may be None if comparand is missing.
     """
-
     if is_missing(comparand):
-
-        return stream
+        return "", None
 
-    op =
+    op = normalize_operator(operator)
     fn = None
     try:
         fn = load_ep(FILTERS_EP, op)
@@ -57,4 +50,67 @@ def filter(
         raise ValueError(
             f"Unsupported filter operator: {operator!r} (normalized: {op!r})"
         )
-    return
+    return op, fn
+
+
+def apply_filter(
+    stream: Iterator[Any],
+    *,
+    field_getter: Callable[[Any, str], Any],
+    operator: str,
+    field: str,
+    comparand: Any,
+) -> Iterator[Any]:
+    op, fn = resolve_filter(operator, comparand=comparand)
+    if fn is None:
+        return stream
+    if getattr(fn, "__module__", None) != _filters.__name__:
+        return fn(stream, field, comparand)
+
+    if op in {"in_", "nin"}:
+        bag = _filters._as_set(comparand)
+
+        def apply_in() -> Iterator[Any]:
+            for record in stream:
+                left = field_getter(record, field)
+                if (left in bag) == (op == "in_"):
+                    yield record
+
+        return apply_in()
+
+    cmp = getattr(_filters._op, op, None)
+    if cmp is None:
+        raise ValueError(
+            f"Unsupported filter operator: {operator!r} (normalized: {op!r})"
+        )
+
+    def apply_cmp() -> Iterator[Any]:
+        for record in stream:
+            left = field_getter(record, field)
+            if _filters.compare_values(left, comparand, cmp):
+                yield record
+
+    return apply_cmp()
+
+
+def filter(
+    stream: Iterator[Any],
+    *,
+    operator: str,
+    field: str,
+    comparand: Any,
+) -> Iterator[Any]:
+    """Generic filter transform.
+
+    Parameters
+    - operator: one of eq, ne, lt, le, gt, ge, in, nin (case-sensitive), or a common alias
+    - field: record attribute/key to compare
+    - comparand: scalar for unary operators; list/tuple/set for membership (in/nin)
+    """
+    return apply_filter(
+        stream,
+        field_getter=_filters.get_field,
+        operator=operator,
+        field=field,
+        comparand=comparand,
+    )
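
The refactor splits operator resolution (resolve_filter) from stream application (apply_filter), with the public filter wrapping both. A hedged usage sketch, assuming the package is installed; the Tick record type is illustrative, not part of the diff:

# Hedged sketch: driving the refactored filter transform.
# Tick is hypothetical; attribute-style field access follows the
# docstring's "record attribute/key to compare".
from dataclasses import dataclass

from datapipeline.transforms.filter import filter as filter_stream

@dataclass
class Tick:
    symbol: str
    price: float

ticks = iter([Tick("AAPL", 10.0), Tick("MSFT", 20.0), Tick("AAPL", 30.0)])
kept = filter_stream(ticks, operator="eq", field="symbol", comparand="AAPL")
print([t.price for t in kept])  # expected: [10.0, 30.0]

Per resolve_filter, a missing comparand short-circuits: the stream is returned unfiltered rather than raising.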
--- /dev/null
+++ b/datapipeline/transforms/interfaces.py
@@ -0,0 +1,58 @@
+from abc import ABC, abstractmethod
+from collections.abc import Iterator
+from typing import Any, TypeVar
+
+from datapipeline.domain.record import TemporalRecord
+
+
+class StreamTransformBase(ABC):
+    """Base interface for stream transforms over TemporalRecord."""
+
+    def __call__(self, stream: Iterator[TemporalRecord]) -> Iterator[TemporalRecord]:
+        return self.apply(stream)
+
+    @abstractmethod
+    def apply(self, stream: Iterator[TemporalRecord]) -> Iterator[TemporalRecord]:
+        ...
+
+
+class FieldStreamTransformBase(StreamTransformBase):
+    """Base for stream transforms that read/write a record field."""
+
+    def __init__(
+        self,
+        field: str,
+        to: str | None = None,
+        partition_by: str | list[str] | None = None,
+    ) -> None:
+        if not field:
+            raise ValueError("field is required")
+        self.field = field
+        self.to = to or field
+        self.partition_by = partition_by
+
+    def _ensure_output_field(
+        self,
+        record: TemporalRecord,
+        value: Any = None,
+    ) -> TemporalRecord:
+        if self.to is None:
+            return record
+        if hasattr(record, self.to):
+            return record
+        setattr(record, self.to, value)
+        return record
+
+
+TRecord = TypeVar("TRecord", bound=TemporalRecord)
+
+
+class RecordTransformBase(ABC):
+    """Base interface for record transforms over TemporalRecord."""
+
+    def __call__(self, stream: Iterator[TRecord]) -> Iterator[TRecord]:
+        return self.apply(stream)
+
+    @abstractmethod
+    def apply(self, stream: Iterator[TRecord]) -> Iterator[TRecord]:
+        ...
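
Transforms built on these bases are callables over iterators, since __call__ delegates to apply. A minimal subclass sketch, assuming the package is installed; UppercaseSymbol is hypothetical:

from collections.abc import Iterator

from datapipeline.transforms.interfaces import StreamTransformBase

class UppercaseSymbol(StreamTransformBase):
    # Hypothetical transform: normalizes a symbol attribute in place.
    def apply(self, stream: Iterator) -> Iterator:
        for record in stream:
            record.symbol = record.symbol.upper()
            yield record

# Instances compose like plain functions: stream = UppercaseSymbol()(stream)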
--- a/datapipeline/transforms/record/floor_time.py
+++ b/datapipeline/transforms/record/floor_time.py
@@ -1,17 +1,20 @@
-from __future__ import annotations
-
 from typing import Iterator
 
 from datapipeline.domain.record import TemporalRecord
-from datapipeline.
+from datapipeline.transforms.interfaces import RecordTransformBase
+from datapipeline.transforms.utils import floor_record_time
 
 
-
+class FloorTimeRecordTransform(RecordTransformBase):
     """Floor record timestamps to the given cadence bucket (e.g., '1h', '10min').
 
     Useful before granularity aggregation to downsample within bins by making
     all intra-bin records share the same timestamp.
     """
-
-
-
+
+    def __init__(self, cadence: str) -> None:
+        self.cadence = cadence
+
+    def apply(self, stream: Iterator[TemporalRecord]) -> Iterator[TemporalRecord]:
+        for record in stream:
+            yield floor_record_time(record, self.cadence)
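
For intuition, flooring a timestamp to a '1h' cadence maps 10:47 to 10:00. The package's floor_record_time helper is referenced but not shown in this diff, so the sketch below stands in for it:

from datetime import datetime, timedelta

def floor_to_cadence(ts: datetime, cadence: timedelta) -> datetime:
    # Bucket the timestamp into fixed-width bins anchored at the epoch.
    epoch = datetime(1970, 1, 1, tzinfo=ts.tzinfo)
    return epoch + ((ts - epoch) // cadence) * cadence

print(floor_to_cadence(datetime(2024, 1, 1, 10, 47), timedelta(hours=1)))
# 2024-01-01 10:00:00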
--- a/datapipeline/transforms/record/lag.py
+++ b/datapipeline/transforms/record/lag.py
@@ -1,18 +1,16 @@
-from __future__ import annotations
-
 from datetime import timedelta
 from typing import Iterator
 
 from datapipeline.domain.record import TemporalRecord
 from datapipeline.utils.time import parse_timecode
+from datapipeline.transforms.interfaces import RecordTransformBase
 
 
-
-
-
-
+class LagRecordTransform(RecordTransformBase):
+    def __init__(self, lag: str) -> None:
+        self.lag = parse_timecode(lag)
 
-    def
-
-
-
+    def apply(self, stream: Iterator[TemporalRecord]) -> Iterator[TemporalRecord]:
+        for record in stream:
+            record.time = record.time - self.lag
+            yield record
--- a/datapipeline/transforms/sequence.py
+++ b/datapipeline/transforms/sequence.py
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
 from collections import deque
 from itertools import groupby
 from typing import Iterator
@@ -33,7 +31,7 @@ class WindowTransformer:
     """Assumes input is pre-sorted by (feature_id, record.time).
 
     Produces sliding windows per feature_id. Each output carries a
-    list[Record] in ``records``.
+    list[Record] in ``records`` and the selected values in ``values``.
     """
 
         grouped = groupby(stream, key=lambda fr: fr.id)
@@ -46,6 +44,7 @@ class WindowTransformer:
             if len(window) == self.size and step % self.stride == 0:
                 yield FeatureRecordSequence(
                     records=[r.record for r in window],
+                    values=[r.value for r in window],
                     id=fid,
                 )
             step += 1
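
WindowTransformer emits a window each time step % stride == 0 once size records have accumulated; the new values field carries the windowed values alongside the records. A self-contained sketch of that bookkeeping (the class itself is not reproduced here):

from collections import deque

def sliding(values, size, stride):
    window, step = deque(maxlen=size), 0
    for v in values:
        window.append(v)
        if len(window) == size and step % stride == 0:
            yield list(window)
        step += 1

print(list(sliding([1, 2, 3, 4, 5], size=3, stride=2)))
# [[1, 2, 3], [3, 4, 5]]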
--- a/datapipeline/transforms/stream/dedupe.py
+++ b/datapipeline/transforms/stream/dedupe.py
@@ -1,22 +1,20 @@
-from __future__ import annotations
-
 from collections.abc import Iterator
 
-from datapipeline.domain.
+from datapipeline.domain.record import TemporalRecord
 
 
 class FeatureDeduplicateTransform:
-    """Drop consecutive identical
+    """Drop consecutive identical records (timestamp + payload)."""
 
     def __init__(self, **_: object) -> None:
         # Accept arbitrary config mapping for consistency with other transforms.
         pass
 
-    def __call__(self, stream: Iterator[
+    def __call__(self, stream: Iterator[TemporalRecord]) -> Iterator[TemporalRecord]:
        return self.apply(stream)
 
-    def apply(self, stream: Iterator[
-        last:
+    def apply(self, stream: Iterator[TemporalRecord]) -> Iterator[TemporalRecord]:
+        last: TemporalRecord | None = None
         for record in stream:
             if last is not None and record == last:
                 continue
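
Consecutive-duplicate dropping needs only one record of lookback, so non-adjacent repeats survive; a sketch of the same loop over plain values:

def dedupe(stream):
    last = None
    for record in stream:
        if last is not None and record == last:
            continue  # drop exact repeat of the previous record
        last = record
        yield record

print(list(dedupe(iter([1, 1, 2, 2, 2, 1]))))  # [1, 2, 1]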
--- a/datapipeline/transforms/stream/ensure_ticks.py
+++ b/datapipeline/transforms/stream/ensure_ticks.py
@@ -1,34 +1,49 @@
 from typing import Iterator
 
-from dataclasses import replace
-
-from datapipeline.domain.feature import FeatureRecord
 from datapipeline.domain.record import TemporalRecord
+from datapipeline.transforms.interfaces import FieldStreamTransformBase
+from datapipeline.transforms.utils import clone_record, get_field, partition_key
 from datapipeline.utils.time import parse_timecode
 
 
-
-"""Insert placeholder
+class EnsureCadenceTransform(FieldStreamTransformBase):
+    """Insert placeholder records so timestamps are exactly one cadence apart per partition.
 
     - cadence: duration string (e.g., "10m", "1h", "30s").
-    - Placeholders carry
-
-    - Assumes input sorted by (feature_id, record.time).
+    - Placeholders carry field=None and inherit partition metadata.
+    - Assumes input sorted by (partition_key, record.time).
     """
-    step = parse_timecode(cadence)
-    last: FeatureRecord | None = None
-    for fr in stream:
-        if (last is None) or (last.id != fr.id):
-            yield fr
-            last = fr
-            continue
 
-
-
-
-
-
-
-
-
-
+    def __init__(
+        self,
+        *,
+        cadence: str,
+        field: str,
+        to: str | None = None,
+        partition_by: str | list[str] | None = None,
+    ) -> None:
+        super().__init__(field=field, to=to, partition_by=partition_by)
+        self.cadence = cadence
+
+    def apply(self, stream: Iterator[TemporalRecord]) -> Iterator[TemporalRecord]:
+        step = parse_timecode(self.cadence)
+        last: TemporalRecord | None = None
+        last_key: tuple | None = None
+        for record in stream:
+            if self.to != self.field:
+                record = self._ensure_output_field(
+                    record, get_field(record, self.field)
+                )
+            key = partition_key(record, self.partition_by)
+            if last is None or last_key != key:
+                yield record
+                last = record
+                last_key = key
+                continue
+
+            expect = last.time + step
+            while expect < record.time:
+                yield clone_record(last, time=expect, **{self.to: None})
+                expect = expect + step
+            yield record
+            last = record
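
The gap-filling loop synthesizes one placeholder per missing cadence step between consecutive records of a partition. A compact sketch with bare timestamps (placeholders marked None, as the transform does for the target field):

from datetime import datetime, timedelta

def ensure_ticks(times, step):
    last = None
    for t in times:
        if last is not None:
            expect = last + step
            while expect < t:
                yield (expect, None)  # synthesized placeholder tick
                expect += step
        yield (t, "real")
        last = t

hours = [datetime(2024, 1, 1, 9), datetime(2024, 1, 1, 12)]
print([(t.hour, v) for t, v in ensure_ticks(hours, timedelta(hours=1))])
# [(9, 'real'), (10, None), (11, None), (12, 'real')]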
--- a/datapipeline/transforms/stream/fill.py
+++ b/datapipeline/transforms/stream/fill.py
@@ -1,17 +1,19 @@
+from collections import deque
 from itertools import groupby
 from statistics import mean, median
-from typing import
-from collections import deque
-
-from datapipeline.domain.feature import FeatureRecord, FeatureRecordSequence
-from datapipeline.transforms.utils import is_missing, clone_record_with_value
+from typing import Iterator
 
+from datapipeline.domain.record import TemporalRecord
+from datapipeline.transforms.interfaces import FieldStreamTransformBase
+from datapipeline.transforms.utils import (
+    get_field,
+    is_missing,
+    clone_record_with_field,
+    partition_key,
+)
 
-def _extract_value(record: Any) -> Any:
-    return getattr(record, "value", None)
 
-
-class FillTransformer:
+class FillTransformer(FieldStreamTransformBase):
     """Time-aware imputer using a strict rolling tick window.
 
     - window: number of recent ticks to consider (including missing ticks). A
@@ -23,7 +25,17 @@ class FillTransformer:
       window.
     """
 
-    def __init__(
+    def __init__(
+        self,
+        *,
+        field: str,
+        to: str | None = None,
+        statistic: str = "median",
+        window: int | None = None,
+        min_samples: int = 1,
+        partition_by: str | list[str] | None = None,
+    ) -> None:
+        super().__init__(field=field, to=to, partition_by=partition_by)
         if window is None or window <= 0:
             raise ValueError("window must be a positive integer")
         if min_samples <= 0:
@@ -43,21 +55,19 @@ class FillTransformer:
             return None
         return float(self.statistic(values))
 
-    def
-
-
-    def apply(self, stream: Iterator[FeatureRecord]) -> Iterator[FeatureRecordSequence]:
-        grouped = groupby(stream, key=lambda fr: fr.id)
+    def apply(self, stream: Iterator[TemporalRecord]) -> Iterator[TemporalRecord]:
+        grouped = groupby(stream, key=lambda rec: partition_key(rec, self.partition_by))
 
-        for
+        for _, records in grouped:
             # Store the last `window` ticks with a flag marking whether the tick
             # had an original (non-filled) valid value, and its numeric value.
             tick_window: deque[tuple[bool, float | None]] = deque(maxlen=self.window)
 
-            for
-
-
-
+            for record in records:
+                value = get_field(record, self.field)
+                record = self._ensure_output_field(
+                    record, None if is_missing(value) else value
+                )
 
                 if is_missing(value):
                     # Count valid values in the current window
@@ -67,15 +77,14 @@ class FillTransformer:
                     if fill is not None:
                         # Do NOT treat filled value as original valid; append a missing marker
                         tick_window.append((False, None))
-                        yield
-                            record
-                            id=id,
+                        yield clone_record_with_field(
+                            record, self.to, fill
                         )
                         continue
                     # Not enough valid samples in window: pass through missing
                     tick_window.append((False, None))
-                    yield
+                    yield record
                 else:
                     as_float = float(value)
                     tick_window.append((True, as_float))
-                    yield
+                    yield record
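
The imputer keeps the last window ticks as (was_valid, value) pairs and fills a missing tick with a statistic over the valid ones, but only when at least min_samples are present; filled ticks are deliberately recorded as invalid so fills never feed later fills. A standalone sketch:

from collections import deque
from statistics import median

def fill(values, window=3, min_samples=2):
    ticks = deque(maxlen=window)  # (was_valid, value) per tick
    for v in values:
        if v is None:
            valid = [x for ok, x in ticks if ok]
            ticks.append((False, None))  # a fill is never counted as valid
            yield median(valid) if len(valid) >= min_samples else None
        else:
            ticks.append((True, float(v)))
            yield v

print(list(fill([1.0, 3.0, None, None, None])))
# [1.0, 3.0, 2.0, 2.0, None]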
--- /dev/null
+++ b/datapipeline/transforms/stream/filter.py
@@ -0,0 +1,25 @@
+from collections.abc import Iterator
+from typing import Any
+
+from datapipeline.domain.record import TemporalRecord
+from datapipeline.filters import filters as _filters
+from datapipeline.transforms.filter import apply_filter
+from datapipeline.transforms.interfaces import StreamTransformBase
+
+
+class FilterTransform(StreamTransformBase):
+    """Filter records by comparing a field on record payloads."""
+
+    def __init__(self, operator: str, field: str, comparand: Any) -> None:
+        self.operator = operator
+        self.field = field
+        self.comparand = comparand
+
+    def apply(self, stream: Iterator[TemporalRecord]) -> Iterator[TemporalRecord]:
+        return apply_filter(
+            stream,
+            field_getter=_filters.get_field,
+            operator=self.operator,
+            field=self.field,
+            comparand=self.comparand,
+        )
--- /dev/null
+++ b/datapipeline/transforms/stream/floor_time.py
@@ -0,0 +1,16 @@
+from typing import Iterator
+
+from datapipeline.domain.record import TemporalRecord
+from datapipeline.transforms.interfaces import StreamTransformBase
+from datapipeline.transforms.utils import floor_record_time
+
+
+class FloorTimeTransform(StreamTransformBase):
+    """Floor record timestamps to the given cadence bucket."""
+
+    def __init__(self, cadence: str) -> None:
+        self.cadence = cadence
+
+    def apply(self, stream: Iterator[TemporalRecord]) -> Iterator[TemporalRecord]:
+        for record in stream:
+            yield floor_record_time(record, self.cadence)
--- a/datapipeline/transforms/stream/granularity.py
+++ b/datapipeline/transforms/stream/granularity.py
@@ -1,79 +1,101 @@
-from __future__ import annotations
-
 from statistics import mean, median
 from typing import Iterator
 
-from datapipeline.domain.
+from datapipeline.domain.record import TemporalRecord
+from datapipeline.transforms.interfaces import FieldStreamTransformBase
+from datapipeline.transforms.utils import (
+    get_field,
+    clone_record_with_field,
+    partition_key,
+)
 
 
-class FeatureGranularityTransform:
-    """Normalize same-timestamp duplicates for non-sequence
+class FeatureGranularityTransform(FieldStreamTransformBase):
+    """Normalize same-timestamp duplicates for non-sequence streams.
 
     Single-argument API (preferred for concise YAML):
     - "first" | "last" | "mean" | "median" => aggregate duplicates within a timestamp.
     """
 
-    def __init__(
+    def __init__(
+        self,
+        *,
+        field: str,
+        to: str | None = None,
+        mode: str = "first",
+        partition_by: str | list[str] | None = None,
+    ) -> None:
+        super().__init__(field=field, to=to, partition_by=partition_by)
         if mode not in {"first", "last", "mean", "median"}:
             raise ValueError(f"Unsupported granularity mode: {mode!r}")
         self.mode = mode
 
-    def _aggregate(self, items: list[
+    def _aggregate(self, items: list[TemporalRecord]) -> TemporalRecord:
         vals: list[float] = []
-        for
-            vals.append(float(
+        for rec in items:
+            vals.append(float(get_field(rec, self.field)))
         if self.mode == "mean":
             agg_val = mean(vals)
         elif self.mode == "median":
             agg_val = median(vals)
         new = items[-1]
-        new.
-        return new
-
-    def __call__(self, stream: Iterator[FeatureRecord]) -> Iterator[FeatureRecord]:
-        return self.apply(stream)
+        return clone_record_with_field(new, self.to, agg_val)
 
-    def apply(self, stream: Iterator[
+    def apply(self, stream: Iterator[TemporalRecord]) -> Iterator[TemporalRecord]:
         """Aggregate duplicates per timestamp while preserving order.
 
-        Precondition: input is sorted by (
+        Precondition: input is sorted by (partition_key, record.time).
 
-        We process one base
+        We process one base stream at a time (partition_key),
         bucket its records by timestamp, then aggregate each bucket according to
         the selected mode (first/last/mean/median), emitting in increasing timestamp
         order.
        """
 
-        # State for the current base stream:
-        current_key:
+        # State for the current base stream: partition key
+        current_key: tuple | None = None
        # Buckets of same-timestamp duplicates for the current base stream
        # Maintain insertion order of timestamps as encountered
-        time_buckets: dict[object, list[
+        time_buckets: dict[object, list[TemporalRecord]] = {}
 
-        def flush_current() -> Iterator[
+        def flush_current() -> Iterator[TemporalRecord]:
            if current_key is None or not time_buckets:
                return iter(())
 
            # Ordered list of timestamps as they appeared in the input
            ordered_times = list(time_buckets.keys())
 
-            out: list[
+            out: list[TemporalRecord] = []
            for t in ordered_times:
                bucket = time_buckets.get(t, [])
                if not bucket:
                    continue
                if self.mode == "last":
-
+                    last = bucket[-1]
+                    out.append(
+                        clone_record_with_field(
+                            last,
+                            self.to,
+                            get_field(last, self.field),
+                        )
+                    )
                elif self.mode == "first":
-
+                    first = bucket[0]
+                    out.append(
+                        clone_record_with_field(
+                            first,
+                            self.to,
+                            get_field(first, self.field),
+                        )
+                    )
                else:
                    out.append(self._aggregate(bucket))
            return iter(out)
 
-        for
-        base_key =
-        t = getattr(
-        # Start new base stream when
+        for record in stream:
+            base_key = partition_key(record, self.partition_by)
+            t = getattr(record, "time", None)
+            # Start new base stream when partition key changes
            if current_key is not None and base_key != current_key:
                for out in flush_current():
                    yield out
@@ -82,9 +104,9 @@ class FeatureGranularityTransform:
            # Append to the bucket for this timestamp
            bucket = time_buckets.get(t)
            if bucket is None:
-                time_buckets[t] = [
+                time_buckets[t] = [record]
            else:
-                bucket.append(
+                bucket.append(record)
 
        # Flush any remaining base stream
        if current_key is not None:
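
Bucket by timestamp, then reduce each bucket by mode; a small sketch with (time, value) pairs (the real transform additionally partitions the stream and clones records):

from statistics import mean

def granularity(pairs, mode="mean"):
    buckets = {}
    for t, v in pairs:  # assumes input sorted by time
        buckets.setdefault(t, []).append(v)
    for t, vals in buckets.items():
        if mode == "first":
            yield t, vals[0]
        elif mode == "last":
            yield t, vals[-1]
        else:
            yield t, mean(vals)

print(list(granularity([(1, 10.0), (1, 20.0), (2, 5.0)])))
# [(1, 15.0), (2, 5.0)]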
--- /dev/null
+++ b/datapipeline/transforms/stream/lag.py
@@ -0,0 +1,17 @@
+from typing import Iterator
+
+from datapipeline.domain.record import TemporalRecord
+from datapipeline.transforms.interfaces import StreamTransformBase
+from datapipeline.utils.time import parse_timecode
+
+
+class LagTransform(StreamTransformBase):
+    """Shift record timestamps backwards by the given lag."""
+
+    def __init__(self, lag: str) -> None:
+        self.lag = parse_timecode(lag)
+
+    def apply(self, stream: Iterator[TemporalRecord]) -> Iterator[TemporalRecord]:
+        for record in stream:
+            record.time = record.time - self.lag
+            yield record