jerry-thomas 0.3.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datapipeline/analysis/vector/collector.py +120 -17
- datapipeline/analysis/vector/matrix.py +33 -8
- datapipeline/analysis/vector/report.py +162 -32
- datapipeline/build/tasks/__init__.py +11 -0
- datapipeline/build/tasks/config.py +74 -0
- datapipeline/build/tasks/metadata.py +170 -0
- datapipeline/build/tasks/scaler.py +73 -0
- datapipeline/build/tasks/schema.py +60 -0
- datapipeline/build/tasks/utils.py +169 -0
- datapipeline/cli/app.py +304 -127
- datapipeline/cli/commands/build.py +240 -16
- datapipeline/cli/commands/contract.py +367 -0
- datapipeline/cli/commands/domain.py +8 -3
- datapipeline/cli/commands/inspect.py +401 -149
- datapipeline/cli/commands/list_.py +30 -7
- datapipeline/cli/commands/plugin.py +5 -1
- datapipeline/cli/commands/run.py +227 -241
- datapipeline/cli/commands/run_config.py +101 -0
- datapipeline/cli/commands/serve_pipeline.py +156 -0
- datapipeline/cli/commands/source.py +44 -8
- datapipeline/cli/visuals/__init__.py +4 -2
- datapipeline/cli/visuals/common.py +239 -0
- datapipeline/cli/visuals/labels.py +15 -15
- datapipeline/cli/visuals/runner.py +66 -0
- datapipeline/cli/visuals/sections.py +20 -0
- datapipeline/cli/visuals/sources.py +132 -119
- datapipeline/cli/visuals/sources_basic.py +260 -0
- datapipeline/cli/visuals/sources_off.py +76 -0
- datapipeline/cli/visuals/sources_rich.py +414 -0
- datapipeline/config/catalog.py +37 -3
- datapipeline/config/context.py +214 -0
- datapipeline/config/dataset/loader.py +21 -4
- datapipeline/config/dataset/normalize.py +4 -4
- datapipeline/config/metadata.py +43 -0
- datapipeline/config/postprocess.py +2 -2
- datapipeline/config/project.py +3 -2
- datapipeline/config/resolution.py +129 -0
- datapipeline/config/tasks.py +309 -0
- datapipeline/config/workspace.py +155 -0
- datapipeline/domain/__init__.py +12 -0
- datapipeline/domain/record.py +11 -0
- datapipeline/domain/sample.py +54 -0
- datapipeline/integrations/ml/adapter.py +34 -20
- datapipeline/integrations/ml/pandas_support.py +0 -2
- datapipeline/integrations/ml/rows.py +1 -6
- datapipeline/integrations/ml/torch_support.py +1 -3
- datapipeline/io/factory.py +112 -0
- datapipeline/io/output.py +132 -0
- datapipeline/io/protocols.py +21 -0
- datapipeline/io/serializers.py +219 -0
- datapipeline/io/sinks/__init__.py +23 -0
- datapipeline/io/sinks/base.py +2 -0
- datapipeline/io/sinks/files.py +79 -0
- datapipeline/io/sinks/rich.py +57 -0
- datapipeline/io/sinks/stdout.py +18 -0
- datapipeline/io/writers/__init__.py +14 -0
- datapipeline/io/writers/base.py +28 -0
- datapipeline/io/writers/csv_writer.py +25 -0
- datapipeline/io/writers/jsonl.py +52 -0
- datapipeline/io/writers/pickle_writer.py +30 -0
- datapipeline/pipeline/artifacts.py +58 -0
- datapipeline/pipeline/context.py +66 -7
- datapipeline/pipeline/observability.py +65 -0
- datapipeline/pipeline/pipelines.py +65 -13
- datapipeline/pipeline/split.py +11 -10
- datapipeline/pipeline/stages.py +127 -16
- datapipeline/pipeline/utils/keygen.py +20 -7
- datapipeline/pipeline/utils/memory_sort.py +22 -10
- datapipeline/pipeline/utils/transform_utils.py +22 -0
- datapipeline/runtime.py +5 -2
- datapipeline/services/artifacts.py +12 -6
- datapipeline/services/bootstrap/config.py +25 -0
- datapipeline/services/bootstrap/core.py +52 -37
- datapipeline/services/constants.py +6 -5
- datapipeline/services/factories.py +123 -1
- datapipeline/services/project_paths.py +43 -16
- datapipeline/services/runs.py +208 -0
- datapipeline/services/scaffold/domain.py +3 -2
- datapipeline/services/scaffold/filter.py +3 -2
- datapipeline/services/scaffold/mappers.py +9 -6
- datapipeline/services/scaffold/plugin.py +54 -10
- datapipeline/services/scaffold/source.py +93 -56
- datapipeline/sources/{composed_loader.py → data_loader.py} +9 -9
- datapipeline/sources/decoders.py +83 -18
- datapipeline/sources/factory.py +26 -16
- datapipeline/sources/models/__init__.py +2 -2
- datapipeline/sources/models/generator.py +0 -7
- datapipeline/sources/models/loader.py +3 -3
- datapipeline/sources/models/parsing_error.py +24 -0
- datapipeline/sources/models/source.py +6 -6
- datapipeline/sources/synthetic/time/loader.py +14 -2
- datapipeline/sources/transports.py +74 -37
- datapipeline/templates/plugin_skeleton/README.md +76 -30
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +31 -0
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +30 -0
- datapipeline/templates/plugin_skeleton/example/dataset.yaml +18 -0
- datapipeline/templates/plugin_skeleton/example/postprocess.yaml +29 -0
- datapipeline/templates/plugin_skeleton/{config/datasets/default → example}/project.yaml +11 -8
- datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +12 -0
- datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +3 -0
- datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +9 -0
- datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +2 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +4 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +28 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +4 -0
- datapipeline/templates/plugin_skeleton/jerry.yaml +34 -0
- datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +31 -0
- datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +30 -0
- datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +18 -0
- datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +29 -0
- datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +22 -0
- datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +12 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +3 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +9 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +2 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +4 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +28 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +4 -0
- datapipeline/templates/stubs/dto.py.j2 +2 -0
- datapipeline/templates/stubs/mapper.py.j2 +5 -4
- datapipeline/templates/stubs/parser.py.j2 +2 -0
- datapipeline/templates/stubs/record.py.j2 +2 -0
- datapipeline/templates/stubs/source.yaml.j2 +2 -3
- datapipeline/transforms/debug/lint.py +26 -41
- datapipeline/transforms/feature/scaler.py +89 -13
- datapipeline/transforms/record/floor_time.py +4 -4
- datapipeline/transforms/sequence.py +2 -35
- datapipeline/transforms/stream/dedupe.py +24 -0
- datapipeline/transforms/stream/ensure_ticks.py +7 -6
- datapipeline/transforms/vector/__init__.py +5 -0
- datapipeline/transforms/vector/common.py +98 -0
- datapipeline/transforms/vector/drop/__init__.py +4 -0
- datapipeline/transforms/vector/drop/horizontal.py +79 -0
- datapipeline/transforms/vector/drop/orchestrator.py +59 -0
- datapipeline/transforms/vector/drop/vertical.py +182 -0
- datapipeline/transforms/vector/ensure_schema.py +184 -0
- datapipeline/transforms/vector/fill.py +87 -0
- datapipeline/transforms/vector/replace.py +62 -0
- datapipeline/utils/load.py +24 -3
- datapipeline/utils/rich_compat.py +38 -0
- datapipeline/utils/window.py +76 -0
- jerry_thomas-1.0.1.dist-info/METADATA +825 -0
- jerry_thomas-1.0.1.dist-info/RECORD +199 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/entry_points.txt +9 -8
- datapipeline/build/tasks.py +0 -186
- datapipeline/cli/commands/link.py +0 -128
- datapipeline/cli/commands/writers.py +0 -138
- datapipeline/config/build.py +0 -64
- datapipeline/config/run.py +0 -116
- datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.synthetic.yaml +0 -24
- datapipeline/templates/plugin_skeleton/config/contracts/time_linear.synthetic.yaml +0 -23
- datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +0 -9
- datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +0 -14
- datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +0 -13
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml +0 -11
- datapipeline/transforms/vector.py +0 -210
- jerry_thomas-0.3.0.dist-info/METADATA +0 -502
- jerry_thomas-0.3.0.dist-info/RECORD +0 -139
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/WHEEL +0 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/licenses/LICENSE +0 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/top_level.txt +0 -0
@@ -1,16 +1,17 @@
-from typing import
-
+from typing import Any, Iterator
+
 from {{PACKAGE_NAME}}.domains.{{TARGET_DOMAIN}}.model import {{DomainRecord}}
+from {{PACKAGE_NAME}}.sources.{{ORIGIN}}.{{DATASET}}.dto import {{OriginDTO}}


 def {{FUNCTION_NAME}}(
     stream: Iterator[{{OriginDTO}}],
-
+    **params: Any,
 ) -> Iterator[{{DomainRecord}}]:
     """Map raw {{ORIGIN}} DTOs to domain-level {{TARGET_DOMAIN}} records.

     - Required on domain record: time and value.
-    - Additional options may be passed via kwargs (e.g., mode="...").
+    - Additional options may be passed via kwargs (e.g., variant="..." or mode="...").
     """
     for dto in stream:
         # TODO: construct {{DomainRecord}} from dto fields
@@ -1,6 +1,5 @@
-# Required identifier for this raw source.
-#
-source_id: "{{ source_id }}"
+# Required identifier for this raw source. Contracts reference it under `source:`.
+id: "{{ id }}" # format: provider.dataset

 # parser.entrypoint: registered parser name (not a file path)
 parser:
@@ -1,24 +1,21 @@
 import logging
-import
+from datetime import timedelta
 from itertools import groupby
 from typing import Iterator

 from datapipeline.domain.feature import FeatureRecord
-from datapipeline.
+from datapipeline.utils.time import parse_timecode
+
+
 logger = logging.getLogger(__name__)


 class StreamLint:
-    """Validate a feature stream
+    """Validate structural properties of a feature stream (order, cadence, duplicates).

     Parameters
     - mode: 'warn' (default) logs warnings; 'error' raises on first violation
     - tick: optional cadence (e.g. '1h', '10m'); when set, check regularity
-    - check_missing: flag missing values (value is None/NaN)
-    - check_regular: flag gaps vs. expected tick
-    - check_duplicates: flag multiple records with same timestamp
-    - check_order: flag out-of-order timestamps within a feature stream
-    - check_finite: flag non-finite values (NaN/Inf)
     """

     def __init__(
@@ -26,19 +23,20 @@ class StreamLint:
         *,
         mode: str = "warn",
         tick: str | None = None,
-        check_missing: bool = True,
-        check_regular: bool = True,
-        check_duplicates: bool = True,
-        check_order: bool = True,
-        check_finite: bool = True,
     ) -> None:
         self.mode = mode
         self.tick = tick
-
-
-        self.
-        self.
-
+
+        # Pre-compute tick step in seconds when provided to avoid repeated parsing.
+        self._tick_seconds: int | None = None
+        if self.tick:
+            try:
+                self._tick_seconds = int(parse_timecode(self.tick).total_seconds())
+            except Exception:
+                logger.warning(
+                    "StreamLint: invalid tick %r (cadence checks disabled)", self.tick
+                )
+                self._tick_seconds = None

     def __call__(self, stream: Iterator[FeatureRecord]) -> Iterator[FeatureRecord]:
         return self.apply(stream)
@@ -55,46 +53,33 @@ class StreamLint:
         seen_times: set = set()
         for fr in records:
             t = getattr(fr.record, "time", None)
-            v = getattr(fr.record, "value", None)

             # Check ordering
-            if
+            if last_time is not None and t is not None and t < last_time:
                 self._violation(
                     f"out-of-order timestamp for feature '{fid}': {t} < {last_time}. "
                     f"Consider sorting upstream or fixing loader."
                 )

             # Check duplicates
-            if
+            if t in seen_times:
                 self._violation(
                     f"duplicate timestamp for feature '{fid}' at {t}. "
                     f"Consider a granularity transform (first/last/mean/median)."
                 )
             seen_times.add(t)

-            #
-            if
-                self.
-
-
-
-
-                self._violation(
-                    f"non-finite value for feature '{fid}' at {t}: {v}. "
-                    f"Consider filtering or scaling."
-                )
-
-            # Regularity check requires explicit tick; done at stream layer via ensure_ticks normally
-            if self.check_regular and self.tick and last_time is not None and t is not None:
-                # Lazy import to avoid cycle
-                from datapipeline.utils.time import parse_timecode
-
-                step = parse_timecode(self.tick)
-                expect = last_time + step
+            # Regularity check requires explicit tick; done at stream layer via ensure_cadence normally
+            if (
+                self._tick_seconds
+                and last_time is not None
+                and t is not None
+            ):
+                expect = last_time + timedelta(seconds=self._tick_seconds)
                 if t != expect and t > expect:
                     self._violation(
                         f"skipped tick(s) for feature '{fid}': expected {expect}, got {t}. "
-                    f"Consider using
+                        f"Consider using ensure_cadence."
                     )

             last_time = t
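The StreamLint rework above replaces the per-check flags with a single precomputed tick step and routes all cadence hints toward `ensure_cadence`. A minimal usage sketch, assuming a `FeatureRecord(record=..., id=...)` constructor and records exposing `.time`/`.value` as these hunks suggest; the `SimpleRecord` stand-in is hypothetical and not part of the package.

```python
# Sketch only: driving the reworked StreamLint over a small, sorted stream.
from dataclasses import dataclass
from datetime import datetime, timedelta

from datapipeline.domain.feature import FeatureRecord
from datapipeline.transforms.debug.lint import StreamLint


@dataclass
class SimpleRecord:          # hypothetical stand-in exposing .time and .value
    time: datetime
    value: float | None


start = datetime(2024, 1, 1)
records = [
    FeatureRecord(record=SimpleRecord(start + timedelta(hours=i), float(i)), id="ticks.linear")
    for i in (0, 1, 3)       # hour 2 is missing, so a skipped-tick violation is expected
]

lint = StreamLint(mode="warn", tick="1h")
for fr in lint(records):     # records pass through unchanged; violations are logged
    pass
```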
@@ -3,13 +3,14 @@ from collections import defaultdict
 from itertools import groupby
 from numbers import Real
 from pathlib import Path
-from typing import Any, Iterator
+from typing import Any, Callable, Iterator, Literal, Mapping

 from datapipeline.domain.feature import FeatureRecord
-from datapipeline.domain.
+from datapipeline.domain.sample import Sample
 from datapipeline.transforms.feature.model import FeatureTransform
 from datapipeline.transforms.utils import clone_record_with_value
 from datapipeline.utils.pickle_model import PicklePersistanceMixin
+from datapipeline.pipeline.observability import TransformEvent


 def _iter_numeric_values(value: Any) -> Iterator[float]:
@@ -38,12 +39,14 @@ class StandardScaler(PicklePersistanceMixin):
         self.with_std = with_std
         self.epsilon = epsilon
         self.statistics: dict[str, dict[str, float | int]] = {}
+        self.missing_counts: dict[str, int] = {}

-    def fit(self, vectors: Iterator[
+    def fit(self, vectors: Iterator[Sample]) -> int:
         trackers: dict[str, StandardScaler._RunningStats] = defaultdict(
             self._RunningStats)
         total = 0
-        for
+        for sample in vectors:
+            vector = sample.features
             values = getattr(vector, "values", {})
             for fid, raw in values.items():
                 for value in _iter_numeric_values(raw):
@@ -61,11 +64,19 @@ class StandardScaler(PicklePersistanceMixin):
         }
         return total

-    def transform(
+    def transform(
+        self,
+        stream: Iterator[FeatureRecord],
+        *,
+        on_none: Literal["error", "skip"] = "skip",
+        observer: Callable[[TransformEvent], None] | None = None,
+    ) -> Iterator[FeatureRecord]:
         if not self.statistics:
             raise RuntimeError(
                 "StandardScaler must be fitted before calling transform().")

+        self.missing_counts = {}
+
         grouped = groupby(stream, key=lambda fr: fr.id)
         for feature_id, records in grouped:
             stats = self.statistics.get(feature_id)
@@ -75,7 +86,29 @@ class StandardScaler(PicklePersistanceMixin):
             mean = float(stats.get("mean", 0.0))
             std = float(stats.get("std", 1.0))
             for fr in records:
-
+                value = fr.record.value
+                if not isinstance(value, Real):
+                    if value is None and on_none == "skip":
+                        self.missing_counts[feature_id] = (
+                            self.missing_counts.get(feature_id, 0) + 1
+                        )
+                        if observer is not None:
+                            observer(
+                                TransformEvent(
+                                    type="scaler_none",
+                                    payload={
+                                        "feature_id": feature_id,
+                                        "record": fr.record,
+                                        "count": self.missing_counts[feature_id],
+                                    },
+                                )
+                            )
+                        yield fr
+                        continue
+                    raise TypeError(
+                        f"Record value must be numeric, got {value!r}")
+
+                raw = float(value)
                 normalized = raw
                 if self.with_mean:
                     normalized -= mean
@@ -86,12 +119,36 @@ class StandardScaler(PicklePersistanceMixin):
                     id=fr.id,
                 )

-
-
-
-
-
-
+    def inverse_transform(
+        self,
+        stream: Iterator[FeatureRecord],
+    ) -> Iterator[FeatureRecord]:
+        if not self.statistics:
+            raise RuntimeError(
+                "StandardScaler must be fitted before calling inverse_transform().")
+
+        grouped = groupby(stream, key=lambda fr: fr.id)
+        for feature_id, records in grouped:
+            stats = self.statistics.get(feature_id)
+            if not stats:
+                raise KeyError(
+                    f"Missing scaler statistics for feature '{feature_id}'.")
+            mean = float(stats.get("mean", 0.0))
+            std = float(stats.get("std", 1.0))
+            for fr in records:
+                value = fr.record.value
+                if not isinstance(value, Real):
+                    raise TypeError(
+                        f"Record value must be numeric, got {value!r}")
+                restored = float(value)
+                if self.with_std:
+                    restored *= std
+                if self.with_mean:
+                    restored += mean
+                yield FeatureRecord(
+                    record=clone_record_with_value(fr.record, restored),
+                    id=fr.id,
+                )

     class _RunningStats:
         __slots__ = ("count", "mean", "m2")
@@ -132,6 +189,8 @@ class StandardScalerTransform(FeatureTransform):
         with_mean: bool = True,
         with_std: bool = True,
         epsilon: float = 1e-12,
+        on_none: Literal["error", "skip"] = "skip",
+        observer: Callable[[TransformEvent], None] | None = None,
     ) -> None:
         base: StandardScaler
         if scaler is not None:
@@ -152,6 +211,23 @@ class StandardScalerTransform(FeatureTransform):
             epsilon=epsilon,
         )
         self._scaler.statistics = dict(base.statistics)
+        self._on_none = on_none
+        self._observer = observer
+
+    @property
+    def missing_counts(self) -> dict[str, int]:
+        return dict(self._scaler.missing_counts)
+
+    def set_observer(self, observer: Callable[[TransformEvent], None] | None) -> None:
+        self._observer = observer

     def apply(self, stream: Iterator[FeatureRecord]) -> Iterator[FeatureRecord]:
-        yield from self._scaler.transform(
+        yield from self._scaler.transform(
+            stream,
+            on_none=self._on_none,
+            observer=self._observer,
+        )
+
+    def inverse(self, stream: Iterator[FeatureRecord]) -> Iterator[FeatureRecord]:
+        """Undo scaling using the fitted statistics."""
+        yield from self._scaler.inverse_transform(stream)
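Taken together, the scaler hunks add `on_none` handling with per-feature `missing_counts`, an optional `observer` hook fed `TransformEvent`s, and an `inverse_transform`. A minimal sketch of the call pattern, assuming `clone_record_with_value` works on any record exposing a `value` field; the `SimpleRecord` stand-in and the directly assigned statistics are illustrative shortcuts (in the pipeline, statistics come from `fit(Iterator[Sample])`).

```python
# Sketch only: fit-free round trip through the new StandardScaler API.
from dataclasses import dataclass
from datetime import datetime

from datapipeline.domain.feature import FeatureRecord
from datapipeline.pipeline.observability import TransformEvent
from datapipeline.transforms.feature.scaler import StandardScaler


@dataclass
class SimpleRecord:                      # hypothetical record exposing .value
    time: datetime
    value: float | None


scaler = StandardScaler()
scaler.statistics = {"ticks.linear": {"mean": 10.0, "std": 2.0}}

stream = [
    FeatureRecord(record=SimpleRecord(datetime(2024, 1, 1), 12.0), id="ticks.linear"),
    FeatureRecord(record=SimpleRecord(datetime(2024, 1, 2), None), id="ticks.linear"),
]

events: list[TransformEvent] = []
scaled = list(scaler.transform(iter(stream), on_none="skip", observer=events.append))

# None values are passed through and counted rather than raising.
print(scaler.missing_counts)             # {"ticks.linear": 1}

# inverse_transform requires numeric values, so drop the passed-through None.
numeric = (fr for fr in scaled if fr.record.value is not None)
restored = list(scaler.inverse_transform(numeric))
```

`StandardScalerTransform` wires the same knobs through its constructor (`on_none=`, `observer=`), exposes them via the `missing_counts` property and `set_observer`, and pairs `apply()` with an `inverse()` counterpart.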
@@ -3,15 +3,15 @@ from __future__ import annotations
 from typing import Iterator

 from datapipeline.domain.record import TemporalRecord
-from datapipeline.config.dataset.normalize import
+from datapipeline.config.dataset.normalize import floor_time_to_bucket


-def floor_time(stream: Iterator[TemporalRecord],
-    """Floor record timestamps to the given
+def floor_time(stream: Iterator[TemporalRecord], cadence: str) -> Iterator[TemporalRecord]:
+    """Floor record timestamps to the given cadence bucket (e.g., '1h', '10min').

     Useful before granularity aggregation to downsample within bins by making
     all intra-bin records share the same timestamp.
     """
     for record in stream:
-        record.time =
+        record.time = floor_time_to_bucket(record.time, cadence)
         yield record
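`floor_time` now takes a cadence string and delegates to `floor_time_to_bucket`. A small sketch, assuming records expose a mutable `.time`; the `StubRecord` stand-in is hypothetical.

```python
# Sketch only: flooring timestamps into 1-hour buckets before aggregation.
from dataclasses import dataclass
from datetime import datetime

from datapipeline.transforms.record.floor_time import floor_time


@dataclass
class StubRecord:            # hypothetical stand-in for a TemporalRecord
    time: datetime
    value: float


records = [StubRecord(datetime(2024, 1, 1, 9, 42), 1.0)]
for record in floor_time(iter(records), "1h"):
    print(record.time)       # expected: 2024-01-01 09:00:00
```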
@@ -1,14 +1,10 @@
 from __future__ import annotations

 from collections import deque
-import logging
 from itertools import groupby
 from typing import Iterator

 from datapipeline.domain.feature import FeatureRecord, FeatureRecordSequence
-from datapipeline.utils.time import parse_timecode
-
-logger = logging.getLogger(__name__)


 class WindowTransformer:
@@ -16,25 +12,15 @@ class WindowTransformer:
         self,
         size: int,
         stride: int = 1,
-        *,
-        tick: str | None = None,
     ) -> None:
         """Sliding windows over time-ordered feature streams.

         Parameters
         - size: window length in steps (int).
         - stride: step between windows (int number of steps).
-        - tick: duration string denoting the expected cadence of the stream.
-          Supports 's', 'm', 'h', 'd'. When provided, enforce completeness: only emit windows if
-          consecutive records are exactly one tick apart; gaps reset the
-          window. Examples: "1h", "10m". Optional.
         """

         self.size = int(size)
-        self._tick_seconds: int | None = (
-            int(parse_timecode(tick).total_seconds()) if tick else None
-        )
-
         self.stride = int(stride)

         if self.size <= 0 or self.stride <= 0:
@@ -52,33 +38,14 @@ class WindowTransformer:

         grouped = groupby(stream, key=lambda fr: fr.id)

-        for
+        for fid, records in grouped:
             window = deque(maxlen=self.size)
             step = 0
-            last_time = None
             for fr in records:
-                # Enforce completeness when configured and tick is known
-                if self._tick_seconds is not None:
-                    t = getattr(fr.record, "time", None)
-                    if t is not None and last_time is not None:
-                        delta = int((t - last_time).total_seconds())
-                        if delta != self._tick_seconds:
-                            logger.debug(
-                                "sequence gap: feature_id=%s expected=%ss delta=%ss last=%s now=%s",
-                                id,
-                                self._tick_seconds,
-                                delta,
-                                last_time,
-                                t,
-                            )
-                            window.clear()
-                            step = 0
-                    last_time = t
-
                 window.append(fr)
                 if len(window) == self.size and step % self.stride == 0:
                     yield FeatureRecordSequence(
                         records=[r.record for r in window],
-                        id=
+                        id=fid,
                     )
                 step += 1
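With the tick and gap handling removed, `WindowTransformer` only windows a per-feature, time-ordered stream; gaps are expected to be dealt with upstream (for instance by `ensure_cadence`). A sketch of the call shape; the `apply(...)` spelling is assumed from the pattern the other transforms in this diff follow, and the empty placeholder stream just keeps the sketch runnable.

```python
# Sketch only: 24-step sliding windows with stride 1 over a per-feature stream.
from datapipeline.transforms.sequence import WindowTransformer

windows = WindowTransformer(size=24, stride=1)

feature_stream = iter([])               # placeholder for an Iterator[FeatureRecord]
for seq in windows.apply(feature_stream):
    print(seq.id, len(seq.records))     # each seq is a FeatureRecordSequence
```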
@@ -0,0 +1,24 @@
+from __future__ import annotations
+
+from collections.abc import Iterator
+
+from datapipeline.domain.feature import FeatureRecord
+
+
+class FeatureDeduplicateTransform:
+    """Drop consecutive identical feature records (id + timestamp + payload)."""
+
+    def __init__(self, **_: object) -> None:
+        # Accept arbitrary config mapping for consistency with other transforms.
+        pass
+
+    def __call__(self, stream: Iterator[FeatureRecord]) -> Iterator[FeatureRecord]:
+        return self.apply(stream)
+
+    def apply(self, stream: Iterator[FeatureRecord]) -> Iterator[FeatureRecord]:
+        last: FeatureRecord | None = None
+        for record in stream:
+            if last is not None and record == last:
+                continue
+            last = record
+            yield record
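The new `FeatureDeduplicateTransform` only drops *consecutive* duplicates, so it is cheap to run after a sort but does not de-duplicate a stream globally. A trivial wiring sketch:

```python
# Sketch only: consecutive-duplicate filtering; the empty placeholder stream
# keeps the sketch runnable (real input is an Iterator[FeatureRecord]).
from datapipeline.transforms.stream.dedupe import FeatureDeduplicateTransform

dedupe = FeatureDeduplicateTransform()
unique = list(dedupe(iter([])))
```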
@@ -1,20 +1,21 @@
 from typing import Iterator

-from
+from dataclasses import replace
+
 from datapipeline.domain.feature import FeatureRecord
+from datapipeline.domain.record import TemporalRecord
 from datapipeline.utils.time import parse_timecode
-from dataclasses import replace


-def
-    """Insert placeholder FeatureRecords so timestamps are exactly one
+def ensure_cadence(stream: Iterator[FeatureRecord], cadence: str) -> Iterator[FeatureRecord]:
+    """Insert placeholder FeatureRecords so timestamps are exactly one cadence apart per feature id.

-    -
+    - cadence: duration string (e.g., "10m", "1h", "30s").
    - Placeholders carry value=None and inherit the feature id; group bucketing
      is applied later at vector assembly from record.time.
    - Assumes input sorted by (feature_id, record.time).
    """
-    step = parse_timecode(
+    step = parse_timecode(cadence)
    last: FeatureRecord | None = None
    for fr in stream:
        if (last is None) or (last.id != fr.id):
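The ensure_ticks module now exposes `ensure_cadence(stream, cadence)`, which inserts value=None placeholders so each feature id advances exactly one cadence step at a time. A sketch of the call; as the docstring in the hunk notes, input must already be sorted by (feature_id, record.time).

```python
# Sketch only: regularise a sorted feature stream to a 10-minute cadence.
# The empty placeholder keeps the sketch runnable; real input is an
# Iterator[FeatureRecord] sorted by (feature_id, record.time).
from datapipeline.transforms.stream.ensure_ticks import ensure_cadence

feature_stream = iter([])
regular = list(ensure_cadence(feature_stream, "10m"))
```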
@@ -0,0 +1,98 @@
+from typing import Literal
+
+from datapipeline.domain.sample import Sample
+from datapipeline.domain.vector import Vector
+from datapipeline.pipeline.context import (
+    PipelineContext,
+    try_get_current_context,
+)
+
+
+def select_vector(sample: Sample, payload: Literal["features", "targets"]) -> Vector | None:
+    if payload == "targets":
+        return sample.targets
+    return sample.features
+
+
+def replace_vector(sample: Sample, payload: Literal["features", "targets"], vector: Vector) -> Sample:
+    if payload == "targets":
+        return sample.with_targets(vector)
+    return sample.with_features(vector)
+
+
+class VectorContextMixin:
+    def __init__(self, payload: Literal["features", "targets"] = "features") -> None:
+        if payload not in {"features", "targets"}:
+            raise ValueError("payload must be 'features' or 'targets'")
+        self._context: PipelineContext | None = None
+        self._payload = payload
+
+    def bind_context(self, context: PipelineContext) -> None:
+        self._context = context
+
+    def _expected_ids(self, payload: str | None = None) -> list[str]:
+        """Return expected feature/target ids for the given payload.
+
+        When `payload` is omitted, the instance default is used.
+        """
+        ctx = self._context or try_get_current_context()
+        if not ctx:
+            return []
+        kind = payload or self._payload
+        if kind not in {"features", "targets"}:
+            return []
+        schema = ctx.load_schema(payload=kind) or []
+        ids = [
+            entry.get("id")
+            for entry in schema
+            if isinstance(entry, dict) and isinstance(entry.get("id"), str)
+        ]
+        return ids or []
+
+
+class VectorPostprocessBase(VectorContextMixin):
+    """Shared envelope for vector postprocess transforms.
+
+    Provides a consistent contract for payload selection and id filtering:
+    - payload: features | targets | both
+    - only: optional allow-list of ids
+    - exclude: optional deny-list of ids
+    """
+
+    def __init__(
+        self,
+        *,
+        payload: Literal["features", "targets", "both"] = "features",
+        only: list[str] | None = None,
+        exclude: list[str] | None = None,
+    ) -> None:
+        if payload not in {"features", "targets", "both"}:
+            raise ValueError(
+                "payload must be 'features', 'targets', or 'both'")
+        base_payload = "features" if payload == "both" else payload
+        super().__init__(payload=base_payload)
+        self._payload_mode: Literal["features", "targets", "both"] = payload
+        self._only = {str(fid) for fid in (only or [])} or None
+        self._exclude = {str(fid) for fid in (exclude or [])} or None
+        self._baseline_cache: dict[str, list[str]] = {}
+
+    def _payload_kinds(self) -> list[Literal["features", "targets"]]:
+        mode = self._payload_mode
+        kinds: list[Literal["features", "targets"]] = []
+        if mode in {"features", "both"}:
+            kinds.append("features")
+        if mode in {"targets", "both"}:
+            kinds.append("targets")
+        return kinds
+
+    def _ids_for(self, payload: Literal["features", "targets"]) -> list[str]:
+        cached = self._baseline_cache.get(payload)
+        if cached is not None:
+            return list(cached)
+        ids = self._expected_ids(payload=payload)
+        if self._only is not None:
+            ids = [fid for fid in ids if fid in self._only]
+        if self._exclude is not None:
+            ids = [fid for fid in ids if fid not in self._exclude]
+        self._baseline_cache[payload] = list(ids)
+        return list(ids)
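`VectorPostprocessBase` gives the new vector postprocess transforms a shared payload/only/exclude envelope plus schema-derived baseline ids from the pipeline context. A hypothetical subclass sketch (not part of the package) showing how `_payload_kinds`, `_ids_for`, and `select_vector` are meant to be combined; the `ticks.noise` id is made up.

```python
# Sketch only: a hypothetical postprocess transform built on VectorPostprocessBase.
# It counts missing cells per baseline id; with no bound context the baseline is
# empty and the transform is a pass-through.
from collections.abc import Iterator

from datapipeline.domain.sample import Sample
from datapipeline.transforms.vector.common import VectorPostprocessBase, select_vector


class CountMissingTransform(VectorPostprocessBase):
    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        self.missing: dict[str, int] = {}

    def apply(self, stream: Iterator[Sample]) -> Iterator[Sample]:
        for sample in stream:
            for kind in self._payload_kinds():
                vector = select_vector(sample, kind)
                if vector is None:
                    continue
                for fid in self._ids_for(kind):      # ids come from the bound schema
                    if vector.values.get(fid) is None:
                        self.missing[fid] = self.missing.get(fid, 0) + 1
            yield sample


counter = CountMissingTransform(payload="both", exclude=["ticks.noise"])
```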
@@ -0,0 +1,79 @@
+from __future__ import annotations
+
+from collections.abc import Iterator
+from typing import Literal
+
+from datapipeline.domain.sample import Sample
+from datapipeline.domain.vector import Vector
+from datapipeline.transforms.vector_utils import is_missing
+
+from ..common import VectorPostprocessBase, select_vector
+
+
+def cell_coverage(value) -> float:
+    """Return coverage for a single feature value.
+
+    Scalars: 1.0 when not missing, 0.0 when missing.
+    Lists: fraction of non-missing elements (0.0 for empty lists).
+    """
+    if isinstance(value, list):
+        if not value:
+            return 0.0
+        total = len(value)
+        ok = sum(1 for item in value if not is_missing(item))
+        return ok / total if total > 0 else 0.0
+    if is_missing(value):
+        return 0.0
+    return 1.0
+
+
+class VectorDropHorizontalTransform(VectorPostprocessBase):
+    """Horizontal (row-wise) drop based on coverage thresholds."""
+
+    def __init__(
+        self,
+        *,
+        threshold: float,
+        payload: Literal["features", "targets", "both"] = "features",
+        only: list[str] | None = None,
+        exclude: list[str] | None = None,
+    ) -> None:
+        if not 0.0 <= threshold <= 1.0:
+            raise ValueError("threshold must be between 0 and 1.")
+        super().__init__(payload=payload, only=only, exclude=exclude)
+        self._threshold = threshold
+
+    def __call__(self, stream: Iterator[Sample]) -> Iterator[Sample]:
+        return self.apply(stream)
+
+    def apply(self, stream: Iterator[Sample]) -> Iterator[Sample]:
+        for sample in stream:
+            total = 0.0
+            count = 0
+            for kind in self._payload_kinds():
+                baseline = self._ids_for(kind)
+                if not baseline:
+                    continue
+                vector = select_vector(sample, kind)
+                if vector is None:
+                    continue
+                total += self._horizontal_coverage(vector, baseline) * len(baseline)
+                count += len(baseline)
+            if count == 0:
+                yield sample
+                continue
+            coverage = total / float(count)
+            if coverage < self._threshold:
+                continue
+            yield sample
+
+    @staticmethod
+    def _horizontal_coverage(vector: Vector, baseline: list[str]) -> float:
+        if not baseline:
+            return 1.0
+        total = 0.0
+        for fid in baseline:
+            value = vector.values.get(fid)
+            total += cell_coverage(value)
+        return total / float(len(baseline))
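The horizontal drop filters whole samples whose coverage over the baseline ids falls below a threshold, and passes samples through when no baseline can be resolved. A minimal usage sketch; the id in `exclude` is hypothetical, and the empty placeholder stream just keeps it runnable.

```python
# Sketch only: drop samples whose feature coverage is below 80%, measured
# against the schema-derived baseline ids.
from datapipeline.transforms.vector.drop.horizontal import VectorDropHorizontalTransform

drop = VectorDropHorizontalTransform(
    threshold=0.8,
    payload="features",
    exclude=["ticks.debug"],
)

samples = iter([])              # placeholder for an Iterator[Sample]
kept = list(drop(samples))
```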