jerry-thomas 0.0.2__py3-none-any.whl → 0.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datapipeline/mappers/synthetic/time.py +1 -2
- datapipeline/transforms/transforms.py +137 -2
- {jerry_thomas-0.0.2.dist-info → jerry_thomas-0.0.5.dist-info}/METADATA +3 -2
- {jerry_thomas-0.0.2.dist-info → jerry_thomas-0.0.5.dist-info}/RECORD +8 -8
- {jerry_thomas-0.0.2.dist-info → jerry_thomas-0.0.5.dist-info}/entry_points.txt +4 -0
- {jerry_thomas-0.0.2.dist-info → jerry_thomas-0.0.5.dist-info}/WHEEL +0 -0
- {jerry_thomas-0.0.2.dist-info → jerry_thomas-0.0.5.dist-info}/licenses/LICENSE +0 -0
- {jerry_thomas-0.0.2.dist-info → jerry_thomas-0.0.5.dist-info}/top_level.txt +0 -0
|
@@ -12,8 +12,7 @@ def encode(stream: Iterator[TimeFeatureRecord], mode: str) -> Iterator[TimeFeatu
|
|
|
12
12
|
elif mode == "weekday_sin":
|
|
13
13
|
val = sin(2 * pi * t.weekday() / 7)
|
|
14
14
|
elif mode == "linear":
|
|
15
|
-
|
|
16
|
-
val = (t - start).total_seconds()
|
|
15
|
+
val = t.timestamp()
|
|
17
16
|
else:
|
|
18
17
|
raise ValueError(f"Unsupported encode_time mode: {mode}")
|
|
19
18
|
yield TimeFeatureRecord(time=rec.time, value=val)
|
|
@@ -1,9 +1,63 @@
|
|
|
1
|
-
from
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import is_dataclass, replace
|
|
2
4
|
from datetime import timedelta
|
|
3
|
-
from
|
|
5
|
+
from itertools import groupby
|
|
6
|
+
from math import sqrt
|
|
7
|
+
from numbers import Real
|
|
8
|
+
from typing import Any, Iterator, Mapping, MutableMapping
|
|
9
|
+
|
|
10
|
+
from datapipeline.domain.feature import FeatureRecord
|
|
11
|
+
from datapipeline.domain.record import Record, TimeFeatureRecord
|
|
4
12
|
from datapipeline.utils.time import parse_timecode
|
|
5
13
|
|
|
6
14
|
|
|
15
|
+
def _get_field(record: Any, field: str, default: Any = None) -> Any:
    """Look up *field* on *record*, whether it is a mapping or a plain object.

    Mappings are queried by key, everything else by attribute name.  When the
    key/attribute is absent, *default* is returned instead of raising.
    """
    if not isinstance(record, Mapping):
        # Attribute-style records (dataclasses, models, namespaces, ...).
        return getattr(record, field, default)
    # Dict-style records.
    return record.get(field, default)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _is_missing(value: Any) -> bool:
    """Return True when *value* should be treated as a missing observation.

    A value is missing when it is ``None`` or a real number that is NaN.
    Infinities, zero, and every non-numeric value (strings, containers, ...)
    are *not* considered missing.

    Args:
        value: The observation to inspect.

    Returns:
        A plain ``bool``; never a library-specific truthy object.
    """
    if value is None:
        return True
    if not isinstance(value, Real):
        # Only numbers can be NaN; anything else counts as present.
        return False
    try:
        # NaN is the only real number that compares unequal to itself.  This
        # avoids importing math/numpy and works for any Real implementation.
        # bool() normalises results such as numpy.bool_ to a builtin bool.
        return bool(value != value)
    except TypeError:
        # Exotic Real implementations that cannot be compared are treated as
        # present rather than crashing the stream.
        return False
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _clone_with_value(record: Any, value: float) -> Any:
    """Return a shallow copy of *record* with its ``value`` field replaced.

    The input record is never modified.

    Args:
        record: A mapping, a dataclass instance, or any object exposing a
            ``value`` attribute.
        value: The new value to store on the copy.

    Returns:
        A shallow copy of *record* carrying *value*.  Mutable mappings keep
        their concrete type; read-only mappings are returned as a plain
        ``dict`` because they cannot hold the replacement.

    Raises:
        TypeError: If *record* is a list, or is neither a mapping nor an
            object with a ``value`` attribute.
    """
    import copy

    if isinstance(record, list):
        raise TypeError(
            "StandardScalerTransform does not support sequence FeatureRecord payloads."
        )

    if isinstance(record, Mapping):
        if isinstance(record, MutableMapping):
            # copy.copy keeps the concrete type and its configuration (for
            # example a defaultdict's factory); calling type(record)(record)
            # breaks for mappings with a non-standard constructor.
            cloned: MutableMapping[str, Any] = copy.copy(record)
        else:
            # Read-only mapping: fall back to a plain dict we can write to.
            cloned = dict(record)
        cloned["value"] = value
        return cloned

    if hasattr(record, "value"):
        if is_dataclass(record):
            # replace() also works for frozen dataclasses.
            return replace(record, value=value)
        # Shallow-copy instead of re-invoking the constructor with __dict__:
        # that only works when __init__ mirrors the instance attributes and
        # the class does not use __slots__.
        cloned = copy.copy(record)
        cloned.value = value
        return cloned

    raise TypeError(f"Cannot replace value on record type: {type(record)!r}")
|
|
59
|
+
|
|
60
|
+
|
|
7
61
|
def shift_record_time(record: TimeFeatureRecord, lag: timedelta) -> TimeFeatureRecord:
    """Move *record* back in time by *lag* and hand the same record back.

    The record is updated in place: its ``time`` attribute is overwritten with
    ``time - lag``.  No copy is made, so the returned object is *record*.
    """
    shifted_time = record.time - lag
    record.time = shifted_time
    return record
|
|
@@ -13,3 +67,84 @@ def time_lag(stream: Iterator[TimeFeatureRecord], lag: str) -> Iterator[TimeFeat
|
|
|
13
67
|
lag_td = parse_timecode(lag)
|
|
14
68
|
for record in stream:
|
|
15
69
|
yield shift_record_time(record, lag_td)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def drop_missing_values(
    stream: Iterator[Any],
    field: str = "value",
) -> Iterator[Any]:
    """Lazily pass through only the records that carry a usable *field*.

    Each record's *field* is read with ``_get_field`` and tested with
    ``_is_missing``; records that test as missing are skipped, all others are
    yielded unchanged and in their original order.
    """
    yield from (
        item for item in stream if not _is_missing(_get_field(item, field))
    )
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
class StandardScalerTransform:
    """Standardize feature values to zero mean and unit variance per feature id.

    Records are grouped with ``itertools.groupby`` on ``feature_id``, so each
    *consecutive run* of records sharing a feature id is buffered in memory
    and scaled as one unit.  If the same feature id appears in several
    separate runs, each run is scaled with its own statistics.

    Args:
        with_mean: Subtract the mean from every value.
        with_std: Divide every value by the standard deviation.
        epsilon: Lower bound applied to the divisor so constant features do
            not cause a division by zero.
        statistics: Optional pre-computed ``{"mean": ..., "std": ...}`` per
            feature id.  When an entry exists for a feature id it is used
            instead of statistics computed from the stream.

    Attributes:
        stats_: The mean/std most recently resolved for each feature id
            (before the ``epsilon`` clamp is applied to the divisor).
    """

    def __init__(
        self,
        *,
        with_mean: bool = True,
        with_std: bool = True,
        epsilon: float = 1e-12,
        statistics: Mapping[str, Mapping[str, float]] | None = None,
    ) -> None:
        self.with_mean = with_mean
        self.with_std = with_std
        self.epsilon = epsilon
        # Copy so later changes to the caller's mapping do not leak in.
        self.statistics = dict(statistics or {})
        self.stats_: dict[str, dict[str, float]] = {}

    def _resolve_stats(
        self, feature_id: str, values: list[float]
    ) -> tuple[float, float]:
        """Return the ``(offset, divisor)`` pair used to scale *values*.

        Supplied statistics win over computed ones.  The offset is ``0.0``
        when ``with_mean`` is off; the divisor is ``1.0`` when ``with_std`` is
        off and is otherwise clamped to at least ``epsilon``.
        """
        if feature_id in self.statistics:
            stats = self.statistics[feature_id]
            mean = float(stats.get("mean", 0.0))
            std = float(stats.get("std", 1.0))
        else:
            count = len(values)
            # The true mean is needed for the variance even when centering is
            # disabled; measuring deviations from 0.0 would yield the
            # root-mean-square instead of the standard deviation.
            if self.with_mean or self.with_std:
                mean = sum(values) / count
            else:
                mean = 0.0
            if self.with_std:
                # Population variance (divide by n, not n - 1).
                std = sqrt(sum((v - mean) ** 2 for v in values) / count)
            else:
                std = 1.0

        if not self.with_mean:
            mean = 0.0
        if not self.with_std:
            std = 1.0

        self.stats_[feature_id] = {"mean": mean, "std": std}

        divisor = max(std, self.epsilon) if self.with_std else 1.0
        return (mean, divisor)

    def _extract_value(self, record: Record) -> float:
        """Return the record's ``value`` as a float.

        Raises:
            TypeError: If the value is not a real number.
        """
        value = _get_field(record, "value")
        if isinstance(value, Real):
            return float(value)
        raise TypeError(f"Record value must be numeric, got {value!r}")

    def apply(self, stream: Iterator[FeatureRecord]) -> Iterator[FeatureRecord]:
        """Yield scaled copies of the feature records in *stream*.

        The incoming records are left untouched; each output wraps a clone of
        the original payload carrying the normalized value.
        """
        for feature_id, records in groupby(stream, key=lambda fr: fr.feature_id):
            # groupby never yields an empty group, so bucket is non-empty.
            bucket = list(records)
            values = [self._extract_value(fr.record) for fr in bucket]
            mean, std = self._resolve_stats(feature_id, values)
            for fr, raw in zip(bucket, values):
                normalized = raw
                if self.with_mean:
                    normalized -= mean
                if self.with_std:
                    normalized /= std
                yield FeatureRecord(
                    record=_clone_with_value(fr.record, normalized),
                    feature_id=fr.feature_id,
                    group_key=fr.group_key,
                )
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: jerry-thomas
|
|
3
|
-
Version: 0.0.2
|
|
3
|
+
Version: 0.0.5
|
|
4
4
|
Summary: Jerry-Thomas: a stream-first, plugin-friendly data pipeline (mixology-themed CLI)
|
|
5
|
-
Author:
|
|
5
|
+
Author: Anders Skott Lind
|
|
6
6
|
License: MIT
|
|
7
7
|
Requires-Python: >=3.9
|
|
8
8
|
Description-Content-Type: text/markdown
|
|
@@ -12,6 +12,7 @@ Requires-Dist: pydantic>=1.8
|
|
|
12
12
|
Requires-Dist: PyYAML>=5.4
|
|
13
13
|
Requires-Dist: tqdm>=4.0
|
|
14
14
|
Requires-Dist: jinja2>=3.0
|
|
15
|
+
Requires-Dist: setuptools>=70
|
|
15
16
|
Dynamic: license-file
|
|
16
17
|
|
|
17
18
|
# Jerry Thomas
|
|
@@ -29,7 +29,7 @@ datapipeline/domain/record.py,sha256=WSIHMy3IvXjQqrXkysEmvhzQsOqfHjsSf2tfnwuTK_w
|
|
|
29
29
|
datapipeline/domain/vector.py,sha256=_5xFkRaGGc-rnwmVCTwkMNk8xBkLWGupubyMQrSTEMk,1152
|
|
30
30
|
datapipeline/filters/filters.py,sha256=L4Nnuxbi7KXwfFCfJULzr_-_rdnfiPLmIy_inQEySH4,2685
|
|
31
31
|
datapipeline/mappers/noop.py,sha256=L8bH1QVbLH-ogIam0ppYdx7KuWQ7Dj44lvD8tvNlY0Q,111
|
|
32
|
-
datapipeline/mappers/synthetic/time.py,sha256=
|
|
32
|
+
datapipeline/mappers/synthetic/time.py,sha256=ZrJsaUCpTHKTaVKud2PHYbmclpXWcgfDoOw5oiCM0Z4,651
|
|
33
33
|
datapipeline/parsers/identity.py,sha256=pdGuz0SSQGfySPpvZSnLgfTXTkC36x-7dQMMei3XhsU,321
|
|
34
34
|
datapipeline/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
35
35
|
datapipeline/pipeline/pipelines.py,sha256=87fqod7nMSnIVGHD-aBa7oWTZCfGLUWSBot9UM9qFBI,1600
|
|
@@ -87,13 +87,13 @@ datapipeline/templates/stubs/parser_custom.py.j2,sha256=0Nytq43JdTZoyRj-4Mz6HWdM
|
|
|
87
87
|
datapipeline/templates/stubs/record.py.j2,sha256=FDZyDR1mYTBWKRMDlLTB7PduBpbcADNrB80AK47e7qE,678
|
|
88
88
|
datapipeline/templates/stubs/source.yaml.j2,sha256=kdEWU7poH05UcDwkB8gNGjx2gaGDi5yhP0PGYbQ6yuE,283
|
|
89
89
|
datapipeline/transforms/sequence.py,sha256=ap3LM-ZmWt8MpJPEzZAEiZqhC9Z1PFB93rHxzID4F0A,1148
|
|
90
|
-
datapipeline/transforms/transforms.py,sha256=
|
|
90
|
+
datapipeline/transforms/transforms.py,sha256=PUXPHUY1dl6MSo9Gi-o-9QMG0QeMkD8aC-7BuPaXLJY,5037
|
|
91
91
|
datapipeline/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
92
92
|
datapipeline/utils/load.py,sha256=NVPEwKgK2DQSrB0OTRLf9N1yGBS5x9FxAY_gfo2BJ20,1177
|
|
93
93
|
datapipeline/utils/time.py,sha256=8E-vjUV4EnHVmhAjMozaRRD9WAf9C3sCGYsYmHczfa8,1009
|
|
94
|
-
jerry_thomas-0.0.
|
|
95
|
-
jerry_thomas-0.0.
|
|
96
|
-
jerry_thomas-0.0.
|
|
97
|
-
jerry_thomas-0.0.
|
|
98
|
-
jerry_thomas-0.0.
|
|
99
|
-
jerry_thomas-0.0.
|
|
94
|
+
jerry_thomas-0.0.5.dist-info/licenses/LICENSE,sha256=pkBMylAJF5yChHAkdxwFhEptLGx13i-XFEKh-Sh6DkM,1073
|
|
95
|
+
jerry_thomas-0.0.5.dist-info/METADATA,sha256=ONeBhBFGHp3O7669WEjhkF_zKkZ7Rl8ELBAETAdp_vU,13876
|
|
96
|
+
jerry_thomas-0.0.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
97
|
+
jerry_thomas-0.0.5.dist-info/entry_points.txt,sha256=z-idrww2BTME9Flc7URKhSYm3mWxE46wOw5Cfpjr-hw,1659
|
|
98
|
+
jerry_thomas-0.0.5.dist-info/top_level.txt,sha256=N8aoNPdPyHefODO4YAm7tqTaUcw0e8LDcqycFTf8TbM,13
|
|
99
|
+
jerry_thomas-0.0.5.dist-info/RECORD,,
|
|
@@ -34,7 +34,11 @@ identity = datapipeline.parsers.identity:IdentityParser
|
|
|
34
34
|
synthetic.time = datapipeline.sources.synthetic.time.parser:TimeRowParser
|
|
35
35
|
|
|
36
36
|
[datapipeline.transforms]
|
|
37
|
+
drop_missing = datapipeline.transforms.transforms:drop_missing_values
|
|
37
38
|
time_lag = datapipeline.transforms.transforms:time_lag
|
|
38
39
|
|
|
40
|
+
[datapipeline.transforms.feature]
|
|
41
|
+
standard_scale = datapipeline.transforms.transforms:StandardScalerTransform
|
|
42
|
+
|
|
39
43
|
[datapipeline.transforms.sequence]
|
|
40
44
|
time_window = datapipeline.transforms.sequence:TimeWindowTransformer
|
|
File without changes
|
|
File without changes
|
|
File without changes
|