jerry-thomas 0.0.2__py3-none-any.whl → 0.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,8 +12,7 @@ def encode(stream: Iterator[TimeFeatureRecord], mode: str) -> Iterator[TimeFeatu
12
12
  elif mode == "weekday_sin":
13
13
  val = sin(2 * pi * t.weekday() / 7)
14
14
  elif mode == "linear":
15
- start = t.replace(hour=0, minute=0, second=0, microsecond=0)
16
- val = (t - start).total_seconds()
15
+ val = t.timestamp()
17
16
  else:
18
17
  raise ValueError(f"Unsupported encode_time mode: {mode}")
19
18
  yield TimeFeatureRecord(time=rec.time, value=val)
@@ -1,9 +1,63 @@
1
- from datapipeline.domain.record import TimeFeatureRecord
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import is_dataclass, replace
2
4
  from datetime import timedelta
3
- from typing import Iterator
5
+ from itertools import groupby
6
+ from math import sqrt
7
+ from numbers import Real
8
+ from typing import Any, Iterator, Mapping, MutableMapping
9
+
10
+ from datapipeline.domain.feature import FeatureRecord
11
+ from datapipeline.domain.record import Record, TimeFeatureRecord
4
12
  from datapipeline.utils.time import parse_timecode
5
13
 
6
14
 
15
+ def _get_field(record: Any, field: str, default: Any = None) -> Any:
16
+ """Retrieve attribute *field* from *record* supporting dicts and objects."""
17
+
18
+ if isinstance(record, Mapping):
19
+ return record.get(field, default)
20
+ return getattr(record, field, default)
21
+
22
+
23
+ def _is_missing(value: Any) -> bool:
24
+ """Return True when *value* should be treated as a missing observation."""
25
+
26
+ if value is None:
27
+ return True
28
+ if isinstance(value, float): # covers NaN/inf cases
29
+ return value != value # NaN check without importing numpy
30
+ try:
31
+ if isinstance(value, Real):
32
+ return value != value
33
+ except TypeError:
34
+ pass
35
+ return False
36
+
37
+
38
+ def _clone_with_value(record: Any, value: float) -> Any:
39
+ """Return a shallow copy of *record* with its ``value`` field replaced."""
40
+
41
+ if isinstance(record, list):
42
+ raise TypeError(
43
+ "StandardScalerTransform does not support sequence FeatureRecord payloads."
44
+ )
45
+
46
+ if isinstance(record, Mapping):
47
+ cloned: MutableMapping[str, Any] = type(record)(record)
48
+ cloned["value"] = value
49
+ return cloned
50
+
51
+ if hasattr(record, "value"):
52
+ if is_dataclass(record):
53
+ return replace(record, value=value)
54
+ cloned = type(record)(**record.__dict__)
55
+ cloned.value = value
56
+ return cloned
57
+
58
+ raise TypeError(f"Cannot replace value on record type: {type(record)!r}")
59
+
60
+
7
61
def shift_record_time(record: TimeFeatureRecord, lag: timedelta) -> TimeFeatureRecord:
    """Move *record*'s timestamp back by *lag* and return the same record.

    The record is modified in place; the returned object is *record* itself,
    not a copy.
    """
    shifted = record.time - lag
    record.time = shifted
    return record
@@ -13,3 +67,84 @@ def time_lag(stream: Iterator[TimeFeatureRecord], lag: str) -> Iterator[TimeFeat
13
67
  lag_td = parse_timecode(lag)
14
68
  for record in stream:
15
69
  yield shift_record_time(record, lag_td)
70
+
71
+
72
def drop_missing_values(
    stream: Iterator[Any],
    field: str = "value",
) -> Iterator[Any]:
    """Lazily pass through only the records whose *field* is present.

    A record is skipped when ``_is_missing`` reports its *field* (looked up
    via ``_get_field``, so both mappings and objects work) as missing.
    Surviving records are yielded unchanged and in their original order.
    """
    for item in stream:
        if not _is_missing(_get_field(item, field)):
            yield item
83
+
84
+
85
class StandardScalerTransform:
    """Standardize feature values to zero mean and unit variance per feature id.

    Statistics are either supplied up front through *statistics* or computed
    from the records of each feature as they are seen in the stream (using
    the population variance, i.e. dividing by ``n``).

    Args:
        with_mean: subtract the feature mean from every value.
        with_std: divide every value by the feature standard deviation.
        epsilon: lower bound applied to the standard deviation before
            dividing, so constant features do not divide by zero.
        statistics: optional ``{feature_id: {"mean": ..., "std": ...}}``
            overrides; missing keys default to mean 0.0 and std 1.0.

    Attributes:
        stats_: statistics computed from the data, keyed by feature id.
            Features covered by *statistics* are not recorded here.
    """

    def __init__(
        self,
        *,
        with_mean: bool = True,
        with_std: bool = True,
        epsilon: float = 1e-12,
        statistics: Mapping[str, Mapping[str, float]] | None = None,
    ) -> None:
        self.with_mean = with_mean
        self.with_std = with_std
        self.epsilon = epsilon
        self.statistics = dict(statistics or {})
        self.stats_: dict[str, dict[str, float]] = {}

    def _resolve_stats(
        self, feature_id: str, values: list[float]
    ) -> tuple[float, float]:
        """Return the ``(mean, std)`` pair to apply to *feature_id*.

        Pre-supplied statistics win over computed ones. The returned mean is
        0.0 when centring is disabled; the returned std is 1.0 when scaling
        is disabled and is otherwise floored at ``epsilon``.
        """
        if feature_id in self.statistics:
            stats = self.statistics[feature_id]
            mean = float(stats.get("mean", 0.0))
            std = float(stats.get("std", 1.0))
        else:
            count = len(values)
            if self.with_mean or self.with_std:
                sample_mean = sum(values) / count
            else:
                sample_mean = 0.0
            if self.with_std:
                # Dispersion is always measured around the true sample mean,
                # even when centring is disabled; measuring it around 0.0
                # would yield the root-mean-square, not the standard
                # deviation, and the output would not have unit variance.
                variance = sum((v - sample_mean) ** 2 for v in values) / count
                std = sqrt(variance)
            else:
                std = 1.0
            mean = sample_mean if self.with_mean else 0.0
            # Record the raw (un-floored) statistics for later inspection.
            self.stats_[feature_id] = {"mean": mean, "std": std}
        std = max(std, self.epsilon) if self.with_std else 1.0
        return (mean if self.with_mean else 0.0, std)

    def _extract_value(self, record: Record) -> float:
        """Return the record's ``value`` as a float.

        Raises:
            TypeError: if the value is not a real number.
        """
        value = _get_field(record, "value")
        if isinstance(value, Real):
            return float(value)
        raise TypeError(f"Record value must be numeric, got {value!r}")

    def apply(self, stream: Iterator[FeatureRecord]) -> Iterator[FeatureRecord]:
        """Yield standardized copies of the feature records in *stream*.

        Records are grouped with ``itertools.groupby``, which only merges
        *consecutive* items sharing a ``feature_id``. Each run is buffered in
        memory, its statistics are resolved, and new ``FeatureRecord``
        objects wrapping cloned, rescaled records are yielded.

        NOTE(review): this presumably relies on the upstream stage emitting
        each feature id as one contiguous run; if an id re-appears later its
        statistics are recomputed for that run only -- confirm with callers.
        """
        for feature_id, records in groupby(stream, key=lambda fr: fr.feature_id):
            # groupby never produces an empty group, so the bucket always
            # holds at least one record.
            bucket = list(records)
            values = [self._extract_value(fr.record) for fr in bucket]
            mean, std = self._resolve_stats(feature_id, values)
            for fr, raw in zip(bucket, values):
                normalized = raw
                if self.with_mean:
                    normalized -= mean
                if self.with_std:
                    normalized /= std
                yield FeatureRecord(
                    record=_clone_with_value(fr.record, normalized),
                    feature_id=fr.feature_id,
                    group_key=fr.group_key,
                )
@@ -1,8 +1,8 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: jerry-thomas
3
- Version: 0.0.2
3
+ Version: 0.0.5
4
4
  Summary: Jerry-Thomas: a stream-first, plugin-friendly data pipeline (mixology-themed CLI)
5
- Author: Your Name
5
+ Author: Anders Skott Lind
6
6
  License: MIT
7
7
  Requires-Python: >=3.9
8
8
  Description-Content-Type: text/markdown
@@ -12,6 +12,7 @@ Requires-Dist: pydantic>=1.8
12
12
  Requires-Dist: PyYAML>=5.4
13
13
  Requires-Dist: tqdm>=4.0
14
14
  Requires-Dist: jinja2>=3.0
15
+ Requires-Dist: setuptools>=70
15
16
  Dynamic: license-file
16
17
 
17
18
  # Jerry Thomas
@@ -29,7 +29,7 @@ datapipeline/domain/record.py,sha256=WSIHMy3IvXjQqrXkysEmvhzQsOqfHjsSf2tfnwuTK_w
29
29
  datapipeline/domain/vector.py,sha256=_5xFkRaGGc-rnwmVCTwkMNk8xBkLWGupubyMQrSTEMk,1152
30
30
  datapipeline/filters/filters.py,sha256=L4Nnuxbi7KXwfFCfJULzr_-_rdnfiPLmIy_inQEySH4,2685
31
31
  datapipeline/mappers/noop.py,sha256=L8bH1QVbLH-ogIam0ppYdx7KuWQ7Dj44lvD8tvNlY0Q,111
32
- datapipeline/mappers/synthetic/time.py,sha256=tGZbVQFAhhG6ps-EJ7RXSgEU16MHJTHZtQ-c17EfpYY,738
32
+ datapipeline/mappers/synthetic/time.py,sha256=ZrJsaUCpTHKTaVKud2PHYbmclpXWcgfDoOw5oiCM0Z4,651
33
33
  datapipeline/parsers/identity.py,sha256=pdGuz0SSQGfySPpvZSnLgfTXTkC36x-7dQMMei3XhsU,321
34
34
  datapipeline/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
35
35
  datapipeline/pipeline/pipelines.py,sha256=87fqod7nMSnIVGHD-aBa7oWTZCfGLUWSBot9UM9qFBI,1600
@@ -87,13 +87,13 @@ datapipeline/templates/stubs/parser_custom.py.j2,sha256=0Nytq43JdTZoyRj-4Mz6HWdM
87
87
  datapipeline/templates/stubs/record.py.j2,sha256=FDZyDR1mYTBWKRMDlLTB7PduBpbcADNrB80AK47e7qE,678
88
88
  datapipeline/templates/stubs/source.yaml.j2,sha256=kdEWU7poH05UcDwkB8gNGjx2gaGDi5yhP0PGYbQ6yuE,283
89
89
  datapipeline/transforms/sequence.py,sha256=ap3LM-ZmWt8MpJPEzZAEiZqhC9Z1PFB93rHxzID4F0A,1148
90
- datapipeline/transforms/transforms.py,sha256=nZxknprRPTH6DPV2eVyZzeLB4VRsLTAqw4LU4ZAOqyw,511
90
+ datapipeline/transforms/transforms.py,sha256=PUXPHUY1dl6MSo9Gi-o-9QMG0QeMkD8aC-7BuPaXLJY,5037
91
91
  datapipeline/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
92
92
  datapipeline/utils/load.py,sha256=NVPEwKgK2DQSrB0OTRLf9N1yGBS5x9FxAY_gfo2BJ20,1177
93
93
  datapipeline/utils/time.py,sha256=8E-vjUV4EnHVmhAjMozaRRD9WAf9C3sCGYsYmHczfa8,1009
94
- jerry_thomas-0.0.2.dist-info/licenses/LICENSE,sha256=pkBMylAJF5yChHAkdxwFhEptLGx13i-XFEKh-Sh6DkM,1073
95
- jerry_thomas-0.0.2.dist-info/METADATA,sha256=4GkP5xKR5J8MCdotpJbL9URJk2c0-jzrZIKInddL4oU,13838
96
- jerry_thomas-0.0.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
97
- jerry_thomas-0.0.2.dist-info/entry_points.txt,sha256=2Lvi6aWL4MZKmIU8gzd9F-AAgzYPxi6ePcpPDSynrm0,1478
98
- jerry_thomas-0.0.2.dist-info/top_level.txt,sha256=N8aoNPdPyHefODO4YAm7tqTaUcw0e8LDcqycFTf8TbM,13
99
- jerry_thomas-0.0.2.dist-info/RECORD,,
94
+ jerry_thomas-0.0.5.dist-info/licenses/LICENSE,sha256=pkBMylAJF5yChHAkdxwFhEptLGx13i-XFEKh-Sh6DkM,1073
95
+ jerry_thomas-0.0.5.dist-info/METADATA,sha256=ONeBhBFGHp3O7669WEjhkF_zKkZ7Rl8ELBAETAdp_vU,13876
96
+ jerry_thomas-0.0.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
97
+ jerry_thomas-0.0.5.dist-info/entry_points.txt,sha256=z-idrww2BTME9Flc7URKhSYm3mWxE46wOw5Cfpjr-hw,1659
98
+ jerry_thomas-0.0.5.dist-info/top_level.txt,sha256=N8aoNPdPyHefODO4YAm7tqTaUcw0e8LDcqycFTf8TbM,13
99
+ jerry_thomas-0.0.5.dist-info/RECORD,,
@@ -34,7 +34,11 @@ identity = datapipeline.parsers.identity:IdentityParser
34
34
  synthetic.time = datapipeline.sources.synthetic.time.parser:TimeRowParser
35
35
 
36
36
  [datapipeline.transforms]
37
+ drop_missing = datapipeline.transforms.transforms:drop_missing_values
37
38
  time_lag = datapipeline.transforms.transforms:time_lag
38
39
 
40
+ [datapipeline.transforms.feature]
41
+ standard_scale = datapipeline.transforms.transforms:StandardScalerTransform
42
+
39
43
  [datapipeline.transforms.sequence]
40
44
  time_window = datapipeline.transforms.sequence:TimeWindowTransformer