PyPI - jerry-thomas - Versions diffs - 0.0.2__py3-none-any.whl → 0.0.5__py3-none-any.whl - Mend

jerry-thomas 0.0.2__py3-none-any.whl → 0.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

datapipeline/mappers/synthetic/time.py CHANGED Viewed

@@ -12,8 +12,7 @@ def encode(stream: Iterator[TimeFeatureRecord], mode: str) -> Iterator[TimeFeatu
         elif mode == "weekday_sin":
             val = sin(2 * pi * t.weekday() / 7)
         elif mode == "linear":
-            start = t.replace(hour=0, minute=0, second=0, microsecond=0)
-            val = (t - start).total_seconds()
+            val = t.timestamp()
         else:
             raise ValueError(f"Unsupported encode_time mode: {mode}")
         yield TimeFeatureRecord(time=rec.time, value=val)

datapipeline/transforms/transforms.py CHANGED Viewed

@@ -1,9 +1,63 @@
-from datapipeline.domain.record import TimeFeatureRecord
+from __future__ import annotations
+from dataclasses import is_dataclass, replace
 from datetime import timedelta
-from typing import Iterator
+from itertools import groupby
+from math import sqrt
+from numbers import Real
+from typing import Any, Iterator, Mapping, MutableMapping
+from datapipeline.domain.feature import FeatureRecord
+from datapipeline.domain.record import Record, TimeFeatureRecord
 from datapipeline.utils.time import parse_timecode
+def _get_field(record: Any, field: str, default: Any = None) -> Any:
+    """Retrieve attribute *field* from *record* supporting dicts and objects."""
+    if isinstance(record, Mapping):
+        return record.get(field, default)
+    return getattr(record, field, default)
+def _is_missing(value: Any) -> bool:
+    """Return True when *value* should be treated as a missing observation."""
+    if value is None:
+        return True
+    if isinstance(value, float):  # covers NaN/inf cases
+        return value != value  # NaN check without importing numpy
+    try:
+        if isinstance(value, Real):
+            return value != value
+    except TypeError:
+        pass
+    return False
+def _clone_with_value(record: Any, value: float) -> Any:
+    """Return a shallow copy of *record* with its ``value`` field replaced."""
+    if isinstance(record, list):
+        raise TypeError(
+            "StandardScalerTransform does not support sequence FeatureRecord payloads."
+        )
+    if isinstance(record, Mapping):
+        cloned: MutableMapping[str, Any] = type(record)(record)
+        cloned["value"] = value
+        return cloned
+    if hasattr(record, "value"):
+        if is_dataclass(record):
+            return replace(record, value=value)
+        cloned = type(record)(**record.__dict__)
+        cloned.value = value
+        return cloned
+    raise TypeError(f"Cannot replace value on record type: {type(record)!r}")
 def shift_record_time(record: TimeFeatureRecord, lag: timedelta) -> TimeFeatureRecord:
     record.time = record.time - lag
     return record
@@ -13,3 +67,84 @@ def time_lag(stream: Iterator[TimeFeatureRecord], lag: str) -> Iterator[TimeFeat
     lag_td = parse_timecode(lag)
     for record in stream:
         yield shift_record_time(record, lag_td)
+def drop_missing_values(
+    stream: Iterator[Any],
+    field: str = "value",
+) -> Iterator[Any]:
+    """Filter out records whose *field* contains a missing/null value."""
+    for record in stream:
+        value = _get_field(record, field)
+        if _is_missing(value):
+            continue
+        yield record
+class StandardScalerTransform:
+    """Standardize feature values to zero mean and unit variance per feature id."""
+    def __init__(
+        self,
+        *,
+        with_mean: bool = True,
+        with_std: bool = True,
+        epsilon: float = 1e-12,
+        statistics: Mapping[str, Mapping[str, float]] | None = None,
+    ) -> None:
+        self.with_mean = with_mean
+        self.with_std = with_std
+        self.epsilon = epsilon
+        self.statistics = dict(statistics or {})
+        self.stats_: dict[str, dict[str, float]] = {}
+    def _resolve_stats(
+        self, feature_id: str, values: list[float]
+    ) -> tuple[float, float]:
+        if feature_id in self.statistics:
+            stats = self.statistics[feature_id]
+            mean = float(stats.get("mean", 0.0))
+            std = float(stats.get("std", 1.0))
+        else:
+            mean = sum(values) / len(values) if self.with_mean else 0.0
+            if self.with_std:
+                variance = sum((v - mean) ** 2 for v in values) / len(values)
+                std = sqrt(variance)
+            else:
+                std = 1.0
+            self.stats_[feature_id] = {
+                "mean": mean if self.with_mean else 0.0,
+                "std": std if self.with_std else 1.0,
+            }
+        if self.with_std:
+            std = max(std, self.epsilon)
+        else:
+            std = 1.0
+        return (mean if self.with_mean else 0.0, std)
+    def _extract_value(self, record: Record) -> float:
+        value = _get_field(record, "value")
+        if isinstance(value, Real):
+            return float(value)
+        raise TypeError(f"Record value must be numeric, got {value!r}")
+    def apply(self, stream: Iterator[FeatureRecord]) -> Iterator[FeatureRecord]:
+        grouped = groupby(stream, key=lambda fr: fr.feature_id)
+        for feature_id, records in grouped:
+            bucket = list(records)
+            if not bucket:
+                continue
+            values = [self._extract_value(fr.record) for fr in bucket]
+            mean, std = self._resolve_stats(feature_id, values)
+            for fr, raw in zip(bucket, values):
+                normalized = raw
+                if self.with_mean:
+                    normalized -= mean
+                if self.with_std:
+                    normalized /= std
+                yield FeatureRecord(
+                    record=_clone_with_value(fr.record, normalized),
+                    feature_id=fr.feature_id,
+                    group_key=fr.group_key,
+                )

{jerry_thomas-0.0.2.dist-info → jerry_thomas-0.0.5.dist-info}/METADATA RENAMED Viewed

@@ -1,8 +1,8 @@
 Metadata-Version: 2.4
 Name: jerry-thomas
-Version: 0.0.2
+Version: 0.0.5
 Summary: Jerry-Thomas: a stream-first, plugin-friendly data pipeline (mixology-themed CLI)
-Author: Your Name
+Author: Anders Skott Lind
 License: MIT
 Requires-Python: >=3.9
 Description-Content-Type: text/markdown
@@ -12,6 +12,7 @@ Requires-Dist: pydantic>=1.8
 Requires-Dist: PyYAML>=5.4
 Requires-Dist: tqdm>=4.0
 Requires-Dist: jinja2>=3.0
+Requires-Dist: setuptools>=70
 Dynamic: license-file
 # Jerry Thomas

{jerry_thomas-0.0.2.dist-info → jerry_thomas-0.0.5.dist-info}/RECORD RENAMED Viewed

@@ -29,7 +29,7 @@ datapipeline/domain/record.py,sha256=WSIHMy3IvXjQqrXkysEmvhzQsOqfHjsSf2tfnwuTK_w
 datapipeline/domain/vector.py,sha256=_5xFkRaGGc-rnwmVCTwkMNk8xBkLWGupubyMQrSTEMk,1152
 datapipeline/filters/filters.py,sha256=L4Nnuxbi7KXwfFCfJULzr_-_rdnfiPLmIy_inQEySH4,2685
 datapipeline/mappers/noop.py,sha256=L8bH1QVbLH-ogIam0ppYdx7KuWQ7Dj44lvD8tvNlY0Q,111
-datapipeline/mappers/synthetic/time.py,sha256=tGZbVQFAhhG6ps-EJ7RXSgEU16MHJTHZtQ-c17EfpYY,738
+datapipeline/mappers/synthetic/time.py,sha256=ZrJsaUCpTHKTaVKud2PHYbmclpXWcgfDoOw5oiCM0Z4,651
 datapipeline/parsers/identity.py,sha256=pdGuz0SSQGfySPpvZSnLgfTXTkC36x-7dQMMei3XhsU,321
 datapipeline/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datapipeline/pipeline/pipelines.py,sha256=87fqod7nMSnIVGHD-aBa7oWTZCfGLUWSBot9UM9qFBI,1600
@@ -87,13 +87,13 @@ datapipeline/templates/stubs/parser_custom.py.j2,sha256=0Nytq43JdTZoyRj-4Mz6HWdM
 datapipeline/templates/stubs/record.py.j2,sha256=FDZyDR1mYTBWKRMDlLTB7PduBpbcADNrB80AK47e7qE,678
 datapipeline/templates/stubs/source.yaml.j2,sha256=kdEWU7poH05UcDwkB8gNGjx2gaGDi5yhP0PGYbQ6yuE,283
 datapipeline/transforms/sequence.py,sha256=ap3LM-ZmWt8MpJPEzZAEiZqhC9Z1PFB93rHxzID4F0A,1148
-datapipeline/transforms/transforms.py,sha256=nZxknprRPTH6DPV2eVyZzeLB4VRsLTAqw4LU4ZAOqyw,511
+datapipeline/transforms/transforms.py,sha256=PUXPHUY1dl6MSo9Gi-o-9QMG0QeMkD8aC-7BuPaXLJY,5037
 datapipeline/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datapipeline/utils/load.py,sha256=NVPEwKgK2DQSrB0OTRLf9N1yGBS5x9FxAY_gfo2BJ20,1177
 datapipeline/utils/time.py,sha256=8E-vjUV4EnHVmhAjMozaRRD9WAf9C3sCGYsYmHczfa8,1009
-jerry_thomas-0.0.2.dist-info/licenses/LICENSE,sha256=pkBMylAJF5yChHAkdxwFhEptLGx13i-XFEKh-Sh6DkM,1073
-jerry_thomas-0.0.2.dist-info/METADATA,sha256=4GkP5xKR5J8MCdotpJbL9URJk2c0-jzrZIKInddL4oU,13838
-jerry_thomas-0.0.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-jerry_thomas-0.0.2.dist-info/entry_points.txt,sha256=2Lvi6aWL4MZKmIU8gzd9F-AAgzYPxi6ePcpPDSynrm0,1478
-jerry_thomas-0.0.2.dist-info/top_level.txt,sha256=N8aoNPdPyHefODO4YAm7tqTaUcw0e8LDcqycFTf8TbM,13
-jerry_thomas-0.0.2.dist-info/RECORD,,
+jerry_thomas-0.0.5.dist-info/licenses/LICENSE,sha256=pkBMylAJF5yChHAkdxwFhEptLGx13i-XFEKh-Sh6DkM,1073
+jerry_thomas-0.0.5.dist-info/METADATA,sha256=ONeBhBFGHp3O7669WEjhkF_zKkZ7Rl8ELBAETAdp_vU,13876
+jerry_thomas-0.0.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+jerry_thomas-0.0.5.dist-info/entry_points.txt,sha256=z-idrww2BTME9Flc7URKhSYm3mWxE46wOw5Cfpjr-hw,1659
+jerry_thomas-0.0.5.dist-info/top_level.txt,sha256=N8aoNPdPyHefODO4YAm7tqTaUcw0e8LDcqycFTf8TbM,13
+jerry_thomas-0.0.5.dist-info/RECORD,,

{jerry_thomas-0.0.2.dist-info → jerry_thomas-0.0.5.dist-info}/entry_points.txt RENAMED Viewed

@@ -34,7 +34,11 @@ identity = datapipeline.parsers.identity:IdentityParser
 synthetic.time = datapipeline.sources.synthetic.time.parser:TimeRowParser
 [datapipeline.transforms]
+drop_missing = datapipeline.transforms.transforms:drop_missing_values
 time_lag = datapipeline.transforms.transforms:time_lag
+[datapipeline.transforms.feature]
+standard_scale = datapipeline.transforms.transforms:StandardScalerTransform
 [datapipeline.transforms.sequence]
 time_window = datapipeline.transforms.sequence:TimeWindowTransformer

{jerry_thomas-0.0.2.dist-info → jerry_thomas-0.0.5.dist-info}/WHEEL RENAMED Viewed

File without changes

{jerry_thomas-0.0.2.dist-info → jerry_thomas-0.0.5.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{jerry_thomas-0.0.2.dist-info → jerry_thomas-0.0.5.dist-info}/top_level.txt RENAMED Viewed

File without changes

jerry-thomas 0.0.2__py3-none-any.whl → 0.0.5__py3-none-any.whl

jerry-thomas 0.0.2__py3-none-any.whl → 0.0.5__py3-none-any.whl