jerry-thomas 0.0.2__tar.gz → 0.0.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106) hide show
  1. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/PKG-INFO +3 -2
  2. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/pyproject.toml +11 -3
  3. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/mappers/synthetic/time.py +1 -2
  4. jerry_thomas-0.0.5/src/datapipeline/transforms/transforms.py +150 -0
  5. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/jerry_thomas.egg-info/PKG-INFO +3 -2
  6. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/jerry_thomas.egg-info/SOURCES.txt +2 -1
  7. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/jerry_thomas.egg-info/entry_points.txt +4 -0
  8. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/jerry_thomas.egg-info/requires.txt +1 -0
  9. jerry_thomas-0.0.5/tests/test_transforms.py +76 -0
  10. jerry_thomas-0.0.2/src/datapipeline/transforms/transforms.py +0 -15
  11. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/LICENSE +0 -0
  12. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/README.md +0 -0
  13. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/setup.cfg +0 -0
  14. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/__init__.py +0 -0
  15. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/analysis/__init__.py +0 -0
  16. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/analysis/vector_analyzer.py +0 -0
  17. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/cli/app.py +0 -0
  18. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/cli/commands/analyze.py +0 -0
  19. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/cli/commands/domain.py +0 -0
  20. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/cli/commands/filter.py +0 -0
  21. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/cli/commands/link.py +0 -0
  22. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/cli/commands/list_.py +0 -0
  23. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/cli/commands/plugin.py +0 -0
  24. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/cli/commands/run.py +0 -0
  25. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/cli/commands/source.py +0 -0
  26. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/cli/openers.py +0 -0
  27. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/cli/visuals.py +0 -0
  28. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/common/__init__.py +0 -0
  29. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/common/geo.py +0 -0
  30. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/config/__init__.py +0 -0
  31. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/config/catalog.py +0 -0
  32. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/config/dataset/dataset.py +0 -0
  33. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/config/dataset/feature.py +0 -0
  34. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/config/dataset/group_by.py +0 -0
  35. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/config/dataset/loader.py +0 -0
  36. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/config/dataset/normalize.py +0 -0
  37. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/config/project.py +0 -0
  38. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/domain/__init__.py +0 -0
  39. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/domain/feature.py +0 -0
  40. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/domain/record.py +0 -0
  41. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/domain/vector.py +0 -0
  42. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/filters/filters.py +0 -0
  43. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/mappers/noop.py +0 -0
  44. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/parsers/identity.py +0 -0
  45. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/pipeline/__init__.py +0 -0
  46. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/pipeline/pipelines.py +0 -0
  47. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/pipeline/stages.py +0 -0
  48. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/pipeline/utils/keygen.py +0 -0
  49. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/pipeline/utils/memory_sort.py +0 -0
  50. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/pipeline/utils/ordering.py +0 -0
  51. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/pipeline/utils/transform_utils.py +0 -0
  52. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/plugins.py +0 -0
  53. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/services/bootstrap.py +0 -0
  54. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/services/constants.py +0 -0
  55. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/services/entrypoints.py +0 -0
  56. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/services/factories.py +0 -0
  57. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/services/paths.py +0 -0
  58. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/services/project_paths.py +0 -0
  59. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/services/scaffold/__init__.py +0 -0
  60. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/services/scaffold/domain.py +0 -0
  61. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/services/scaffold/filter.py +0 -0
  62. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/services/scaffold/mappers.py +0 -0
  63. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/services/scaffold/plugin.py +0 -0
  64. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/services/scaffold/source.py +0 -0
  65. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/services/scaffold/templates.py +0 -0
  66. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/sources/__init__.py +0 -0
  67. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/sources/composed_loader.py +0 -0
  68. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/sources/decoders.py +0 -0
  69. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/sources/factory.py +0 -0
  70. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/sources/models/__init__.py +0 -0
  71. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/sources/models/base.py +0 -0
  72. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/sources/models/generator.py +0 -0
  73. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/sources/models/loader.py +0 -0
  74. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/sources/models/parser.py +0 -0
  75. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/sources/models/source.py +0 -0
  76. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/sources/models/synthetic.py +0 -0
  77. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/sources/synthetic/__init__.py +0 -0
  78. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/sources/synthetic/time/__init__.py +0 -0
  79. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/sources/synthetic/time/loader.py +0 -0
  80. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/sources/synthetic/time/parser.py +0 -0
  81. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/sources/transports.py +0 -0
  82. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/streams/canonical.py +0 -0
  83. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/streams/raw.py +0 -0
  84. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/templates/plugin_skeleton/README.md +0 -0
  85. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.yaml +0 -0
  86. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/templates/plugin_skeleton/config/contracts/time_linear.yaml +0 -0
  87. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/templates/plugin_skeleton/config/contracts/time_ticks.yaml +0 -0
  88. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/templates/plugin_skeleton/config/distilleries/time_ticks.yaml +0 -0
  89. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/templates/plugin_skeleton/config/project.yaml +0 -0
  90. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/templates/plugin_skeleton/config/recipe.yaml +0 -0
  91. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/templates/plugin_skeleton/pyproject.toml +0 -0
  92. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/__init__.py +0 -0
  93. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/templates/stubs/dto.py.j2 +0 -0
  94. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/templates/stubs/filter.py.j2 +0 -0
  95. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/templates/stubs/loader_synthetic.py.j2 +0 -0
  96. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/templates/stubs/mapper.py.j2 +0 -0
  97. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/templates/stubs/parser.py.j2 +0 -0
  98. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/templates/stubs/parser_custom.py.j2 +0 -0
  99. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/templates/stubs/record.py.j2 +0 -0
  100. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/templates/stubs/source.yaml.j2 +0 -0
  101. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/transforms/sequence.py +0 -0
  102. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/utils/__init__.py +0 -0
  103. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/utils/load.py +0 -0
  104. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/datapipeline/utils/time.py +0 -0
  105. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/jerry_thomas.egg-info/dependency_links.txt +0 -0
  106. {jerry_thomas-0.0.2 → jerry_thomas-0.0.5}/src/jerry_thomas.egg-info/top_level.txt +0 -0
@@ -1,8 +1,8 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: jerry-thomas
3
- Version: 0.0.2
3
+ Version: 0.0.5
4
4
  Summary: Jerry-Thomas: a stream-first, plugin-friendly data pipeline (mixology-themed CLI)
5
- Author: Your Name
5
+ Author: Anders Skott Lind
6
6
  License: MIT
7
7
  Requires-Python: >=3.9
8
8
  Description-Content-Type: text/markdown
@@ -12,6 +12,7 @@ Requires-Dist: pydantic>=1.8
12
12
  Requires-Dist: PyYAML>=5.4
13
13
  Requires-Dist: tqdm>=4.0
14
14
  Requires-Dist: jinja2>=3.0
15
+ Requires-Dist: setuptools>=70
15
16
  Dynamic: license-file
16
17
 
17
18
  # Jerry Thomas
@@ -4,18 +4,19 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "jerry-thomas"
7
- version = "0.0.2"
7
+ version = "0.0.5"
8
8
  description = "Jerry-Thomas: a stream-first, plugin-friendly data pipeline (mixology-themed CLI)"
9
9
  readme = { file = "README.md", content-type = "text/markdown" }
10
10
  requires-python = ">=3.9"
11
11
  license = { text = "MIT" }
12
- authors = [{ name = "Your Name" }]
12
+ authors = [{ name = "Anders Skott Lind" }]
13
13
  dependencies = [
14
14
  "numpy>=1.24,<3.0",
15
15
  "pydantic>=1.8",
16
16
  "PyYAML>=5.4",
17
17
  "tqdm>=4.0",
18
18
  "jinja2>=3.0",
19
+ "setuptools>=70",
19
20
  ]
20
21
 
21
22
  [project.scripts]
@@ -67,6 +68,10 @@ greater_than_or_equal = "datapipeline.filters.filters:ge"
67
68
 
68
69
  [project.entry-points."datapipeline.transforms"]
69
70
  time_lag = "datapipeline.transforms.transforms:time_lag"
71
+ drop_missing = "datapipeline.transforms.transforms:drop_missing_values"
72
+
73
+ [project.entry-points."datapipeline.transforms.feature"]
74
+ standard_scale = "datapipeline.transforms.transforms:StandardScalerTransform"
70
75
 
71
76
  [project.entry-points."datapipeline.mappers"]
72
77
  "time.synthetic" = "datapipeline.mappers.noop:identity"
@@ -80,5 +85,8 @@ time_window = "datapipeline.transforms.sequence:TimeWindowTransformer"
80
85
  "composed.loader" = "datapipeline.sources.factory:build_loader"
81
86
 
82
87
  [project.entry-points."datapipeline.parsers"]
83
- "synthetic.time" = "datapipeline.sources.synthetic.time.parser:TimeRowParser"
88
+ "synthetic.time" = "datapipeline.sources.synthetic.time.parser:TimeRowParser"
84
89
  identity = "datapipeline.parsers.identity:IdentityParser"
90
+
91
+ [tool.pytest.ini_options]
92
+ pythonpath = ["src"]
@@ -12,8 +12,7 @@ def encode(stream: Iterator[TimeFeatureRecord], mode: str) -> Iterator[TimeFeatu
12
12
  elif mode == "weekday_sin":
13
13
  val = sin(2 * pi * t.weekday() / 7)
14
14
  elif mode == "linear":
15
- start = t.replace(hour=0, minute=0, second=0, microsecond=0)
16
- val = (t - start).total_seconds()
15
+ val = t.timestamp()
17
16
  else:
18
17
  raise ValueError(f"Unsupported encode_time mode: {mode}")
19
18
  yield TimeFeatureRecord(time=rec.time, value=val)
@@ -0,0 +1,150 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import is_dataclass, replace
4
+ from datetime import timedelta
5
+ from itertools import groupby
6
+ from math import sqrt
7
+ from numbers import Real
8
+ from typing import Any, Iterator, Mapping, MutableMapping
9
+
10
+ from datapipeline.domain.feature import FeatureRecord
11
+ from datapipeline.domain.record import Record, TimeFeatureRecord
12
+ from datapipeline.utils.time import parse_timecode
13
+
14
+
15
def _get_field(record: Any, field: str, default: Any = None) -> Any:
    """Look up *field* on *record*, supporting both mappings and objects."""

    if not isinstance(record, Mapping):
        return getattr(record, field, default)
    return record.get(field, default)
21
+
22
+
23
def _is_missing(value: Any) -> bool:
    """Return True when *value* should be treated as a missing observation.

    ``None`` and any NaN real number count as missing; everything else —
    including 0, 0.0 and "" — is a present value.
    """

    if value is None:
        return True
    # NaN is the only value unequal to itself; checking via ``numbers.Real``
    # covers float and any other registered real type without importing numpy.
    # (The previous separate float branch and try/except were redundant:
    # float is a Real subclass and isinstance against an ABC does not raise.)
    if isinstance(value, Real):
        return value != value
    return False
36
+
37
+
38
def _clone_with_value(record: Any, value: float) -> Any:
    """Return a shallow copy of *record* whose ``value`` field is *value*."""

    if isinstance(record, list):
        raise TypeError(
            "StandardScalerTransform does not support sequence FeatureRecord payloads."
        )

    if isinstance(record, Mapping):
        duplicate: MutableMapping[str, Any] = type(record)(record)
        duplicate["value"] = value
        return duplicate

    if not hasattr(record, "value"):
        raise TypeError(f"Cannot replace value on record type: {type(record)!r}")

    if is_dataclass(record):
        return replace(record, value=value)
    duplicate = type(record)(**record.__dict__)
    duplicate.value = value
    return duplicate
59
+
60
+
61
def shift_record_time(record: TimeFeatureRecord, lag: timedelta) -> TimeFeatureRecord:
    """Move *record* back in time by *lag*, mutating it in place, and return it."""

    record.time -= lag
    return record
64
+
65
+
66
def time_lag(stream: Iterator[TimeFeatureRecord], lag: str) -> Iterator[TimeFeatureRecord]:
    """Lag every record in *stream* by the timecode string *lag* (in place)."""

    delta = parse_timecode(lag)
    yield from (shift_record_time(record, delta) for record in stream)
70
+
71
+
72
def drop_missing_values(
    stream: Iterator[Any],
    field: str = "value",
) -> Iterator[Any]:
    """Yield only the records of *stream* whose *field* holds a present value."""

    yield from (
        record
        for record in stream
        if not _is_missing(_get_field(record, field))
    )
83
+
84
+
85
class StandardScalerTransform:
    """Standardize feature values to zero mean and unit variance per feature id.

    The stream is grouped with ``itertools.groupby`` on ``feature_id``, so
    records for the same feature must arrive consecutively (a pre-sorted or
    pre-grouped stream); otherwise each contiguous run is scaled with its own
    statistics.
    """

    def __init__(
        self,
        *,
        with_mean: bool = True,
        with_std: bool = True,
        epsilon: float = 1e-12,
        statistics: Mapping[str, Mapping[str, float]] | None = None,
    ) -> None:
        # with_mean / with_std toggle centering and scaling independently.
        self.with_mean = with_mean
        self.with_std = with_std
        # Lower bound applied to std at use time, preventing division by zero
        # for constant-valued features.
        self.epsilon = epsilon
        # Pre-supplied per-feature {"mean": ..., "std": ...} overrides; when a
        # feature id is present here, batch statistics are not computed for it.
        self.statistics = dict(statistics or {})
        # Statistics computed from the stream, keyed by feature id. Only
        # populated for features NOT covered by ``statistics``, and stores the
        # un-clamped std (see _resolve_stats).
        self.stats_: dict[str, dict[str, float]] = {}

    def _resolve_stats(
        self, feature_id: str, values: list[float]
    ) -> tuple[float, float]:
        """Return the (mean, std) to apply for *feature_id* given batch *values*.

        Provided ``statistics`` take precedence; otherwise population mean/std
        are computed from *values* and cached in ``stats_``. The returned std
        is clamped to ``epsilon`` when scaling is enabled.
        """
        if feature_id in self.statistics:
            stats = self.statistics[feature_id]
            mean = float(stats.get("mean", 0.0))
            std = float(stats.get("std", 1.0))
        else:
            mean = sum(values) / len(values) if self.with_mean else 0.0
            if self.with_std:
                # Population variance (divides by N, not N-1).
                variance = sum((v - mean) ** 2 for v in values) / len(values)
                std = sqrt(variance)
            else:
                std = 1.0
            self.stats_[feature_id] = {
                "mean": mean if self.with_mean else 0.0,
                "std": std if self.with_std else 1.0,
            }
        if self.with_std:
            # Clamp after caching so stats_ reflects the true batch std.
            std = max(std, self.epsilon)
        else:
            std = 1.0
        return (mean if self.with_mean else 0.0, std)

    def _extract_value(self, record: Record) -> float:
        """Pull the numeric ``value`` field out of *record* as a float.

        Raises TypeError when the field is absent or non-numeric.
        """
        value = _get_field(record, "value")
        if isinstance(value, Real):
            return float(value)
        raise TypeError(f"Record value must be numeric, got {value!r}")

    def apply(self, stream: Iterator[FeatureRecord]) -> Iterator[FeatureRecord]:
        """Yield copies of *stream* records with standardized values.

        Each feature's run of records is materialized to compute statistics,
        so memory use scales with the largest contiguous per-feature group.
        Records themselves are not mutated; cloned copies are yielded.
        """
        grouped = groupby(stream, key=lambda fr: fr.feature_id)
        for feature_id, records in grouped:
            bucket = list(records)
            if not bucket:
                continue
            values = [self._extract_value(fr.record) for fr in bucket]
            mean, std = self._resolve_stats(feature_id, values)
            for fr, raw in zip(bucket, values):
                normalized = raw
                if self.with_mean:
                    normalized -= mean
                if self.with_std:
                    normalized /= std
                yield FeatureRecord(
                    record=_clone_with_value(fr.record, normalized),
                    feature_id=fr.feature_id,
                    group_key=fr.group_key,
                )
@@ -1,8 +1,8 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: jerry-thomas
3
- Version: 0.0.2
3
+ Version: 0.0.5
4
4
  Summary: Jerry-Thomas: a stream-first, plugin-friendly data pipeline (mixology-themed CLI)
5
- Author: Your Name
5
+ Author: Anders Skott Lind
6
6
  License: MIT
7
7
  Requires-Python: >=3.9
8
8
  Description-Content-Type: text/markdown
@@ -12,6 +12,7 @@ Requires-Dist: pydantic>=1.8
12
12
  Requires-Dist: PyYAML>=5.4
13
13
  Requires-Dist: tqdm>=4.0
14
14
  Requires-Dist: jinja2>=3.0
15
+ Requires-Dist: setuptools>=70
15
16
  Dynamic: license-file
16
17
 
17
18
  # Jerry Thomas
@@ -99,4 +99,5 @@ src/jerry_thomas.egg-info/SOURCES.txt
99
99
  src/jerry_thomas.egg-info/dependency_links.txt
100
100
  src/jerry_thomas.egg-info/entry_points.txt
101
101
  src/jerry_thomas.egg-info/requires.txt
102
- src/jerry_thomas.egg-info/top_level.txt
102
+ src/jerry_thomas.egg-info/top_level.txt
103
+ tests/test_transforms.py
@@ -34,7 +34,11 @@ identity = datapipeline.parsers.identity:IdentityParser
34
34
  synthetic.time = datapipeline.sources.synthetic.time.parser:TimeRowParser
35
35
 
36
36
  [datapipeline.transforms]
37
+ drop_missing = datapipeline.transforms.transforms:drop_missing_values
37
38
  time_lag = datapipeline.transforms.transforms:time_lag
38
39
 
40
+ [datapipeline.transforms.feature]
41
+ standard_scale = datapipeline.transforms.transforms:StandardScalerTransform
42
+
39
43
  [datapipeline.transforms.sequence]
40
44
  time_window = datapipeline.transforms.sequence:TimeWindowTransformer
@@ -3,3 +3,4 @@ pydantic>=1.8
3
3
  PyYAML>=5.4
4
4
  tqdm>=4.0
5
5
  jinja2>=3.0
6
+ setuptools>=70
@@ -0,0 +1,76 @@
1
+ from __future__ import annotations
2
+
3
+ from datetime import datetime, timezone
4
+ from math import isclose
5
+
6
+ from datapipeline.domain.feature import FeatureRecord
7
+ from datapipeline.domain.record import TimeFeatureRecord
8
+ from datapipeline.transforms.transforms import (
9
+ StandardScalerTransform,
10
+ drop_missing_values,
11
+ )
12
+
13
+
14
def _make_time_record(value: float, hour: int) -> TimeFeatureRecord:
    """Build a UTC TimeFeatureRecord at the given *hour* on 2024-01-01."""

    stamp = datetime(2024, 1, 1, hour=hour, tzinfo=timezone.utc)
    return TimeFeatureRecord(time=stamp, value=value)
19
+
20
+
21
def _make_feature_record(value: float, hour: int, feature_id: str) -> FeatureRecord:
    """Wrap a synthetic time record in a FeatureRecord keyed by *feature_id*."""

    payload = _make_time_record(value, hour)
    return FeatureRecord(record=payload, feature_id=feature_id, group_key=(hour,))
27
+
28
+
29
def test_drop_missing_values_filters_none_and_nan():
    """drop_missing_values removes NaN payloads but keeps falsy-but-valid 0.0."""
    # NOTE(review): despite the test name, no ``None``-valued record is
    # exercised here — confirm whether TimeFeatureRecord accepts value=None
    # before adding such a fixture.
    records = [
        _make_time_record(1.0, 1),
        _make_time_record(float("nan"), 2),
        _make_time_record(3.0, 3),
        _make_time_record(0.0, 4),
    ]

    kept = [rec.value for rec in drop_missing_values(iter(records))]

    assert kept == [1.0, 3.0, 0.0]
42
+
43
+
44
def test_standard_scaler_normalizes_feature_stream():
    """Values [1, 2, 3] standardize to ~[-1.2247, 0, 1.2247]; mean 2 is cached."""
    records = [
        _make_feature_record(float(v), hour, "radiation")
        for hour, v in enumerate((1, 2, 3))
    ]
    scaler = StandardScalerTransform()

    transformed = list(scaler.apply(iter(records)))

    observed = [fr.record.value for fr in transformed]
    expected = [-1.22474487, 0.0, 1.22474487]
    assert all(
        isclose(got, want, rel_tol=1e-6) for got, want in zip(observed, expected)
    )
    assert isclose(scaler.stats_["radiation"]["mean"], 2.0, rel_tol=1e-6)
61
+
62
+
63
def test_standard_scaler_uses_provided_statistics():
    """Externally supplied mean/std override the batch statistics."""
    scaler = StandardScalerTransform(
        statistics={"temperature": {"mean": 5.0, "std": 5.0}}
    )
    records = [
        _make_feature_record(10.0, 0, "temperature"),
        _make_feature_record(11.0, 1, "temperature"),
    ]

    transformed = scaler.apply(iter(records))

    assert [fr.record.value for fr in transformed] == [1.0, 1.2]
@@ -1,15 +0,0 @@
1
- from datapipeline.domain.record import TimeFeatureRecord
2
- from datetime import timedelta
3
- from typing import Iterator
4
- from datapipeline.utils.time import parse_timecode
5
-
6
-
7
- def shift_record_time(record: TimeFeatureRecord, lag: timedelta) -> TimeFeatureRecord:
8
- record.time = record.time - lag
9
- return record
10
-
11
-
12
- def time_lag(stream: Iterator[TimeFeatureRecord], lag: str) -> Iterator[TimeFeatureRecord]:
13
- lag_td = parse_timecode(lag)
14
- for record in stream:
15
- yield shift_record_time(record, lag_td)
File without changes
File without changes
File without changes