jerry-thomas 0.3.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Files changed (164)
  1. datapipeline/analysis/vector/collector.py +120 -17
  2. datapipeline/analysis/vector/matrix.py +33 -8
  3. datapipeline/analysis/vector/report.py +162 -32
  4. datapipeline/build/tasks/__init__.py +11 -0
  5. datapipeline/build/tasks/config.py +74 -0
  6. datapipeline/build/tasks/metadata.py +170 -0
  7. datapipeline/build/tasks/scaler.py +73 -0
  8. datapipeline/build/tasks/schema.py +60 -0
  9. datapipeline/build/tasks/utils.py +169 -0
  10. datapipeline/cli/app.py +304 -127
  11. datapipeline/cli/commands/build.py +240 -16
  12. datapipeline/cli/commands/contract.py +367 -0
  13. datapipeline/cli/commands/domain.py +8 -3
  14. datapipeline/cli/commands/inspect.py +401 -149
  15. datapipeline/cli/commands/list_.py +30 -7
  16. datapipeline/cli/commands/plugin.py +1 -1
  17. datapipeline/cli/commands/run.py +227 -241
  18. datapipeline/cli/commands/run_config.py +101 -0
  19. datapipeline/cli/commands/serve_pipeline.py +156 -0
  20. datapipeline/cli/commands/source.py +44 -8
  21. datapipeline/cli/visuals/__init__.py +4 -2
  22. datapipeline/cli/visuals/common.py +239 -0
  23. datapipeline/cli/visuals/labels.py +15 -15
  24. datapipeline/cli/visuals/runner.py +66 -0
  25. datapipeline/cli/visuals/sections.py +20 -0
  26. datapipeline/cli/visuals/sources.py +132 -119
  27. datapipeline/cli/visuals/sources_basic.py +260 -0
  28. datapipeline/cli/visuals/sources_off.py +76 -0
  29. datapipeline/cli/visuals/sources_rich.py +414 -0
  30. datapipeline/config/catalog.py +37 -3
  31. datapipeline/config/context.py +214 -0
  32. datapipeline/config/dataset/loader.py +21 -4
  33. datapipeline/config/dataset/normalize.py +4 -4
  34. datapipeline/config/metadata.py +43 -0
  35. datapipeline/config/postprocess.py +2 -2
  36. datapipeline/config/project.py +3 -2
  37. datapipeline/config/resolution.py +129 -0
  38. datapipeline/config/tasks.py +309 -0
  39. datapipeline/config/workspace.py +155 -0
  40. datapipeline/domain/__init__.py +12 -0
  41. datapipeline/domain/record.py +11 -0
  42. datapipeline/domain/sample.py +54 -0
  43. datapipeline/integrations/ml/adapter.py +34 -20
  44. datapipeline/integrations/ml/pandas_support.py +0 -2
  45. datapipeline/integrations/ml/rows.py +1 -6
  46. datapipeline/integrations/ml/torch_support.py +1 -3
  47. datapipeline/io/factory.py +112 -0
  48. datapipeline/io/output.py +132 -0
  49. datapipeline/io/protocols.py +21 -0
  50. datapipeline/io/serializers.py +219 -0
  51. datapipeline/io/sinks/__init__.py +23 -0
  52. datapipeline/io/sinks/base.py +2 -0
  53. datapipeline/io/sinks/files.py +79 -0
  54. datapipeline/io/sinks/rich.py +57 -0
  55. datapipeline/io/sinks/stdout.py +18 -0
  56. datapipeline/io/writers/__init__.py +14 -0
  57. datapipeline/io/writers/base.py +28 -0
  58. datapipeline/io/writers/csv_writer.py +25 -0
  59. datapipeline/io/writers/jsonl.py +52 -0
  60. datapipeline/io/writers/pickle_writer.py +30 -0
  61. datapipeline/pipeline/artifacts.py +58 -0
  62. datapipeline/pipeline/context.py +66 -7
  63. datapipeline/pipeline/observability.py +65 -0
  64. datapipeline/pipeline/pipelines.py +65 -13
  65. datapipeline/pipeline/split.py +11 -10
  66. datapipeline/pipeline/stages.py +127 -16
  67. datapipeline/pipeline/utils/keygen.py +20 -7
  68. datapipeline/pipeline/utils/memory_sort.py +22 -10
  69. datapipeline/pipeline/utils/transform_utils.py +22 -0
  70. datapipeline/runtime.py +5 -2
  71. datapipeline/services/artifacts.py +12 -6
  72. datapipeline/services/bootstrap/config.py +25 -0
  73. datapipeline/services/bootstrap/core.py +52 -37
  74. datapipeline/services/constants.py +6 -5
  75. datapipeline/services/factories.py +123 -1
  76. datapipeline/services/project_paths.py +43 -16
  77. datapipeline/services/runs.py +208 -0
  78. datapipeline/services/scaffold/domain.py +3 -2
  79. datapipeline/services/scaffold/filter.py +3 -2
  80. datapipeline/services/scaffold/mappers.py +9 -6
  81. datapipeline/services/scaffold/plugin.py +3 -3
  82. datapipeline/services/scaffold/source.py +93 -56
  83. datapipeline/sources/{composed_loader.py → data_loader.py} +9 -9
  84. datapipeline/sources/decoders.py +83 -18
  85. datapipeline/sources/factory.py +26 -16
  86. datapipeline/sources/models/__init__.py +2 -2
  87. datapipeline/sources/models/generator.py +0 -7
  88. datapipeline/sources/models/loader.py +3 -3
  89. datapipeline/sources/models/parsing_error.py +24 -0
  90. datapipeline/sources/models/source.py +6 -6
  91. datapipeline/sources/synthetic/time/loader.py +14 -2
  92. datapipeline/sources/transports.py +74 -37
  93. datapipeline/templates/plugin_skeleton/README.md +74 -30
  94. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +31 -0
  95. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +30 -0
  96. datapipeline/templates/plugin_skeleton/example/dataset.yaml +18 -0
  97. datapipeline/templates/plugin_skeleton/example/postprocess.yaml +29 -0
  98. datapipeline/templates/plugin_skeleton/{config/datasets/default → example}/project.yaml +11 -8
  99. datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +12 -0
  100. datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +3 -0
  101. datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +9 -0
  102. datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +2 -0
  103. datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +4 -0
  104. datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +28 -0
  105. datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +4 -0
  106. datapipeline/templates/plugin_skeleton/jerry.yaml +28 -0
  107. datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +31 -0
  108. datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +30 -0
  109. datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +18 -0
  110. datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +29 -0
  111. datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +22 -0
  112. datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +12 -0
  113. datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +3 -0
  114. datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +9 -0
  115. datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +2 -0
  116. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +4 -0
  117. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +28 -0
  118. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +4 -0
  119. datapipeline/templates/stubs/dto.py.j2 +2 -0
  120. datapipeline/templates/stubs/mapper.py.j2 +5 -4
  121. datapipeline/templates/stubs/parser.py.j2 +2 -0
  122. datapipeline/templates/stubs/record.py.j2 +2 -0
  123. datapipeline/templates/stubs/source.yaml.j2 +2 -3
  124. datapipeline/transforms/debug/lint.py +26 -41
  125. datapipeline/transforms/feature/scaler.py +89 -13
  126. datapipeline/transforms/record/floor_time.py +4 -4
  127. datapipeline/transforms/sequence.py +2 -35
  128. datapipeline/transforms/stream/dedupe.py +24 -0
  129. datapipeline/transforms/stream/ensure_ticks.py +7 -6
  130. datapipeline/transforms/vector/__init__.py +5 -0
  131. datapipeline/transforms/vector/common.py +98 -0
  132. datapipeline/transforms/vector/drop/__init__.py +4 -0
  133. datapipeline/transforms/vector/drop/horizontal.py +79 -0
  134. datapipeline/transforms/vector/drop/orchestrator.py +59 -0
  135. datapipeline/transforms/vector/drop/vertical.py +182 -0
  136. datapipeline/transforms/vector/ensure_schema.py +184 -0
  137. datapipeline/transforms/vector/fill.py +87 -0
  138. datapipeline/transforms/vector/replace.py +62 -0
  139. datapipeline/utils/load.py +24 -3
  140. datapipeline/utils/rich_compat.py +38 -0
  141. datapipeline/utils/window.py +76 -0
  142. jerry_thomas-1.0.0.dist-info/METADATA +825 -0
  143. jerry_thomas-1.0.0.dist-info/RECORD +199 -0
  144. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/entry_points.txt +9 -8
  145. datapipeline/build/tasks.py +0 -186
  146. datapipeline/cli/commands/link.py +0 -128
  147. datapipeline/cli/commands/writers.py +0 -138
  148. datapipeline/config/build.py +0 -64
  149. datapipeline/config/run.py +0 -116
  150. datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.synthetic.yaml +0 -24
  151. datapipeline/templates/plugin_skeleton/config/contracts/time_linear.synthetic.yaml +0 -23
  152. datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +0 -9
  153. datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +0 -14
  154. datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +0 -13
  155. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +0 -10
  156. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +0 -10
  157. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +0 -10
  158. datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml +0 -11
  159. datapipeline/transforms/vector.py +0 -210
  160. jerry_thomas-0.3.0.dist-info/METADATA +0 -502
  161. jerry_thomas-0.3.0.dist-info/RECORD +0 -139
  162. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/WHEEL +0 -0
  163. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/licenses/LICENSE +0 -0
  164. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/top_level.txt +0 -0
datapipeline/cli/commands/writers.py
@@ -1,138 +0,0 @@
- from typing import Protocol, Callable, Optional
- from pathlib import Path
- import sys
- import json
- import pickle
- import tempfile
- import os
- import csv
- import gzip
-
-
- class Writer(Protocol):
-     def write(self, rec: dict) -> None: ...
-     def close(self) -> None: ...
-
-
- class TextLineWriter:
-     def __init__(self, formatter: Callable[[dict], str], stream=None):
-         self.formatter = formatter
-         self.stream = stream or sys.stdout
-
-     def write(self, rec: dict) -> None:
-         print(self.formatter(rec), file=self.stream)
-
-     def close(self) -> None:
-         self.stream.flush()
-
-
- def JsonLinesWriter():
-     return TextLineWriter(lambda rec: json.dumps(rec, default=str))
-
-
- def PrintWriter():
-     return TextLineWriter(lambda rec: f"group={rec['key']}: {rec['values']}")
-
-
- class PickleWriter:
-     def __init__(self, destination: Path, protocol: int = pickle.HIGHEST_PROTOCOL):
-         self.dest = destination
-         self.protocol = protocol
-         self.tmp_path: Optional[Path] = None
-         self._fh = None
-         self._pickler = None
-         self._open_tmp()
-
-     def _open_tmp(self):
-         self.dest.parent.mkdir(parents=True, exist_ok=True)
-         tmp = tempfile.NamedTemporaryFile(
-             dir=str(self.dest.parent), delete=False)
-         self.tmp_path = Path(tmp.name)
-         self._fh = tmp
-         self._pickler = pickle.Pickler(self._fh, protocol=self.protocol)
-
-     def write(self, rec: dict) -> None:
-         self._pickler.dump((rec["key"], rec["values"]))
-
-     def close(self) -> None:
-         self._fh.close()
-         os.replace(self.tmp_path, self.dest)
-
-
- class CSVWriter:
-     def __init__(self, destination: Path):
-         self.dest = destination
-         self.tmp_path: Optional[Path] = None
-         self._fh = None
-         self._writer = None
-         self._open_tmp()
-
-     def _open_tmp(self):
-         self.dest.parent.mkdir(parents=True, exist_ok=True)
-         tmp = tempfile.NamedTemporaryFile(
-             dir=str(self.dest.parent), delete=False, mode="w", newline="")
-         self.tmp_path = Path(tmp.name)
-         self._fh = tmp
-         self._writer = csv.writer(self._fh)
-         self._writer.writerow(["key", "values"])  # header
-
-     def _format_field(self, value):
-         if value is None:
-             return ""
-         if isinstance(value, (int, float, bool)):
-             return value
-         if isinstance(value, (bytes, bytearray)):
-             return value.decode("utf-8", errors="replace")
-         if isinstance(value, str):
-             return value
-         return str(value)
-
-     def write(self, rec: dict) -> None:
-         key = rec["key"]
-         values = rec["values"]
-         self._writer.writerow(
-             [self._format_field(key), self._format_field(values)])
-
-     def close(self) -> None:
-         self._fh.close()
-         os.replace(self.tmp_path, self.dest)
-
-
- class GzipJsonLinesWriter:
-     def __init__(self, destination: Path):
-         self.dest = destination
-         self.tmp_path: Optional[Path] = None
-         self._fh = None
-         self._open_tmp()
-
-     def _open_tmp(self):
-         self.dest.parent.mkdir(parents=True, exist_ok=True)
-         # binary write, text wrapper for newline handling
-         tmp = tempfile.NamedTemporaryFile(
-             dir=str(self.dest.parent), delete=False)
-         self.tmp_path = Path(tmp.name)
-         self._fh = gzip.GzipFile(filename="", mode="wb", fileobj=tmp)
-
-     def write(self, rec: dict) -> None:
-         line = json.dumps(rec, default=str).encode("utf-8") + b"\n"
-         self._fh.write(line)
-
-     def close(self) -> None:
-         self._fh.close()
-         os.replace(self.tmp_path, self.dest)
-
-
- def writer_factory(output: Optional[str]) -> Writer:
-     if output and output.lower().endswith(".pt"):
-         return PickleWriter(Path(output))
-     if output and output.lower().endswith(".csv"):
-         return CSVWriter(Path(output))
-     if output and (output.lower().endswith(".jsonl.gz") or output.lower().endswith(".gz")):
-         return GzipJsonLinesWriter(Path(output))
-     mode = (output or "print").lower()
-     if mode == "print":
-         return PrintWriter()
-     if mode == "stream":
-         return JsonLinesWriter()
-     print("Error: unsupported output format. Use 'print', 'stream', '.csv', '.jsonl.gz', or a .pt file path.", file=sys.stderr)
-     raise SystemExit(2)
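
For orientation, the removed module keyed every decision on the output string. The sketch below exercises that 0.3.0 API exactly as shown above; it is illustrative only (the out.csv path and the sample record are hypothetical), and in 1.0.0 this role appears to move to the new datapipeline/io package added earlier in the file list.

# Illustrative only: exercises the 0.3.0 writer_factory shown above.
from datapipeline.cli.commands.writers import writer_factory

records = [{"key": "2024-01-01T00:00", "values": {"time": 0.0}}]  # hypothetical record shape

w = writer_factory("out.csv")   # ".csv" suffix selects CSVWriter (temp file + os.replace on close)
for rec in records:
    w.write(rec)                # one row per record under the "key,values" header
w.close()

w = writer_factory(None)        # no destination falls back to PrintWriter
for rec in records:
    w.write(rec)                # prints "group=2024-01-01T00:00: {'time': 0.0}"
w.close()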
datapipeline/config/build.py
@@ -1,64 +0,0 @@
- from __future__ import annotations
-
- from pathlib import Path
-
- from pydantic import BaseModel, Field
-
- from datapipeline.services.bootstrap import _load_by_key
-
-
- class PartitionedIdsConfig(BaseModel):
-     """Configuration for writing the expected partitioned-id list."""
-
-     output: str = Field(
-         default="expected.txt",
-         description="Artifact path relative to project.paths.artifacts.",
-     )
-     include_targets: bool = Field(
-         default=False,
-         description="When true, include dataset.targets in the discovery stream.",
-     )
-
-
- class ScalerArtifactConfig(BaseModel):
-     """Configuration for computing standard-scaler statistics."""
-
-     enabled: bool = Field(
-         default=True,
-         description="Disable to skip generating the scaler statistics artifact.",
-     )
-     output: str = Field(
-         default="scaler.pkl",
-         description="Artifact path relative to project.paths.artifacts.",
-     )
-     include_targets: bool = Field(
-         default=False,
-         description="Include dataset.targets when fitting scaler statistics.",
-     )
-     split_label: str = Field(
-         default="train",
-         description="Split label to use when fitting scaler statistics.",
-     )
-
-
- class BuildConfig(BaseModel):
-     """Top-level build configuration describing materialized artifacts."""
-
-     version: int = 1
-     partitioned_ids: PartitionedIdsConfig = Field(
-         default_factory=PartitionedIdsConfig,
-         description="Partitioned-id task settings.",
-     )
-     scaler: ScalerArtifactConfig = Field(
-         default_factory=ScalerArtifactConfig,
-         description="Standard-scaler statistics artifact settings.",
-     )
-
-
- def load_build_config(project_yaml: Path) -> BuildConfig:
-     """Load build.yaml referenced by project.paths.build and validate it."""
-
-     doc = _load_by_key(project_yaml, "build")
-     if not isinstance(doc, dict):
-         raise TypeError("build.yaml must define a mapping at the top level.")
-     return BuildConfig.model_validate(doc)
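
The removed BuildConfig mirrors the config/datasets/default/build.yaml deleted later in this diff. The sketch below shows how the 0.3.0 model validated such a document; the dict literal is a hypothetical example, not taken from the package.

# Illustrative only: validates a build.yaml-shaped dict against the 0.3.0 BuildConfig above.
from datapipeline.config.build import BuildConfig

doc = {
    "version": 1,
    "partitioned_ids": {"output": "expected.txt", "include_targets": False},
    "scaler": {"enabled": True, "output": "scaler.pkl", "split_label": "train"},
}

cfg = BuildConfig.model_validate(doc)
print(cfg.partitioned_ids.output)   # "expected.txt" (relative to project.paths.artifacts)
print(cfg.scaler.include_targets)   # False, filled in from the field default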
datapipeline/config/run.py
@@ -1,116 +0,0 @@
- from __future__ import annotations
-
- import logging
- from pathlib import Path
- from typing import List, Sequence, Tuple
-
- from pydantic import BaseModel, Field, field_validator
-
- from datapipeline.config.project import ProjectConfig
- from datapipeline.utils.load import load_yaml
-
- VALID_LOG_LEVELS = ("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG")
-
-
- class RunConfig(BaseModel):
-     """Runtime overrides applied when serving vectors."""
-
-     version: int = Field(default=1)
-     keep: str | None = Field(
-         default=None,
-         description="Active split label to serve. Null disables filtering.",
-         min_length=1,
-     )
-     output: str | None = Field(
-         default=None,
-         description="Default output destination for jerry serve (print|stream|<path>).",
-         min_length=1,
-     )
-     limit: int | None = Field(
-         default=None,
-         description="Default max number of vectors to emit during serve runs.",
-         ge=1,
-     )
-     include_targets: bool = Field(
-         default=False,
-         description="Serve dataset.targets alongside features by default.",
-     )
-     throttle_ms: float | None = Field(
-         default=None,
-         description="Milliseconds to sleep between emitted vectors (throttle).",
-         ge=0.0,
-     )
-     log_level: str | None = Field(
-         default="INFO",
-         description="Default logging level for serve runs (DEBUG, INFO, WARNING, ERROR, CRITICAL). Use null to inherit CLI.",
-     )
-
-     @field_validator("log_level")
-     @classmethod
-     def _validate_log_level(cls, value: str | None) -> str | None:
-         if value is None:
-             return None
-         name = str(value).upper()
-         if name not in VALID_LOG_LEVELS:
-             raise ValueError(
-                 f"log_level must be one of {', '.join(VALID_LOG_LEVELS)}, got {value!r}"
-             )
-         return name
-
-
- def _resolve_run_path(project_yaml: Path, run_path: str | Path) -> Path:
-     path = Path(run_path)
-     if not path.is_absolute():
-         path = project_yaml.parent / path
-     return path.resolve()
-
-
- def _list_run_paths(project_yaml: Path) -> Sequence[Path]:
-     project_data = load_yaml(project_yaml)
-     project = ProjectConfig.model_validate(project_data)
-     run_path_ref = getattr(project.paths, "run", None)
-     if not run_path_ref:
-         return []
-     run_path = _resolve_run_path(project_yaml, run_path_ref)
-     if not run_path.exists():
-         raise FileNotFoundError(f"run config not found: {run_path}")
-     if run_path.is_dir():
-         entries = sorted(
-             [
-                 p
-                 for p in run_path.iterdir()
-                 if p.is_file() and p.suffix in {".yaml", ".yml"}
-             ],
-             key=lambda p: p.name,
-         )
-         if not entries:
-             raise FileNotFoundError(f"no run configs found under {run_path}")
-         return entries
-     return [run_path]
-
-
- def _load_run_from_path(path: Path) -> RunConfig:
-     doc = load_yaml(path)
-     if not isinstance(doc, dict):
-         raise TypeError(f"{path} must define a mapping at the top level.")
-     return RunConfig.model_validate(doc)
-
-
- def load_named_run_configs(project_yaml: Path) -> List[Tuple[str, RunConfig]]:
-     """Return (name, config) pairs for every run file (directory-aware)."""
-
-     paths = _list_run_paths(project_yaml)
-     entries: List[Tuple[str, RunConfig]] = []
-     for path in paths:
-         cfg = _load_run_from_path(path)
-         entries.append((path.stem, cfg))
-     return entries
-
-
- def load_run_config(project_yaml: Path) -> RunConfig | None:
-     """Load the first run config referenced by project.paths.run, if configured."""
-
-     paths = _list_run_paths(project_yaml)
-     if not paths:
-         return None
-     return _load_run_from_path(paths[0])
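
A small sketch of the removed RunConfig's log_level handling, based only on the 0.3.0 code above; the input values are hypothetical.

# Illustrative only: the 0.3.0 RunConfig above normalises log_level via its field_validator.
from datapipeline.config.run import RunConfig

cfg = RunConfig.model_validate({"keep": "train", "limit": 100, "log_level": "debug"})
print(cfg.log_level)   # "DEBUG" -- the validator upper-cases accepted levels

try:
    RunConfig.model_validate({"log_level": "verbose"})
except ValueError as exc:   # pydantic surfaces the validator's ValueError as a ValidationError
    print(exc)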
datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.synthetic.yaml
@@ -1,24 +0,0 @@
- source_id: time_ticks
- stream_id: time_hour_sin
-
- mapper:
-   entrypoint: encode_time
-   args: { mode: hour_sin }
-
- # partition_by: field you want to partition by
-
- record:
-   - filter: { operator: ge, field: time, comparand: "${start_time}" }
-   - filter: { operator: le, field: time, comparand: "${end_time}" }
-   # - floor_time: { resolution: 1h }
-   # - lag: { lag: 1h }
-
- # stream:
- #   - ensure_ticks: { tick: 1h }
- #   - granularity: { mode: last }
- #   - fill: { statistic: median, window: 24, min_samples: 4 }
-
- # debug:
- #   - lint: { mode: warn, tick: 1h }
-
- # sort_batch_size: 100000
datapipeline/templates/plugin_skeleton/config/contracts/time_linear.synthetic.yaml
@@ -1,23 +0,0 @@
- source_id: time_ticks    # raw source alias (see config/sources)
- stream_id: time_linear   # this stream id is used by recipes
-
- mapper:                  # normalize/reshape DTO -> TemporalRecord; omit to get the identity mapper
-   entrypoint: encode_time
-   args: { mode: linear }
- # partition_by: station_id   # optional: add partition suffixes to feature ids
-
- record:                  # record-level transforms
-   - filter: { operator: ge, field: time, comparand: "${start_time}" }
-   - filter: { operator: le, field: time, comparand: "${end_time}" }
-   # - floor_time: { resolution: 10m }   # snap timestamps to resolution
-   # - lag: { lag: 10m }                 # shift timestamps backwards
-
- # stream:                # per-feature stream transforms (input sorted by id,time)
- #   - ensure_ticks: { tick: 10m }   # insert missing ticks (value=None)
- #   - granularity: { mode: first }  # aggregate duplicates within a tick
- #   - fill: { statistic: median, window: 6, min_samples: 1 }   # impute gaps
-
- # debug:                 # optional validation-only transforms
- #   - lint: { mode: warn, tick: 10m }   # flag gaps/duplicates/order issues
-
- # sort_batch_size: 100000   # in-memory chunk size used by internal sorting
datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml
@@ -1,9 +0,0 @@
- version: 1
- partitioned_ids:
-   output: expected.txt   # relative to project.paths.artifacts
-   include_targets: false   # set true to include dataset.targets
- scaler:
-   enabled: true   # disable to skip scaler statistics
-   output: scaler.pkl   # relative to project.paths.artifacts
-   include_targets: false   # include targets when fitting scaler
-   split_label: train   # label from project.globals.split to fit on
datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml
@@ -1,14 +0,0 @@
- group_by: 1h
-
- features:
-   - id: time
-     record_stream: time_hour_sin
-     # scale: { with_mean: true, with_std: true }
-     # sequence: { size: 6, stride: 1, tick: 10m }
-
-   # - id: second_feature
-   #   record_stream: anotherstream
- # targets:
- #   - id: some_target
- #     record_stream: time_linear
-
datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml
@@ -1,13 +0,0 @@
- # - drop_missing:
- #     # require these features present OR set min_coverage below
- #     required: [time]
- #     min_coverage: 1.0
- # - fill_constant:
- #     value: 0.0
- # - fill_history:
- #     statistic: median
- #     window: 48
- #     min_samples: 6
- # - fill_horizontal:
- #     statistic: mean
- #     min_samples: 2
datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml
@@ -1,10 +0,0 @@
- version: 1
- # Active split label to serve; must match a label defined in globals.split.
- # Set to null to disable filtering or override per run via CLI.
- keep: test
- # Optional defaults for jerry serve (override via CLI when needed).
- output: print   # print | stream | /path/to/test_file.pt
- limit: 100   # max vectors per serve run (null = unlimited)
- include_targets: false   # serve dataset.targets alongside features
- throttle_ms: null   # milliseconds to sleep between emitted vectors
- log_level: INFO   # DEBUG for progress bars, INFO for spinner, WARNING to keep quiet (null to inherit CLI)
datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml
@@ -1,10 +0,0 @@
- version: 1
- # Active split label to serve; must match a label defined in globals.split.
- # Set to null to disable filtering or override per run via CLI.
- keep: train
- # Optional defaults for jerry serve (override via CLI when needed).
- output: print   # print | stream | /path/to/train_file.pt
- limit: 100   # max vectors per serve run (null = unlimited)
- include_targets: false   # serve dataset.targets alongside features
- throttle_ms: null   # milliseconds to sleep between emitted vectors
- log_level: INFO   # DEBUG for progress bars, INFO for spinner, WARNING to keep quiet (null to inherit CLI)
datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml
@@ -1,10 +0,0 @@
- version: 1
- # Active split label to serve; must match a label defined in globals.split.
- # Set to null to disable filtering or override per run via CLI.
- keep: val
- # Optional defaults for jerry serve (override via CLI when needed).
- output: print   # print | stream | /path/to/val_file.pt
- limit: 100   # max vectors per serve run (null = unlimited)
- include_targets: false   # serve dataset.targets alongside features
- throttle_ms: null   # milliseconds to sleep between emitted vectors
- log_level: INFO   # DEBUG for progress bars, INFO for spinner, WARNING to keep quiet (null to inherit CLI)
datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml
@@ -1,11 +0,0 @@
- source_id: time_ticks
-
- parser:
-   entrypoint: "synthetic.time"
-   args: {}
- loader:
-   entrypoint: "synthetic.time"
-   args:
-     start: "${start_time}"
-     end: "${end_time}"
-     frequency: "1h"
datapipeline/transforms/vector.py
@@ -1,210 +0,0 @@
- from collections import deque
- from collections.abc import Iterator
- from statistics import mean, median
- from typing import Any, Literal, Tuple
-
- from datapipeline.domain.vector import Vector
- from datapipeline.transforms.vector_utils import base_id, is_missing, clone
- from datapipeline.pipeline.context import PipelineContext, try_get_current_context
-
-
- class _ContextExpectedMixin:
-     def __init__(self) -> None:
-         self._context: PipelineContext | None = None
-
-     def bind_context(self, context: PipelineContext) -> None:
-         self._context = context
-
-     def _expected_ids(self) -> list[str]:
-         ctx = self._context or try_get_current_context()
-         if not ctx:
-             return []
-         return ctx.load_expected_ids()
-
-
- class VectorDropMissingTransform(_ContextExpectedMixin):
-     """Drop vectors that do not satisfy coverage requirements."""
-
-     def __init__(
-         self,
-         *,
-         required: list[str] | None = None,
-         min_coverage: float = 1.0,
-     ) -> None:
-         super().__init__()
-         if not 0.0 <= min_coverage <= 1.0:
-             raise ValueError("min_coverage must be between 0 and 1")
-         self.required = {str(item) for item in (required or [])}
-         self.min_coverage = min_coverage
-         # Always operate on full (partition) ids
-
-     def __call__(self, stream: Iterator[Tuple[Any, Vector]]) -> Iterator[Tuple[Any, Vector]]:
-         return self.apply(stream)
-
-     def apply(self, stream: Iterator[Tuple[Any, Vector]]) -> Iterator[Tuple[Any, Vector]]:
-         for group_key, vector in stream:
-             present = {fid for fid, value in vector.values.items()
-                        if not is_missing(value)}
-             # Enforce hard requirements first (normalize required keys for fair comparison)
-             if self.required:
-                 if not set(self.required).issubset(present):
-                     continue
-
-             # Coverage baseline uses explicit expected if provided; otherwise dynamic set
-             baseline = set(self._expected_ids())
-             if baseline:
-                 coverage = len(present & baseline) / len(baseline)
-                 if coverage < self.min_coverage:
-                     continue
-             yield group_key, vector
-
-
- class VectorFillConstantTransform(_ContextExpectedMixin):
-     """Fill missing entries with a constant value."""
-
-     def __init__(
-         self,
-         *,
-         value: Any,
-     ) -> None:
-         super().__init__()
-         self.value = value
-
-     def __call__(self, stream: Iterator[Tuple[Any, Vector]]) -> Iterator[Tuple[Any, Vector]]:
-         return self.apply(stream)
-
-     def apply(self, stream: Iterator[Tuple[Any, Vector]]) -> Iterator[Tuple[Any, Vector]]:
-         for group_key, vector in stream:
-             targets = self._expected_ids()
-             if not targets:
-                 yield group_key, vector
-                 continue
-             data = clone(vector.values)
-             updated = False
-             for feature in targets:
-                 if feature not in data or is_missing(data[feature]):
-                     data[feature] = self.value
-                     updated = True
-             if updated:
-                 yield group_key, Vector(values=data)
-             else:
-                 yield group_key, vector
-
-
- class VectorFillHistoryTransform(_ContextExpectedMixin):
-     """Fill missing entries using running statistics from prior buckets."""
-
-     def __init__(
-         self,
-         *,
-         statistic: Literal["mean", "median"] = "median",
-         window: int | None = None,
-         min_samples: int = 1,
-     ) -> None:
-         super().__init__()
-         if window is not None and window <= 0:
-             raise ValueError("window must be positive when provided")
-         if min_samples <= 0:
-             raise ValueError("min_samples must be positive")
-         self.statistic = statistic
-         self.window = window
-         self.min_samples = min_samples
-         self.history: dict[str, deque[float]] = {}
-
-     def _compute(self, feature_id: str) -> float | None:
-         values = self.history.get(feature_id)
-         if not values or len(values) < self.min_samples:
-             return None
-         if self.statistic == "mean":
-             return float(mean(values))
-         return float(median(values))
-
-     def _push(self, feature_id: str, value: Any) -> None:
-         if is_missing(value):
-             return
-         try:
-             num = float(value)
-         except (TypeError, ValueError):
-             # Ignore non-scalar/non-numeric entries
-             return
-         bucket = self.history.setdefault(
-             str(feature_id), deque(maxlen=self.window))
-         bucket.append(num)
-
-     def __call__(self, stream: Iterator[Tuple[Any, Vector]]) -> Iterator[Tuple[Any, Vector]]:
-         return self.apply(stream)
-
-     def apply(self, stream: Iterator[Tuple[Any, Vector]]) -> Iterator[Tuple[Any, Vector]]:
-         for group_key, vector in stream:
-             targets = self._expected_ids()
-             data = clone(vector.values)
-             updated = False
-             for feature in targets:
-                 if feature in data and not is_missing(data[feature]):
-                     continue
-                 fill = self._compute(feature)
-                 if fill is not None:
-                     data[feature] = fill
-                     updated = True
-             # Push history after possibly filling
-             for fid, value in data.items():
-                 self._push(fid, value)
-             if updated:
-                 yield group_key, Vector(values=data)
-             else:
-                 yield group_key, vector
-
-
- class VectorFillAcrossPartitionsTransform(_ContextExpectedMixin):
-     """Fill missing entries by aggregating sibling partitions at the same timestamp."""
-
-     def __init__(
-         self,
-         *,
-         statistic: Literal["mean", "median"] = "median",
-         min_samples: int = 1,
-     ) -> None:
-         super().__init__()
-         if min_samples <= 0:
-             raise ValueError("min_samples must be positive")
-         self.statistic = statistic
-         self.min_samples = min_samples
-         # Always operate on full (partition) ids
-
-     def __call__(self, stream: Iterator[Tuple[Any, Vector]]) -> Iterator[Tuple[Any, Vector]]:
-         return self.apply(stream)
-
-     def apply(self, stream: Iterator[Tuple[Any, Vector]]) -> Iterator[Tuple[Any, Vector]]:
-         for group_key, vector in stream:
-             targets = self._expected_ids()
-             if not targets:
-                 yield group_key, vector
-                 continue
-
-             data = clone(vector.values)
-             base_groups: dict[str, list[float]] = {}
-             for fid, value in data.items():
-                 if is_missing(value):
-                     continue
-                 try:
-                     num = float(value)
-                 except (TypeError, ValueError):
-                     continue
-                 base_groups.setdefault(base_id(fid), []).append(num)
-
-             updated = False
-             for feature in targets:
-                 if feature in data and not is_missing(data[feature]):
-                     continue
-                 base = base_id(feature)
-                 candidates = base_groups.get(base, [])
-                 if len(candidates) < self.min_samples:
-                     continue
-                 fill = mean(candidates) if self.statistic == "mean" else median(
-                     candidates)
-                 data[feature] = float(fill)
-                 updated = True
-             if updated:
-                 yield group_key, Vector(values=data)
-             else:
-                 yield group_key, vector
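
These removed transforms correspond to the drop_missing / fill_* entries in the postprocess.yaml deleted above, and 1.0.0 appears to split them into the new transforms/vector package listed earlier. The sketch below exercises only the `required` check of the 0.3.0 VectorDropMissingTransform (the coverage check needs expected ids from a pipeline context); it assumes a None value counts as missing, as the fill transforms suggest, and the sample keys and feature ids are hypothetical.

# Illustrative only: 0.3.0 API from the hunk above, run outside a pipeline
# context, so _expected_ids() is empty and only `required` is enforced.
from datapipeline.domain.vector import Vector
from datapipeline.transforms.vector import VectorDropMissingTransform

stream = iter([
    ("2024-01-01T00:00", Vector(values={"time": 0.5, "temp": None})),  # temp treated as missing (assumed)
    ("2024-01-01T01:00", Vector(values={"time": 0.6, "temp": 1.2})),
])

drop = VectorDropMissingTransform(required=["temp"])
for key, vec in drop(stream):
    print(key, vec.values)   # only the 01:00 vector should survive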