jerry-thomas 0.3.0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datapipeline/analysis/vector/collector.py +120 -17
- datapipeline/analysis/vector/matrix.py +33 -8
- datapipeline/analysis/vector/report.py +162 -32
- datapipeline/build/tasks/__init__.py +11 -0
- datapipeline/build/tasks/config.py +74 -0
- datapipeline/build/tasks/metadata.py +170 -0
- datapipeline/build/tasks/scaler.py +73 -0
- datapipeline/build/tasks/schema.py +60 -0
- datapipeline/build/tasks/utils.py +169 -0
- datapipeline/cli/app.py +304 -127
- datapipeline/cli/commands/build.py +240 -16
- datapipeline/cli/commands/contract.py +367 -0
- datapipeline/cli/commands/domain.py +8 -3
- datapipeline/cli/commands/inspect.py +401 -149
- datapipeline/cli/commands/list_.py +30 -7
- datapipeline/cli/commands/plugin.py +1 -1
- datapipeline/cli/commands/run.py +227 -241
- datapipeline/cli/commands/run_config.py +101 -0
- datapipeline/cli/commands/serve_pipeline.py +156 -0
- datapipeline/cli/commands/source.py +44 -8
- datapipeline/cli/visuals/__init__.py +4 -2
- datapipeline/cli/visuals/common.py +239 -0
- datapipeline/cli/visuals/labels.py +15 -15
- datapipeline/cli/visuals/runner.py +66 -0
- datapipeline/cli/visuals/sections.py +20 -0
- datapipeline/cli/visuals/sources.py +132 -119
- datapipeline/cli/visuals/sources_basic.py +260 -0
- datapipeline/cli/visuals/sources_off.py +76 -0
- datapipeline/cli/visuals/sources_rich.py +414 -0
- datapipeline/config/catalog.py +37 -3
- datapipeline/config/context.py +214 -0
- datapipeline/config/dataset/loader.py +21 -4
- datapipeline/config/dataset/normalize.py +4 -4
- datapipeline/config/metadata.py +43 -0
- datapipeline/config/postprocess.py +2 -2
- datapipeline/config/project.py +3 -2
- datapipeline/config/resolution.py +129 -0
- datapipeline/config/tasks.py +309 -0
- datapipeline/config/workspace.py +155 -0
- datapipeline/domain/__init__.py +12 -0
- datapipeline/domain/record.py +11 -0
- datapipeline/domain/sample.py +54 -0
- datapipeline/integrations/ml/adapter.py +34 -20
- datapipeline/integrations/ml/pandas_support.py +0 -2
- datapipeline/integrations/ml/rows.py +1 -6
- datapipeline/integrations/ml/torch_support.py +1 -3
- datapipeline/io/factory.py +112 -0
- datapipeline/io/output.py +132 -0
- datapipeline/io/protocols.py +21 -0
- datapipeline/io/serializers.py +219 -0
- datapipeline/io/sinks/__init__.py +23 -0
- datapipeline/io/sinks/base.py +2 -0
- datapipeline/io/sinks/files.py +79 -0
- datapipeline/io/sinks/rich.py +57 -0
- datapipeline/io/sinks/stdout.py +18 -0
- datapipeline/io/writers/__init__.py +14 -0
- datapipeline/io/writers/base.py +28 -0
- datapipeline/io/writers/csv_writer.py +25 -0
- datapipeline/io/writers/jsonl.py +52 -0
- datapipeline/io/writers/pickle_writer.py +30 -0
- datapipeline/pipeline/artifacts.py +58 -0
- datapipeline/pipeline/context.py +66 -7
- datapipeline/pipeline/observability.py +65 -0
- datapipeline/pipeline/pipelines.py +65 -13
- datapipeline/pipeline/split.py +11 -10
- datapipeline/pipeline/stages.py +127 -16
- datapipeline/pipeline/utils/keygen.py +20 -7
- datapipeline/pipeline/utils/memory_sort.py +22 -10
- datapipeline/pipeline/utils/transform_utils.py +22 -0
- datapipeline/runtime.py +5 -2
- datapipeline/services/artifacts.py +12 -6
- datapipeline/services/bootstrap/config.py +25 -0
- datapipeline/services/bootstrap/core.py +52 -37
- datapipeline/services/constants.py +6 -5
- datapipeline/services/factories.py +123 -1
- datapipeline/services/project_paths.py +43 -16
- datapipeline/services/runs.py +208 -0
- datapipeline/services/scaffold/domain.py +3 -2
- datapipeline/services/scaffold/filter.py +3 -2
- datapipeline/services/scaffold/mappers.py +9 -6
- datapipeline/services/scaffold/plugin.py +3 -3
- datapipeline/services/scaffold/source.py +93 -56
- datapipeline/sources/{composed_loader.py → data_loader.py} +9 -9
- datapipeline/sources/decoders.py +83 -18
- datapipeline/sources/factory.py +26 -16
- datapipeline/sources/models/__init__.py +2 -2
- datapipeline/sources/models/generator.py +0 -7
- datapipeline/sources/models/loader.py +3 -3
- datapipeline/sources/models/parsing_error.py +24 -0
- datapipeline/sources/models/source.py +6 -6
- datapipeline/sources/synthetic/time/loader.py +14 -2
- datapipeline/sources/transports.py +74 -37
- datapipeline/templates/plugin_skeleton/README.md +74 -30
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +31 -0
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +30 -0
- datapipeline/templates/plugin_skeleton/example/dataset.yaml +18 -0
- datapipeline/templates/plugin_skeleton/example/postprocess.yaml +29 -0
- datapipeline/templates/plugin_skeleton/{config/datasets/default → example}/project.yaml +11 -8
- datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +12 -0
- datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +3 -0
- datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +9 -0
- datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +2 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +4 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +28 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +4 -0
- datapipeline/templates/plugin_skeleton/jerry.yaml +28 -0
- datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +31 -0
- datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +30 -0
- datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +18 -0
- datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +29 -0
- datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +22 -0
- datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +12 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +3 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +9 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +2 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +4 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +28 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +4 -0
- datapipeline/templates/stubs/dto.py.j2 +2 -0
- datapipeline/templates/stubs/mapper.py.j2 +5 -4
- datapipeline/templates/stubs/parser.py.j2 +2 -0
- datapipeline/templates/stubs/record.py.j2 +2 -0
- datapipeline/templates/stubs/source.yaml.j2 +2 -3
- datapipeline/transforms/debug/lint.py +26 -41
- datapipeline/transforms/feature/scaler.py +89 -13
- datapipeline/transforms/record/floor_time.py +4 -4
- datapipeline/transforms/sequence.py +2 -35
- datapipeline/transforms/stream/dedupe.py +24 -0
- datapipeline/transforms/stream/ensure_ticks.py +7 -6
- datapipeline/transforms/vector/__init__.py +5 -0
- datapipeline/transforms/vector/common.py +98 -0
- datapipeline/transforms/vector/drop/__init__.py +4 -0
- datapipeline/transforms/vector/drop/horizontal.py +79 -0
- datapipeline/transforms/vector/drop/orchestrator.py +59 -0
- datapipeline/transforms/vector/drop/vertical.py +182 -0
- datapipeline/transforms/vector/ensure_schema.py +184 -0
- datapipeline/transforms/vector/fill.py +87 -0
- datapipeline/transforms/vector/replace.py +62 -0
- datapipeline/utils/load.py +24 -3
- datapipeline/utils/rich_compat.py +38 -0
- datapipeline/utils/window.py +76 -0
- jerry_thomas-1.0.0.dist-info/METADATA +825 -0
- jerry_thomas-1.0.0.dist-info/RECORD +199 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/entry_points.txt +9 -8
- datapipeline/build/tasks.py +0 -186
- datapipeline/cli/commands/link.py +0 -128
- datapipeline/cli/commands/writers.py +0 -138
- datapipeline/config/build.py +0 -64
- datapipeline/config/run.py +0 -116
- datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.synthetic.yaml +0 -24
- datapipeline/templates/plugin_skeleton/config/contracts/time_linear.synthetic.yaml +0 -23
- datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +0 -9
- datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +0 -14
- datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +0 -13
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml +0 -11
- datapipeline/transforms/vector.py +0 -210
- jerry_thomas-0.3.0.dist-info/METADATA +0 -502
- jerry_thomas-0.3.0.dist-info/RECORD +0 -139
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/WHEEL +0 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/top_level.txt +0 -0
datapipeline/cli/commands/writers.py
DELETED
@@ -1,138 +0,0 @@
from typing import Protocol, Callable, Optional
from pathlib import Path
import sys
import json
import pickle
import tempfile
import os
import csv
import gzip


class Writer(Protocol):
    def write(self, rec: dict) -> None: ...
    def close(self) -> None: ...


class TextLineWriter:
    def __init__(self, formatter: Callable[[dict], str], stream=None):
        self.formatter = formatter
        self.stream = stream or sys.stdout

    def write(self, rec: dict) -> None:
        print(self.formatter(rec), file=self.stream)

    def close(self) -> None:
        self.stream.flush()


def JsonLinesWriter():
    return TextLineWriter(lambda rec: json.dumps(rec, default=str))


def PrintWriter():
    return TextLineWriter(lambda rec: f"group={rec['key']}: {rec['values']}")


class PickleWriter:
    def __init__(self, destination: Path, protocol: int = pickle.HIGHEST_PROTOCOL):
        self.dest = destination
        self.protocol = protocol
        self.tmp_path: Optional[Path] = None
        self._fh = None
        self._pickler = None
        self._open_tmp()

    def _open_tmp(self):
        self.dest.parent.mkdir(parents=True, exist_ok=True)
        tmp = tempfile.NamedTemporaryFile(
            dir=str(self.dest.parent), delete=False)
        self.tmp_path = Path(tmp.name)
        self._fh = tmp
        self._pickler = pickle.Pickler(self._fh, protocol=self.protocol)

    def write(self, rec: dict) -> None:
        self._pickler.dump((rec["key"], rec["values"]))

    def close(self) -> None:
        self._fh.close()
        os.replace(self.tmp_path, self.dest)


class CSVWriter:
    def __init__(self, destination: Path):
        self.dest = destination
        self.tmp_path: Optional[Path] = None
        self._fh = None
        self._writer = None
        self._open_tmp()

    def _open_tmp(self):
        self.dest.parent.mkdir(parents=True, exist_ok=True)
        tmp = tempfile.NamedTemporaryFile(
            dir=str(self.dest.parent), delete=False, mode="w", newline="")
        self.tmp_path = Path(tmp.name)
        self._fh = tmp
        self._writer = csv.writer(self._fh)
        self._writer.writerow(["key", "values"])  # header

    def _format_field(self, value):
        if value is None:
            return ""
        if isinstance(value, (int, float, bool)):
            return value
        if isinstance(value, (bytes, bytearray)):
            return value.decode("utf-8", errors="replace")
        if isinstance(value, str):
            return value
        return str(value)

    def write(self, rec: dict) -> None:
        key = rec["key"]
        values = rec["values"]
        self._writer.writerow(
            [self._format_field(key), self._format_field(values)])

    def close(self) -> None:
        self._fh.close()
        os.replace(self.tmp_path, self.dest)


class GzipJsonLinesWriter:
    def __init__(self, destination: Path):
        self.dest = destination
        self.tmp_path: Optional[Path] = None
        self._fh = None
        self._open_tmp()

    def _open_tmp(self):
        self.dest.parent.mkdir(parents=True, exist_ok=True)
        # binary write, text wrapper for newline handling
        tmp = tempfile.NamedTemporaryFile(
            dir=str(self.dest.parent), delete=False)
        self.tmp_path = Path(tmp.name)
        self._fh = gzip.GzipFile(filename="", mode="wb", fileobj=tmp)

    def write(self, rec: dict) -> None:
        line = json.dumps(rec, default=str).encode("utf-8") + b"\n"
        self._fh.write(line)

    def close(self) -> None:
        self._fh.close()
        os.replace(self.tmp_path, self.dest)


def writer_factory(output: Optional[str]) -> Writer:
    if output and output.lower().endswith(".pt"):
        return PickleWriter(Path(output))
    if output and output.lower().endswith(".csv"):
        return CSVWriter(Path(output))
    if output and (output.lower().endswith(".jsonl.gz") or output.lower().endswith(".gz")):
        return GzipJsonLinesWriter(Path(output))
    mode = (output or "print").lower()
    if mode == "print":
        return PrintWriter()
    if mode == "stream":
        return JsonLinesWriter()
    print("Error: unsupported output format. Use 'print', 'stream', '.csv', '.jsonl.gz', or a .pt file path.", file=sys.stderr)
    raise SystemExit(2)
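
For orientation, a minimal sketch of how the removed writer_factory was typically driven. The record shape ({"key": ..., "values": ...}) comes from the write() implementations above; the sample records and the surrounding loop are illustrative, not part of the package.

# Illustrative sketch against the 0.3.0 writer API deleted in this diff.
from datapipeline.cli.commands.writers import writer_factory  # 0.3.0 module

records = [
    {"key": "2024-01-01T00:00", "values": {"time": 0.0}},
    {"key": "2024-01-01T01:00", "values": {"time": 0.26}},
]

writer = writer_factory("out.csv")  # also accepts "print", "stream", *.pt, *.jsonl.gz
try:
    for rec in records:
        writer.write(rec)
finally:
    writer.close()  # file-backed writers rename their temp file into place here

In 1.0.0 this module is replaced by the datapipeline/io package (factory, sinks, writers) listed above.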
datapipeline/config/build.py
DELETED
@@ -1,64 +0,0 @@
from __future__ import annotations

from pathlib import Path

from pydantic import BaseModel, Field

from datapipeline.services.bootstrap import _load_by_key


class PartitionedIdsConfig(BaseModel):
    """Configuration for writing the expected partitioned-id list."""

    output: str = Field(
        default="expected.txt",
        description="Artifact path relative to project.paths.artifacts.",
    )
    include_targets: bool = Field(
        default=False,
        description="When true, include dataset.targets in the discovery stream.",
    )


class ScalerArtifactConfig(BaseModel):
    """Configuration for computing standard-scaler statistics."""

    enabled: bool = Field(
        default=True,
        description="Disable to skip generating the scaler statistics artifact.",
    )
    output: str = Field(
        default="scaler.pkl",
        description="Artifact path relative to project.paths.artifacts.",
    )
    include_targets: bool = Field(
        default=False,
        description="Include dataset.targets when fitting scaler statistics.",
    )
    split_label: str = Field(
        default="train",
        description="Split label to use when fitting scaler statistics.",
    )


class BuildConfig(BaseModel):
    """Top-level build configuration describing materialized artifacts."""

    version: int = 1
    partitioned_ids: PartitionedIdsConfig = Field(
        default_factory=PartitionedIdsConfig,
        description="Partitioned-id task settings.",
    )
    scaler: ScalerArtifactConfig = Field(
        default_factory=ScalerArtifactConfig,
        description="Standard-scaler statistics artifact settings.",
    )


def load_build_config(project_yaml: Path) -> BuildConfig:
    """Load build.yaml referenced by project.paths.build and validate it."""

    doc = _load_by_key(project_yaml, "build")
    if not isinstance(doc, dict):
        raise TypeError("build.yaml must define a mapping at the top level.")
    return BuildConfig.model_validate(doc)
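
As a sketch of how this removed loader was exercised: load_build_config() pulls the mapping referenced by project.paths.build, but the same BuildConfig model also validates a build.yaml document directly. The YAML literal below mirrors the defaults above; per the file list, this configuration surface appears to have been superseded by datapipeline/config/tasks.py in 1.0.0.

# Illustrative sketch: validate a 0.3.0-style build.yaml mapping directly.
import yaml
from datapipeline.config.build import BuildConfig  # module removed in 1.0.0

doc = yaml.safe_load("""
version: 1
partitioned_ids:
  output: expected.txt
  include_targets: false
scaler:
  enabled: true
  output: scaler.pkl
  split_label: train
""")

cfg = BuildConfig.model_validate(doc)
print(cfg.scaler.output)           # -> scaler.pkl
print(cfg.partitioned_ids.output)  # -> expected.txt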
datapipeline/config/run.py
DELETED
@@ -1,116 +0,0 @@
from __future__ import annotations

import logging
from pathlib import Path
from typing import List, Sequence, Tuple

from pydantic import BaseModel, Field, field_validator

from datapipeline.config.project import ProjectConfig
from datapipeline.utils.load import load_yaml

VALID_LOG_LEVELS = ("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG")


class RunConfig(BaseModel):
    """Runtime overrides applied when serving vectors."""

    version: int = Field(default=1)
    keep: str | None = Field(
        default=None,
        description="Active split label to serve. Null disables filtering.",
        min_length=1,
    )
    output: str | None = Field(
        default=None,
        description="Default output destination for jerry serve (print|stream|<path>).",
        min_length=1,
    )
    limit: int | None = Field(
        default=None,
        description="Default max number of vectors to emit during serve runs.",
        ge=1,
    )
    include_targets: bool = Field(
        default=False,
        description="Serve dataset.targets alongside features by default.",
    )
    throttle_ms: float | None = Field(
        default=None,
        description="Milliseconds to sleep between emitted vectors (throttle).",
        ge=0.0,
    )
    log_level: str | None = Field(
        default="INFO",
        description="Default logging level for serve runs (DEBUG, INFO, WARNING, ERROR, CRITICAL). Use null to inherit CLI.",
    )

    @field_validator("log_level")
    @classmethod
    def _validate_log_level(cls, value: str | None) -> str | None:
        if value is None:
            return None
        name = str(value).upper()
        if name not in VALID_LOG_LEVELS:
            raise ValueError(
                f"log_level must be one of {', '.join(VALID_LOG_LEVELS)}, got {value!r}"
            )
        return name


def _resolve_run_path(project_yaml: Path, run_path: str | Path) -> Path:
    path = Path(run_path)
    if not path.is_absolute():
        path = project_yaml.parent / path
    return path.resolve()


def _list_run_paths(project_yaml: Path) -> Sequence[Path]:
    project_data = load_yaml(project_yaml)
    project = ProjectConfig.model_validate(project_data)
    run_path_ref = getattr(project.paths, "run", None)
    if not run_path_ref:
        return []
    run_path = _resolve_run_path(project_yaml, run_path_ref)
    if not run_path.exists():
        raise FileNotFoundError(f"run config not found: {run_path}")
    if run_path.is_dir():
        entries = sorted(
            [
                p
                for p in run_path.iterdir()
                if p.is_file() and p.suffix in {".yaml", ".yml"}
            ],
            key=lambda p: p.name,
        )
        if not entries:
            raise FileNotFoundError(f"no run configs found under {run_path}")
        return entries
    return [run_path]


def _load_run_from_path(path: Path) -> RunConfig:
    doc = load_yaml(path)
    if not isinstance(doc, dict):
        raise TypeError(f"{path} must define a mapping at the top level.")
    return RunConfig.model_validate(doc)


def load_named_run_configs(project_yaml: Path) -> List[Tuple[str, RunConfig]]:
    """Return (name, config) pairs for every run file (directory-aware)."""

    paths = _list_run_paths(project_yaml)
    entries: List[Tuple[str, RunConfig]] = []
    for path in paths:
        cfg = _load_run_from_path(path)
        entries.append((path.stem, cfg))
    return entries


def load_run_config(project_yaml: Path) -> RunConfig | None:
    """Load the first run config referenced by project.paths.run, if configured."""

    paths = _list_run_paths(project_yaml)
    if not paths:
        return None
    return _load_run_from_path(paths[0])
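
A short sketch of the removed run-config API: RunConfig validates a single run document, while load_named_run_configs() resolves project.paths.run (a file or a directory of YAML files) into (name, RunConfig) pairs. The project.yaml path below is the 0.3.0 plugin-skeleton location and is only illustrative.

# Illustrative sketch against the 0.3.0 run-config loader deleted in this diff.
from pathlib import Path
from datapipeline.config.run import RunConfig, load_named_run_configs

# Validate a single run document in memory.
cfg = RunConfig.model_validate({"keep": "train", "output": "print", "limit": 100})
assert cfg.log_level == "INFO"  # default; the validator upper-cases the value

# Directory-aware loading driven by project.paths.run.
project_yaml = Path("config/datasets/default/project.yaml")  # 0.3.0 skeleton layout
for name, run in load_named_run_configs(project_yaml):
    print(name, run.keep, run.output, run.limit)

In 1.0.0 the equivalent settings live in the new datapipeline/cli/commands/run_config.py and the per-dataset tasks/serve.*.yaml files listed above.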
datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.synthetic.yaml
DELETED
@@ -1,24 +0,0 @@
source_id: time_ticks
stream_id: time_hour_sin

mapper:
  entrypoint: encode_time
  args: { mode: hour_sin }

# partition_by: field you want to partition

record:
  - filter: { operator: ge, field: time, comparand: "${start_time}" }
  - filter: { operator: le, field: time, comparand: "${end_time}" }
  # - floor_time: { resolution: 1h }
  # - lag: { lag: 1h }

# stream:
#   - ensure_ticks: { tick: 1h }
#   - granularity: { mode: last }
#   - fill: { statistic: median, window: 24, min_samples: 4 }

# debug:
#   - lint: { mode: warn, tick: 1h }

# sort_batch_size: 100000
datapipeline/templates/plugin_skeleton/config/contracts/time_linear.synthetic.yaml
DELETED
@@ -1,23 +0,0 @@
source_id: time_ticks   # raw source alias (see config/sources)
stream_id: time_linear  # this stream id is used by recipes

mapper:                 # normalize/reshape DTO -> TemporalRecord; if not implemented, you get the identity mapper
  entrypoint: encode_time
  args: { mode: linear }
# partition_by: station_id   # optional: add partition suffixes to feature ids

record:                 # record-level transforms
  - filter: { operator: ge, field: time, comparand: "${start_time}" }
  - filter: { operator: le, field: time, comparand: "${end_time}" }
  # - floor_time: { resolution: 10m }  # snap timestamps to resolution
  # - lag: { lag: 10m }                # shift timestamps backwards

# stream:               # per-feature stream transforms (input sorted by id,time)
#   - ensure_ticks: { tick: 10m }      # insert missing ticks (value=None)
#   - granularity: { mode: first }     # aggregate duplicates within a tick
#   - fill: { statistic: median, window: 6, min_samples: 1 }  # impute gaps

# debug:                # optional validation-only transforms
#   - lint: { mode: warn, tick: 10m }  # flag gaps/duplicates/order issues

# sort_batch_size: 100000  # in-memory chunk size used by internal sorting
datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml
DELETED
@@ -1,9 +0,0 @@
version: 1
partitioned_ids:
  output: expected.txt     # relative to project.paths.artifacts
  include_targets: false   # set true to include dataset.targets
scaler:
  enabled: true            # disable to skip scaler statistics
  output: scaler.pkl       # relative to project.paths.artifacts
  include_targets: false   # include targets when fitting scaler
  split_label: train       # label from project.globals.split to fit on
datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml
DELETED
@@ -1,14 +0,0 @@
group_by: 1h

features:
  - id: time
    record_stream: time_hour_sin
    # scale: { with_mean: true, with_std: true }
    # sequence: { size: 6, stride: 1, tick: 10m }

  # - id: second_feature
  #   record_stream: anotherstream

# targets:
#   - id: some_target
#     record_stream: time_linear
datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml
DELETED
@@ -1,13 +0,0 @@
# - drop_missing:
#     # require these features present OR set min_coverage below
#     required: [time]
#     min_coverage: 1.0
# - fill_constant:
#     value: 0.0
# - fill_history:
#     statistic: median
#     window: 48
#     min_samples: 6
# - fill_horizontal:
#     statistic: mean
#     min_samples: 2
datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml
DELETED
@@ -1,10 +0,0 @@
version: 1
# Active split label to serve; must match a label defined in globals.split.
# Set to null to disable filtering or override per run via CLI.
keep: test
# Optional defaults for jerry serve (override via CLI when needed).
output: print           # print | stream | /path/to/test_file.pt
limit: 100              # max vectors per serve run (null = unlimited)
include_targets: false  # serve dataset.targets alongside features
throttle_ms: null       # milliseconds to sleep between emitted vectors
log_level: INFO         # DEBUG for progress bars, INFO for spinner, WARNING to keep quiet (null to inherit CLI)
datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml
DELETED
@@ -1,10 +0,0 @@
version: 1
# Active split label to serve; must match a label defined in globals.split.
# Set to null to disable filtering or override per run via CLI.
keep: train
# Optional defaults for jerry serve (override via CLI when needed).
output: print           # print | stream | /path/to/train_file.pt
limit: 100              # max vectors per serve run (null = unlimited)
include_targets: false  # serve dataset.targets alongside features
throttle_ms: null       # milliseconds to sleep between emitted vectors
log_level: INFO         # DEBUG for progress bars, INFO for spinner, WARNING to keep quiet (null to inherit CLI)
datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml
DELETED
@@ -1,10 +0,0 @@
version: 1
# Active split label to serve; must match a label defined in globals.split.
# Set to null to disable filtering or override per run via CLI.
keep: val
# Optional defaults for jerry serve (override via CLI when needed).
output: print           # print | stream | /path/to/val_file.pt
limit: 100              # max vectors per serve run (null = unlimited)
include_targets: false  # serve dataset.targets alongside features
throttle_ms: null       # milliseconds to sleep between emitted vectors
log_level: INFO         # DEBUG for progress bars, INFO for spinner, WARNING to keep quiet (null to inherit CLI)
datapipeline/transforms/vector.py
DELETED
@@ -1,210 +0,0 @@
from collections import deque
from collections.abc import Iterator
from statistics import mean, median
from typing import Any, Literal, Tuple

from datapipeline.domain.vector import Vector
from datapipeline.transforms.vector_utils import base_id, is_missing, clone
from datapipeline.pipeline.context import PipelineContext, try_get_current_context


class _ContextExpectedMixin:
    def __init__(self) -> None:
        self._context: PipelineContext | None = None

    def bind_context(self, context: PipelineContext) -> None:
        self._context = context

    def _expected_ids(self) -> list[str]:
        ctx = self._context or try_get_current_context()
        if not ctx:
            return []
        return ctx.load_expected_ids()


class VectorDropMissingTransform(_ContextExpectedMixin):
    """Drop vectors that do not satisfy coverage requirements."""

    def __init__(
        self,
        *,
        required: list[str] | None = None,
        min_coverage: float = 1.0,
    ) -> None:
        super().__init__()
        if not 0.0 <= min_coverage <= 1.0:
            raise ValueError("min_coverage must be between 0 and 1")
        self.required = {str(item) for item in (required or [])}
        self.min_coverage = min_coverage
        # Always operate on full (partition) ids

    def __call__(self, stream: Iterator[Tuple[Any, Vector]]) -> Iterator[Tuple[Any, Vector]]:
        return self.apply(stream)

    def apply(self, stream: Iterator[Tuple[Any, Vector]]) -> Iterator[Tuple[Any, Vector]]:
        for group_key, vector in stream:
            present = {fid for fid, value in vector.values.items()
                       if not is_missing(value)}
            # Enforce hard requirements first (normalize required keys for fair comparison)
            if self.required:
                if not set(self.required).issubset(present):
                    continue

            # Coverage baseline uses explicit expected if provided; otherwise dynamic set
            baseline = set(self._expected_ids())
            if baseline:
                coverage = len(present & baseline) / len(baseline)
                if coverage < self.min_coverage:
                    continue
            yield group_key, vector


class VectorFillConstantTransform(_ContextExpectedMixin):
    """Fill missing entries with a constant value."""

    def __init__(
        self,
        *,
        value: Any,
    ) -> None:
        super().__init__()
        self.value = value

    def __call__(self, stream: Iterator[Tuple[Any, Vector]]) -> Iterator[Tuple[Any, Vector]]:
        return self.apply(stream)

    def apply(self, stream: Iterator[Tuple[Any, Vector]]) -> Iterator[Tuple[Any, Vector]]:
        for group_key, vector in stream:
            targets = self._expected_ids()
            if not targets:
                yield group_key, vector
                continue
            data = clone(vector.values)
            updated = False
            for feature in targets:
                if feature not in data or is_missing(data[feature]):
                    data[feature] = self.value
                    updated = True
            if updated:
                yield group_key, Vector(values=data)
            else:
                yield group_key, vector


class VectorFillHistoryTransform(_ContextExpectedMixin):
    """Fill missing entries using running statistics from prior buckets."""

    def __init__(
        self,
        *,
        statistic: Literal["mean", "median"] = "median",
        window: int | None = None,
        min_samples: int = 1,
    ) -> None:
        super().__init__()
        if window is not None and window <= 0:
            raise ValueError("window must be positive when provided")
        if min_samples <= 0:
            raise ValueError("min_samples must be positive")
        self.statistic = statistic
        self.window = window
        self.min_samples = min_samples
        self.history: dict[str, deque[float]] = {}

    def _compute(self, feature_id: str) -> float | None:
        values = self.history.get(feature_id)
        if not values or len(values) < self.min_samples:
            return None
        if self.statistic == "mean":
            return float(mean(values))
        return float(median(values))

    def _push(self, feature_id: str, value: Any) -> None:
        if is_missing(value):
            return
        try:
            num = float(value)
        except (TypeError, ValueError):
            # Ignore non-scalar/non-numeric entries
            return
        bucket = self.history.setdefault(
            str(feature_id), deque(maxlen=self.window))
        bucket.append(num)

    def __call__(self, stream: Iterator[Tuple[Any, Vector]]) -> Iterator[Tuple[Any, Vector]]:
        return self.apply(stream)

    def apply(self, stream: Iterator[Tuple[Any, Vector]]) -> Iterator[Tuple[Any, Vector]]:
        for group_key, vector in stream:
            targets = self._expected_ids()
            data = clone(vector.values)
            updated = False
            for feature in targets:
                if feature in data and not is_missing(data[feature]):
                    continue
                fill = self._compute(feature)
                if fill is not None:
                    data[feature] = fill
                    updated = True
            # Push history after possibly filling
            for fid, value in data.items():
                self._push(fid, value)
            if updated:
                yield group_key, Vector(values=data)
            else:
                yield group_key, vector


class VectorFillAcrossPartitionsTransform(_ContextExpectedMixin):
    """Fill missing entries by aggregating sibling partitions at the same timestamp."""

    def __init__(
        self,
        *,
        statistic: Literal["mean", "median"] = "median",
        min_samples: int = 1,
    ) -> None:
        super().__init__()
        if min_samples <= 0:
            raise ValueError("min_samples must be positive")
        self.statistic = statistic
        self.min_samples = min_samples
        # Always operate on full (partition) ids

    def __call__(self, stream: Iterator[Tuple[Any, Vector]]) -> Iterator[Tuple[Any, Vector]]:
        return self.apply(stream)

    def apply(self, stream: Iterator[Tuple[Any, Vector]]) -> Iterator[Tuple[Any, Vector]]:
        for group_key, vector in stream:
            targets = self._expected_ids()
            if not targets:
                yield group_key, vector
                continue

            data = clone(vector.values)
            base_groups: dict[str, list[float]] = {}
            for fid, value in data.items():
                if is_missing(value):
                    continue
                try:
                    num = float(value)
                except (TypeError, ValueError):
                    continue
                base_groups.setdefault(base_id(fid), []).append(num)

            updated = False
            for feature in targets:
                if feature in data and not is_missing(data[feature]):
                    continue
                base = base_id(feature)
                candidates = base_groups.get(base, [])
                if len(candidates) < self.min_samples:
                    continue
                fill = mean(candidates) if self.statistic == "mean" else median(
                    candidates)
                data[feature] = float(fill)
                updated = True
            if updated:
                yield group_key, Vector(values=data)
            else:
                yield group_key, vector
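
To make the removed API concrete, the sketch below drives VectorDropMissingTransform by hand over a stream of (group_key, Vector) pairs. It assumes is_missing() treats None as missing and that no pipeline context is active, so coverage checks fall back to the hard required set; the fill transforms stay no-ops until bind_context() supplies expected ids, which is what the 0.3.0 pipeline did. In 1.0.0 this module is split into the datapipeline/transforms/vector package (drop, fill, replace, ensure_schema) listed above.

# Illustrative sketch against the 0.3.0 vector transforms deleted in this diff.
from datapipeline.domain.vector import Vector
from datapipeline.transforms.vector import (
    VectorDropMissingTransform,
    VectorFillConstantTransform,
)

stream = iter([
    ("2024-01-01T00:00", Vector(values={"time": 0.0, "temp": 21.5})),
    ("2024-01-01T01:00", Vector(values={"time": 0.26, "temp": None})),  # temp missing
])

# With no bound context, only the hard `required` check applies.
drop = VectorDropMissingTransform(required=["temp"])
survivors = list(drop(stream))
assert len(survivors) == 1  # the second bucket is dropped

# Fill transforms need expected ids from a PipelineContext to know what to fill:
fill = VectorFillConstantTransform(value=0.0)
# fill.bind_context(ctx)  # ctx.load_expected_ids() would return e.g. ["time", "temp"]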