jerry-thomas 0.3.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- datapipeline/analysis/vector/collector.py +120 -17
- datapipeline/analysis/vector/matrix.py +33 -8
- datapipeline/analysis/vector/report.py +162 -32
- datapipeline/build/tasks/__init__.py +11 -0
- datapipeline/build/tasks/config.py +74 -0
- datapipeline/build/tasks/metadata.py +170 -0
- datapipeline/build/tasks/scaler.py +73 -0
- datapipeline/build/tasks/schema.py +60 -0
- datapipeline/build/tasks/utils.py +169 -0
- datapipeline/cli/app.py +304 -127
- datapipeline/cli/commands/build.py +240 -16
- datapipeline/cli/commands/contract.py +367 -0
- datapipeline/cli/commands/domain.py +8 -3
- datapipeline/cli/commands/inspect.py +401 -149
- datapipeline/cli/commands/list_.py +30 -7
- datapipeline/cli/commands/plugin.py +5 -1
- datapipeline/cli/commands/run.py +227 -241
- datapipeline/cli/commands/run_config.py +101 -0
- datapipeline/cli/commands/serve_pipeline.py +156 -0
- datapipeline/cli/commands/source.py +44 -8
- datapipeline/cli/visuals/__init__.py +4 -2
- datapipeline/cli/visuals/common.py +239 -0
- datapipeline/cli/visuals/labels.py +15 -15
- datapipeline/cli/visuals/runner.py +66 -0
- datapipeline/cli/visuals/sections.py +20 -0
- datapipeline/cli/visuals/sources.py +132 -119
- datapipeline/cli/visuals/sources_basic.py +260 -0
- datapipeline/cli/visuals/sources_off.py +76 -0
- datapipeline/cli/visuals/sources_rich.py +414 -0
- datapipeline/config/catalog.py +37 -3
- datapipeline/config/context.py +214 -0
- datapipeline/config/dataset/loader.py +21 -4
- datapipeline/config/dataset/normalize.py +4 -4
- datapipeline/config/metadata.py +43 -0
- datapipeline/config/postprocess.py +2 -2
- datapipeline/config/project.py +3 -2
- datapipeline/config/resolution.py +129 -0
- datapipeline/config/tasks.py +309 -0
- datapipeline/config/workspace.py +155 -0
- datapipeline/domain/__init__.py +12 -0
- datapipeline/domain/record.py +11 -0
- datapipeline/domain/sample.py +54 -0
- datapipeline/integrations/ml/adapter.py +34 -20
- datapipeline/integrations/ml/pandas_support.py +0 -2
- datapipeline/integrations/ml/rows.py +1 -6
- datapipeline/integrations/ml/torch_support.py +1 -3
- datapipeline/io/factory.py +112 -0
- datapipeline/io/output.py +132 -0
- datapipeline/io/protocols.py +21 -0
- datapipeline/io/serializers.py +219 -0
- datapipeline/io/sinks/__init__.py +23 -0
- datapipeline/io/sinks/base.py +2 -0
- datapipeline/io/sinks/files.py +79 -0
- datapipeline/io/sinks/rich.py +57 -0
- datapipeline/io/sinks/stdout.py +18 -0
- datapipeline/io/writers/__init__.py +14 -0
- datapipeline/io/writers/base.py +28 -0
- datapipeline/io/writers/csv_writer.py +25 -0
- datapipeline/io/writers/jsonl.py +52 -0
- datapipeline/io/writers/pickle_writer.py +30 -0
- datapipeline/pipeline/artifacts.py +58 -0
- datapipeline/pipeline/context.py +66 -7
- datapipeline/pipeline/observability.py +65 -0
- datapipeline/pipeline/pipelines.py +65 -13
- datapipeline/pipeline/split.py +11 -10
- datapipeline/pipeline/stages.py +127 -16
- datapipeline/pipeline/utils/keygen.py +20 -7
- datapipeline/pipeline/utils/memory_sort.py +22 -10
- datapipeline/pipeline/utils/transform_utils.py +22 -0
- datapipeline/runtime.py +5 -2
- datapipeline/services/artifacts.py +12 -6
- datapipeline/services/bootstrap/config.py +25 -0
- datapipeline/services/bootstrap/core.py +52 -37
- datapipeline/services/constants.py +6 -5
- datapipeline/services/factories.py +123 -1
- datapipeline/services/project_paths.py +43 -16
- datapipeline/services/runs.py +208 -0
- datapipeline/services/scaffold/domain.py +3 -2
- datapipeline/services/scaffold/filter.py +3 -2
- datapipeline/services/scaffold/mappers.py +9 -6
- datapipeline/services/scaffold/plugin.py +54 -10
- datapipeline/services/scaffold/source.py +93 -56
- datapipeline/sources/{composed_loader.py → data_loader.py} +9 -9
- datapipeline/sources/decoders.py +83 -18
- datapipeline/sources/factory.py +26 -16
- datapipeline/sources/models/__init__.py +2 -2
- datapipeline/sources/models/generator.py +0 -7
- datapipeline/sources/models/loader.py +3 -3
- datapipeline/sources/models/parsing_error.py +24 -0
- datapipeline/sources/models/source.py +6 -6
- datapipeline/sources/synthetic/time/loader.py +14 -2
- datapipeline/sources/transports.py +74 -37
- datapipeline/templates/plugin_skeleton/README.md +76 -30
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +31 -0
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +30 -0
- datapipeline/templates/plugin_skeleton/example/dataset.yaml +18 -0
- datapipeline/templates/plugin_skeleton/example/postprocess.yaml +29 -0
- datapipeline/templates/plugin_skeleton/{config/datasets/default → example}/project.yaml +11 -8
- datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +12 -0
- datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +3 -0
- datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +9 -0
- datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +2 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +4 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +28 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +4 -0
- datapipeline/templates/plugin_skeleton/jerry.yaml +34 -0
- datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +31 -0
- datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +30 -0
- datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +18 -0
- datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +29 -0
- datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +22 -0
- datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +12 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +3 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +9 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +2 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +4 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +28 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +4 -0
- datapipeline/templates/stubs/dto.py.j2 +2 -0
- datapipeline/templates/stubs/mapper.py.j2 +5 -4
- datapipeline/templates/stubs/parser.py.j2 +2 -0
- datapipeline/templates/stubs/record.py.j2 +2 -0
- datapipeline/templates/stubs/source.yaml.j2 +2 -3
- datapipeline/transforms/debug/lint.py +26 -41
- datapipeline/transforms/feature/scaler.py +89 -13
- datapipeline/transforms/record/floor_time.py +4 -4
- datapipeline/transforms/sequence.py +2 -35
- datapipeline/transforms/stream/dedupe.py +24 -0
- datapipeline/transforms/stream/ensure_ticks.py +7 -6
- datapipeline/transforms/vector/__init__.py +5 -0
- datapipeline/transforms/vector/common.py +98 -0
- datapipeline/transforms/vector/drop/__init__.py +4 -0
- datapipeline/transforms/vector/drop/horizontal.py +79 -0
- datapipeline/transforms/vector/drop/orchestrator.py +59 -0
- datapipeline/transforms/vector/drop/vertical.py +182 -0
- datapipeline/transforms/vector/ensure_schema.py +184 -0
- datapipeline/transforms/vector/fill.py +87 -0
- datapipeline/transforms/vector/replace.py +62 -0
- datapipeline/utils/load.py +24 -3
- datapipeline/utils/rich_compat.py +38 -0
- datapipeline/utils/window.py +76 -0
- jerry_thomas-1.0.1.dist-info/METADATA +825 -0
- jerry_thomas-1.0.1.dist-info/RECORD +199 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/entry_points.txt +9 -8
- datapipeline/build/tasks.py +0 -186
- datapipeline/cli/commands/link.py +0 -128
- datapipeline/cli/commands/writers.py +0 -138
- datapipeline/config/build.py +0 -64
- datapipeline/config/run.py +0 -116
- datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.synthetic.yaml +0 -24
- datapipeline/templates/plugin_skeleton/config/contracts/time_linear.synthetic.yaml +0 -23
- datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +0 -9
- datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +0 -14
- datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +0 -13
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml +0 -11
- datapipeline/transforms/vector.py +0 -210
- jerry_thomas-0.3.0.dist-info/METADATA +0 -502
- jerry_thomas-0.3.0.dist-info/RECORD +0 -139
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/WHEEL +0 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/licenses/LICENSE +0 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/top_level.txt +0 -0
datapipeline/config/workspace.py
ADDED
@@ -0,0 +1,155 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Optional
+
+from pydantic import BaseModel, Field, field_validator
+
+from datapipeline.config.tasks import (
+    VALID_PROGRESS_STYLES,
+    VALID_VISUAL_PROVIDERS,
+)
+from datapipeline.utils.load import load_yaml
+
+
+class SharedDefaults(BaseModel):
+    visuals: Optional[str] = Field(
+        default=None, description="AUTO | TQDM | RICH | OFF"
+    )
+    progress: Optional[str] = Field(
+        default=None, description="AUTO | SPINNER | BARS | OFF"
+    )
+    log_level: Optional[str] = Field(default=None, description="DEFAULT LOG LEVEL")
+
+    @field_validator("visuals", "progress", "log_level", mode="before")
+    @classmethod
+    def _normalize(cls, value: object):
+        if value is None:
+            return None
+        if isinstance(value, str):
+            text = value.strip()
+            return text if text else None
+        return value
+
+    @field_validator("visuals", mode="before")
+    @classmethod
+    def _normalize_visuals(cls, value):
+        if value is None:
+            return None
+        if isinstance(value, bool):
+            return "OFF" if value is False else "AUTO"
+        name = str(value).upper()
+        if name not in VALID_VISUAL_PROVIDERS:
+            raise ValueError(
+                f"visuals must be one of {', '.join(VALID_VISUAL_PROVIDERS)}, got {value!r}"
+            )
+        return name
+
+    @field_validator("progress", mode="before")
+    @classmethod
+    def _normalize_progress(cls, value):
+        if value is None:
+            return None
+        if isinstance(value, bool):
+            return "OFF" if value is False else "AUTO"
+        name = str(value).upper()
+        if name not in VALID_PROGRESS_STYLES:
+            raise ValueError(
+                f"progress must be one of {', '.join(VALID_PROGRESS_STYLES)}, got {value!r}"
+            )
+        return name
+
+
+class ServeDefaults(BaseModel):
+    log_level: Optional[str] = None
+    limit: Optional[int] = None
+    stage: Optional[int] = None
+    throttle_ms: Optional[float] = None
+
+    class OutputDefaults(BaseModel):
+        transport: str
+        format: str
+        payload: str = Field(default="sample")
+        directory: Optional[str] = Field(
+            default=None,
+            description="Base directory for fs outputs (relative paths are resolved from jerry.yaml).",
+        )
+
+    output: Optional[OutputDefaults] = None
+
+
+class BuildDefaults(BaseModel):
+    log_level: Optional[str] = None
+    mode: Optional[str] = None
+
+    @field_validator("mode", mode="before")
+    @classmethod
+    def _normalize_mode(cls, value: object):
+        if value is None:
+            return None
+        if isinstance(value, bool):
+            return "OFF" if value is False else "AUTO"
+        text = str(value).strip()
+        if not text:
+            return None
+        name = text.upper()
+        valid_modes = {"AUTO", "FORCE", "OFF"}
+        if name not in valid_modes:
+            options = ", ".join(sorted(valid_modes))
+            raise ValueError(f"build.mode must be one of {options}, got {value!r}")
+        return name
+
+
+class WorkspaceConfig(BaseModel):
+    plugin_root: Optional[str] = None
+    datasets: dict[str, str] = Field(
+        default_factory=dict,
+        description="Named dataset aliases mapping to project.yaml paths (relative to jerry.yaml).",
+    )
+    default_dataset: Optional[str] = Field(
+        default=None,
+        description="Optional default dataset alias when --dataset/--project are omitted.",
+    )
+    shared: SharedDefaults = Field(default_factory=SharedDefaults)
+    serve: ServeDefaults = Field(default_factory=ServeDefaults)
+    build: BuildDefaults = Field(default_factory=BuildDefaults)
+
+
+@dataclass
+class WorkspaceContext:
+    file_path: Path
+    config: WorkspaceConfig
+
+    @property
+    def root(self) -> Path:
+        return self.file_path.parent
+
+    def resolve_plugin_root(self) -> Optional[Path]:
+        raw = self.config.plugin_root
+        if not raw:
+            return None
+        candidate = Path(raw)
+        return (
+            candidate.resolve()
+            if candidate.is_absolute()
+            else (self.root / candidate).resolve()
+        )
+
+
+def load_workspace_context(start_dir: Optional[Path] = None) -> Optional[WorkspaceContext]:
+    """Search from start_dir upward for jerry.yaml and return parsed config."""
+    directory = (start_dir or Path.cwd()).resolve()
+    for path in [directory, *directory.parents]:
+        candidate = path / "jerry.yaml"
+        if candidate.is_file():
+            data = load_yaml(candidate)
+            if not isinstance(data, dict):
+                raise TypeError("jerry.yaml must define a mapping at the top level")
+            # Allow users to set serve/build/shared to null to fall back to defaults
+            for key in ("shared", "serve", "build"):
+                if key in data and data[key] is None:
+                    data.pop(key)
+            cfg = WorkspaceConfig.model_validate(data)
+            return WorkspaceContext(file_path=candidate, config=cfg)
+    return None
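A minimal sketch of how the new workspace schema behaves, using only what the validators above define; the dataset alias and paths are illustrative, not taken from the package:

from datapipeline.config.workspace import WorkspaceConfig, load_workspace_context

# Equivalent of a hand-written jerry.yaml mapping; booleans normalize to AUTO/OFF.
data = {
    "datasets": {"default": "your-dataset/project.yaml"},
    "default_dataset": "default",
    "shared": {"visuals": "rich", "progress": False},
    "build": {"mode": "force"},
}
cfg = WorkspaceConfig.model_validate(data)
assert cfg.shared.progress == "OFF"   # False -> "OFF"
assert cfg.build.mode == "FORCE"      # case-insensitive, checked against AUTO/FORCE/OFF

ctx = load_workspace_context()  # walks from cwd upward looking for jerry.yaml; None if absent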
datapipeline/domain/__init__.py
CHANGED
@@ -0,0 +1,12 @@
+from .sample import Sample
+from .vector import Vector
+from .feature import FeatureRecord, FeatureRecordSequence
+from .record import TemporalRecord
+
+__all__ = [
+    "Sample",
+    "Vector",
+    "FeatureRecord",
+    "FeatureRecordSequence",
+    "TemporalRecord",
+]
datapipeline/domain/record.py
CHANGED
@@ -26,3 +26,14 @@ class TemporalRecord(Record):
         data.pop("time", None)
         data.pop("value", None)
         return data
+
+    def __eq__(self, other: object) -> bool:
+        if self is other:
+            return True
+        if not isinstance(other, TemporalRecord):
+            return NotImplemented
+        return (
+            self.time == other.time
+            and self.value == other.value
+            and self._identity_fields() == other._identity_fields()
+        )
datapipeline/domain/sample.py
ADDED
@@ -0,0 +1,54 @@
+from __future__ import annotations
+
+from dataclasses import dataclass, asdict
+from typing import Any, Iterator, Optional, Literal
+
+from .vector import Vector
+
+PayloadMode = Literal["sample", "vector"]
+
+
+@dataclass
+class Sample:
+    """
+    Represents a single grouped vector sample emitted by the pipeline.
+
+    Attributes:
+        key: Group identifier (tuple when group_by cadence > 1).
+        features: Feature vector payload.
+        targets: Optional target vector when requested.
+    """
+
+    key: Any
+    features: Vector
+    targets: Optional[Vector] = None
+
+    def __iter__(self) -> Iterator[Any]:
+        """Retain tuple-like unpacking compatibility."""
+        yield self.key
+        yield self.features
+
+    def __len__(self) -> int:
+        return 2
+
+    def __getitem__(self, idx: int) -> Any:
+        if idx == 0:
+            return self.key
+        if idx == 1:
+            return self.features
+        raise IndexError(idx)
+
+    def with_targets(self, targets: Optional[Vector]) -> "Sample":
+        return Sample(key=self.key, features=self.features, targets=targets)
+
+    def with_features(self, features: Vector) -> "Sample":
+        return Sample(key=self.key, features=features, targets=self.targets)
+
+    def as_full_payload(self) -> dict[str, Any]:
+        return asdict(self)
+
+    def as_vector_payload(self) -> dict[str, Any]:
+        data: dict[str, Any] = {"features": list(self.features.values.values())}
+        if self.targets is not None:
+            data["targets"] = list(self.targets.values.values())
+        return data
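Because Sample keeps tuple-style iteration and indexing, existing (key, vector) consumers keep working. A small sketch, using a stand-in object with the `.values` mapping attribute that Sample expects on its vectors:

from types import SimpleNamespace

from datapipeline.domain.sample import Sample

features = SimpleNamespace(values={"time.ticks.linear": 0.5})
sample = Sample(key=("2024-01-01T00:00",), features=features)

key, vec = sample                  # tuple-like unpacking still works
assert sample[1] is vec and len(sample) == 2
assert sample.as_vector_payload() == {"features": [0.5]}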
datapipeline/integrations/ml/adapter.py
CHANGED
@@ -8,6 +8,7 @@ from typing import Any, Literal
 
 from datapipeline.config.dataset.dataset import FeatureDatasetConfig
 from datapipeline.config.dataset.loader import load_dataset
+from datapipeline.domain.sample import Sample
 from datapipeline.domain.vector import Vector
 from datapipeline.pipeline.context import PipelineContext
 from datapipeline.pipeline.pipelines import build_vector_pipeline
@@ -69,22 +70,21 @@ class VectorAdapter:
         self,
         *,
         limit: int | None = None,
-        include_targets: bool = False,
     ) -> Iterator[tuple[Sequence[Any], Vector]]:
         features = list(_ensure_features(self.dataset))
-
-        try:
-            features += list(getattr(self.dataset, "targets", []) or [])
-        except Exception:
-            pass
+        target_cfgs = list(getattr(self.dataset, "targets", []) or [])
         context = PipelineContext(self.runtime)
         vectors = build_vector_pipeline(
-            context,
+            context,
+            features,
+            self.dataset.group_by,
+            target_configs=target_cfgs,
         )
-
+        base_stream = post_process(context, vectors)
+        sample_iter = base_stream
         if limit is not None:
-
-            return
+            sample_iter = islice(sample_iter, limit)
+        return ((sample.key, sample.features) for sample in sample_iter)
 
     def iter_rows(
         self,
@@ -94,24 +94,38 @@ class VectorAdapter:
         group_format: GroupFormat = "mapping",
         group_column: str = "group",
         flatten_sequences: bool = False,
-        include_targets: bool = False,
     ) -> Iterator[dict[str, Any]]:
-
+        features = list(_ensure_features(self.dataset))
+        target_cfgs = list(getattr(self.dataset, "targets", []) or [])
+        context = PipelineContext(self.runtime)
+        vectors = build_vector_pipeline(
+            context,
+            features,
+            self.dataset.group_by,
+            target_configs=target_cfgs,
+        )
+        base_stream = post_process(context, vectors)
+        if limit is not None:
+            base_stream = islice(base_stream, limit)
         group_by = self.dataset.group_by
 
         def _rows() -> Iterator[dict[str, Any]]:
-            for
+            for sample in base_stream:
                 row: dict[str, Any] = {}
                 if include_group:
                     row[group_column] = _normalize_group(
-
+                        sample.key, group_by, group_format
                     )
-
-
-
-
-
-
+                vectors = [sample.features]
+                if sample.targets:
+                    vectors.append(sample.targets)
+                for vector in vectors:
+                    for feature_id, value in vector.values.items():
+                        if flatten_sequences and isinstance(value, list):
+                            for idx, item in enumerate(value):
+                                row[f"{feature_id}[{idx}]"] = item
+                        else:
+                            row[feature_id] = value
                 yield row
 
         return _rows()
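With include_targets gone, target vectors are driven entirely by the dataset's own `targets` section, and stream() always yields (key, features) pairs. A usage sketch with an illustrative project path:

from datapipeline.integrations.ml.adapter import VectorAdapter

adapter = VectorAdapter.from_project("your-dataset/project.yaml")
for key, vector in adapter.stream(limit=5):
    print(key, vector.values)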
datapipeline/integrations/ml/pandas_support.py
CHANGED
@@ -15,7 +15,6 @@ def dataframe_from_vectors(
     group_format: GroupFormat = "mapping",
     group_column: str = "group",
     flatten_sequences: bool = False,
-    include_targets: bool = False,
     open_stream: Callable[[str], Iterable[Any]] | None = None,
 ):
     """Return a Pandas DataFrame built from project vectors.
@@ -37,7 +36,6 @@ def dataframe_from_vectors(
         group_format=group_format,
         group_column=group_column,
         flatten_sequences=flatten_sequences,
-        include_targets=include_targets,
         open_stream=open_stream,
     )
     return pd.DataFrame(rows)
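A corresponding call site, assuming the project path is the first positional argument as in the rows helpers below:

from datapipeline.integrations.ml.pandas_support import dataframe_from_vectors

df = dataframe_from_vectors(
    "your-dataset/project.yaml",
    flatten_sequences=True,   # expands list-valued features into feature[i] columns
)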
datapipeline/integrations/ml/rows.py
CHANGED
@@ -13,13 +13,12 @@ def stream_vectors(
     project_yaml: str | Path,
     *,
     limit: int | None = None,
-    include_targets: bool = False,
 ) -> Iterator[tuple[Sequence[Any], Vector]]:
     """Yield ``(group_key, Vector)`` pairs for the configured project."""
 
     adapter = VectorAdapter.from_project(project_yaml)
     try:
-        return adapter.stream(limit=limit
+        return adapter.stream(limit=limit)
     except ValueError:
         return iter(())
 
@@ -32,7 +31,6 @@ def iter_vector_rows(
     group_format: GroupFormat = "mapping",
     group_column: str = "group",
     flatten_sequences: bool = False,
-    include_targets: bool = False,
 ) -> Iterator[dict[str, Any]]:
     """Return an iterator of row dictionaries derived from vectors."""
 
@@ -44,7 +42,6 @@ def iter_vector_rows(
            group_format=group_format,
            group_column=group_column,
            flatten_sequences=flatten_sequences,
-           include_targets=include_targets,
        )
    except ValueError:
        return iter(())
@@ -58,7 +55,6 @@ def collect_vector_rows(
     group_format: GroupFormat = "mapping",
     group_column: str = "group",
     flatten_sequences: bool = False,
-    include_targets: bool = False,
     open_stream=None,
 ) -> list[dict[str, Any]]:
     """Materialize :func:`iter_vector_rows` into a list for eager workflows."""
@@ -70,7 +66,6 @@ def collect_vector_rows(
         group_format=group_format,
         group_column=group_column,
         flatten_sequences=flatten_sequences,
-        include_targets=include_targets,
     )
     return list(iterator)
 
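Callers that previously passed include_targets simply drop the keyword; the stream_vectors signature shown in the hunk above is otherwise unchanged:

from datapipeline.integrations.ml.rows import stream_vectors

for group_key, vector in stream_vectors("your-dataset/project.yaml", limit=10):
    print(group_key, vector.values)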
datapipeline/integrations/ml/torch_support.py
CHANGED
@@ -35,7 +35,6 @@ def torch_dataset(
     dtype: Any | None = None,
     device: Any | None = None,
     flatten_sequences: bool = False,
-    include_targets: bool = False,
 ):
     """Build a torch.utils.data.Dataset that yields tensors from vectors."""
 
@@ -52,10 +51,9 @@
         limit=limit,
         include_group=False,
         flatten_sequences=flatten_sequences,
-        include_targets=include_targets,
     )
 
-    if
+    if target_columns is None:
         try:
             ds = load_dataset(Path(project_yaml), "vectors")
             target_columns = [cfg.id for cfg in getattr(ds, "targets", []) or []]
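When target_columns is left as None, the dataset's own `targets` ids are used. A hedged sketch; the positional project argument is assumed from the load_dataset call above:

from datapipeline.integrations.ml.torch_support import torch_dataset

ds = torch_dataset("your-dataset/project.yaml", flatten_sequences=True)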
datapipeline/io/factory.py
ADDED
@@ -0,0 +1,112 @@
+from typing import Optional
+
+from datapipeline.io.writers import (
+    JsonLinesFileWriter,
+    JsonLinesStdoutWriter,
+    GzipJsonLinesWriter,
+    CsvFileWriter,
+    PickleFileWriter,
+    LineWriter,
+)
+from datapipeline.io.protocols import Writer
+from datapipeline.io.serializers import (
+    json_line_serializer,
+    print_serializer,
+    csv_row_serializer,
+    pickle_serializer,
+    record_json_line_serializer,
+    record_print_serializer,
+    record_csv_row_serializer,
+    record_pickle_serializer,
+)
+from datapipeline.io.sinks import StdoutTextSink, RichStdoutSink, ReprRichFormatter, JsonRichFormatter, PlainRichFormatter
+from datapipeline.io.output import OutputTarget
+
+
+def stdout_sink_for(format_: str, visuals: Optional[str]) -> StdoutTextSink:
+    """Select an appropriate stdout sink given format and visuals preference.
+
+    Behavior:
+    - visuals == "rich" or "auto" -> attempt Rich formatting; fallback to plain on error.
+    - anything else -> plain stdout (no Rich formatting).
+    """
+    fmt = (format_ or "print").lower()
+    provider = (visuals or "auto").lower()
+
+    use_rich = provider == "rich" or provider == "auto"
+    if not use_rich:
+        return StdoutTextSink()
+
+    # Prefer Rich when possible; gracefully degrade to plain stdout on any failure.
+    try:
+        if fmt in {"json", "json-lines", "jsonl"}:
+            return RichStdoutSink(JsonRichFormatter())
+        if fmt == "print":
+            return RichStdoutSink(ReprRichFormatter())
+        return RichStdoutSink(PlainRichFormatter())
+    except Exception:
+        return StdoutTextSink()
+
+
+def writer_factory(
+    target: OutputTarget,
+    *,
+    visuals: Optional[str] = None,
+    item_type: str = "sample",
+) -> Writer:
+    transport = target.transport.lower()
+    format_ = target.format.lower()
+    payload = target.payload
+
+    if item_type not in {"sample", "record"}:
+        raise ValueError(f"Unsupported writer item_type '{item_type}'")
+
+    if transport == "stdout":
+        sink = stdout_sink_for(format_, visuals)
+        if format_ in {"json-lines", "json", "jsonl"}:
+            serializer = (
+                record_json_line_serializer()
+                if item_type == "record"
+                else json_line_serializer(payload)
+            )
+            return LineWriter(sink, serializer)
+        if format_ == "print":
+            serializer = (
+                record_print_serializer()
+                if item_type == "record"
+                else print_serializer(payload)
+            )
+            return LineWriter(sink, serializer)
+        raise ValueError(f"Unsupported stdout format '{target.format}'")
+
+    destination = target.destination
+    if destination is None:
+        raise ValueError("fs output requires a destination path")
+    destination.parent.mkdir(parents=True, exist_ok=True)
+
+    suffix = "".join(destination.suffixes).lower()
+    if format_ in {"json-lines", "json", "jsonl"}:
+        serializer = (
+            record_json_line_serializer()
+            if item_type == "record"
+            else json_line_serializer(payload)
+        )
+        if suffix.endswith(".jsonl.gz") or suffix.endswith(".json.gz") or suffix.endswith(".gz"):
+            return GzipJsonLinesWriter(destination, serializer=serializer)
+        return JsonLinesFileWriter(destination, serializer=serializer)
+    if format_ == "csv":
+        serializer = (
+            record_csv_row_serializer()
+            if item_type == "record"
+            else csv_row_serializer(payload)
+        )
+        return CsvFileWriter(destination, serializer=serializer)
+    if format_ == "pickle":
+        serializer = (
+            record_pickle_serializer()
+            if item_type == "record"
+            else pickle_serializer(payload)
+        )
+        return PickleFileWriter(destination, serializer=serializer)
+
+    raise ValueError(f"Unsupported fs format '{target.format}'")
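A usage sketch tying the factory to the OutputTarget defined below; the path is illustrative, and the exact record shape the serializer expects depends on the payload mode:

from pathlib import Path

from datapipeline.io.factory import writer_factory
from datapipeline.io.output import OutputTarget

target = OutputTarget(
    transport="fs",
    format="json-lines",
    destination=Path("out/vectors.jsonl"),
)
writer = writer_factory(target, visuals="off", item_type="sample")
writer.write({"features": [0.5]})  # shape shown for illustration only
writer.close()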
datapipeline/io/output.py
ADDED
@@ -0,0 +1,132 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Optional
+
+from datapipeline.config.tasks import ServeOutputConfig
+from datapipeline.services.runs import RunPaths, start_run_for_directory
+
+
+def _format_suffix(fmt: str) -> str:
+    suffix_map = {
+        "json-lines": ".jsonl",
+        "json": ".json",
+        "csv": ".csv",
+        "pickle": ".pkl",
+    }
+    return suffix_map.get(fmt, ".out")
+
+
+def _default_filename_for_format(fmt: str) -> str:
+    suffix = _format_suffix(fmt)
+    return f"vectors{suffix}"
+
+
+def _sanitize_segment(value: str) -> str:
+    cleaned = "".join(
+        ch if ch.isalnum() or ch in ("_", "-", ".") else "_"
+        for ch in value.strip()
+    )
+    return cleaned or "run"
+
+
+@dataclass(frozen=True)
+class OutputTarget:
+    """Resolved writer target describing how and where to emit records."""
+
+    transport: str  # stdout | fs
+    format: str  # print | json-lines | json | csv | pickle
+    destination: Optional[Path]
+    payload: str = "sample"
+    run: RunPaths | None = None
+
+    def for_feature(self, feature_id: str) -> "OutputTarget":
+        if self.transport != "fs" or self.destination is None:
+            return self
+        safe_feature = "".join(
+            ch if ch.isalnum() or ch in ("_", "-", ".") else "_"
+            for ch in str(feature_id)
+        )
+        dest = self.destination
+        suffix = "".join(dest.suffixes)
+        stem = dest.name[: -len(suffix)] if suffix else dest.name
+        new_name = f"{stem}.{safe_feature}{suffix}"
+        new_path = dest.with_name(new_name)
+        return OutputTarget(
+            transport=self.transport,
+            format=self.format,
+            destination=new_path,
+            payload=self.payload,
+            run=self.run,
+        )
+
+
+class OutputResolutionError(ValueError):
+    """Raised when CLI/config output options cannot be resolved."""
+
+
+def resolve_output_target(
+    *,
+    cli_output: ServeOutputConfig | None,
+    config_output: ServeOutputConfig | None,
+    default: ServeOutputConfig | None = None,
+    base_path: Path | None = None,
+    run_name: str | None = None,
+    payload_override: str | None = None,
+    stage: int | None = None,
+    create_run: bool = False,
+) -> OutputTarget:
+    """
+    Resolve the effective output target using CLI override, run config, or default.
+    """
+
+    base_path = base_path or Path.cwd()
+
+    config = cli_output or config_output or default
+    if config is None:
+        config = ServeOutputConfig(transport="stdout", format="print")
+
+    payload = payload_override or config.payload or "sample"
+
+    if config.transport == "stdout":
+        return OutputTarget(
+            transport="stdout",
+            format=config.format,
+            destination=None,
+            payload=payload,
+            run=None,
+        )
+
+    if config.directory is None:
+        raise OutputResolutionError("fs output requires a directory")
+    directory = (
+        config.directory
+        if config.directory.is_absolute()
+        else (base_path / config.directory).resolve()
+    )
+    if create_run:
+        run_paths, _ = start_run_for_directory(directory, stage=stage)
+        base_dest_dir = run_paths.dataset_dir
+    else:
+        run_paths = None
+        # When not creating a managed run, nest outputs under an optional
+        # run_name subdirectory to keep layouts consistent with tests/CLI.
+        base_dest_dir = directory
+        if run_name:
+            base_dest_dir = base_dest_dir / _sanitize_segment(run_name)
+    suffix = _format_suffix(config.format)
+    filename_stem = config.filename or run_name
+    if filename_stem:
+        filename = f"{filename_stem}{suffix}"
+    else:
+        filename = _default_filename_for_format(config.format)
+    dest_path = (base_dest_dir / filename).resolve()
+
+    return OutputTarget(
+        transport="fs",
+        format=config.format,
+        destination=dest_path,
+        payload=payload,
+        run=run_paths,
+    )
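Resolution order is CLI override, then run config, then the supplied default, with stdout/print as the final fallback:

from datapipeline.io.output import resolve_output_target

target = resolve_output_target(cli_output=None, config_output=None)
assert target.transport == "stdout" and target.destination is None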
datapipeline/io/protocols.py
ADDED
@@ -0,0 +1,21 @@
+from typing import Protocol, Optional, runtime_checkable
+from pathlib import Path
+
+
+@runtime_checkable
+class Writer(Protocol):
+    def write(self, rec: dict) -> None: ...
+    def close(self) -> None: ...
+
+
+@runtime_checkable
+class HeaderCapable(Protocol):
+    """Writers that can accept an injected logical 'header record' as the first write."""
+
+    def write_header(self, header: dict) -> None: ...
+
+
+@runtime_checkable
+class HasFilePath(Protocol):
+    @property
+    def file_path(self) -> Optional[Path]: ...