jerry-thomas 0.3.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- datapipeline/analysis/vector/collector.py +120 -17
- datapipeline/analysis/vector/matrix.py +33 -8
- datapipeline/analysis/vector/report.py +162 -32
- datapipeline/build/tasks/__init__.py +11 -0
- datapipeline/build/tasks/config.py +74 -0
- datapipeline/build/tasks/metadata.py +170 -0
- datapipeline/build/tasks/scaler.py +73 -0
- datapipeline/build/tasks/schema.py +60 -0
- datapipeline/build/tasks/utils.py +169 -0
- datapipeline/cli/app.py +304 -127
- datapipeline/cli/commands/build.py +240 -16
- datapipeline/cli/commands/contract.py +367 -0
- datapipeline/cli/commands/domain.py +8 -3
- datapipeline/cli/commands/inspect.py +401 -149
- datapipeline/cli/commands/list_.py +30 -7
- datapipeline/cli/commands/plugin.py +5 -1
- datapipeline/cli/commands/run.py +227 -241
- datapipeline/cli/commands/run_config.py +101 -0
- datapipeline/cli/commands/serve_pipeline.py +156 -0
- datapipeline/cli/commands/source.py +44 -8
- datapipeline/cli/visuals/__init__.py +4 -2
- datapipeline/cli/visuals/common.py +239 -0
- datapipeline/cli/visuals/labels.py +15 -15
- datapipeline/cli/visuals/runner.py +66 -0
- datapipeline/cli/visuals/sections.py +20 -0
- datapipeline/cli/visuals/sources.py +132 -119
- datapipeline/cli/visuals/sources_basic.py +260 -0
- datapipeline/cli/visuals/sources_off.py +76 -0
- datapipeline/cli/visuals/sources_rich.py +414 -0
- datapipeline/config/catalog.py +37 -3
- datapipeline/config/context.py +214 -0
- datapipeline/config/dataset/loader.py +21 -4
- datapipeline/config/dataset/normalize.py +4 -4
- datapipeline/config/metadata.py +43 -0
- datapipeline/config/postprocess.py +2 -2
- datapipeline/config/project.py +3 -2
- datapipeline/config/resolution.py +129 -0
- datapipeline/config/tasks.py +309 -0
- datapipeline/config/workspace.py +155 -0
- datapipeline/domain/__init__.py +12 -0
- datapipeline/domain/record.py +11 -0
- datapipeline/domain/sample.py +54 -0
- datapipeline/integrations/ml/adapter.py +34 -20
- datapipeline/integrations/ml/pandas_support.py +0 -2
- datapipeline/integrations/ml/rows.py +1 -6
- datapipeline/integrations/ml/torch_support.py +1 -3
- datapipeline/io/factory.py +112 -0
- datapipeline/io/output.py +132 -0
- datapipeline/io/protocols.py +21 -0
- datapipeline/io/serializers.py +219 -0
- datapipeline/io/sinks/__init__.py +23 -0
- datapipeline/io/sinks/base.py +2 -0
- datapipeline/io/sinks/files.py +79 -0
- datapipeline/io/sinks/rich.py +57 -0
- datapipeline/io/sinks/stdout.py +18 -0
- datapipeline/io/writers/__init__.py +14 -0
- datapipeline/io/writers/base.py +28 -0
- datapipeline/io/writers/csv_writer.py +25 -0
- datapipeline/io/writers/jsonl.py +52 -0
- datapipeline/io/writers/pickle_writer.py +30 -0
- datapipeline/pipeline/artifacts.py +58 -0
- datapipeline/pipeline/context.py +66 -7
- datapipeline/pipeline/observability.py +65 -0
- datapipeline/pipeline/pipelines.py +65 -13
- datapipeline/pipeline/split.py +11 -10
- datapipeline/pipeline/stages.py +127 -16
- datapipeline/pipeline/utils/keygen.py +20 -7
- datapipeline/pipeline/utils/memory_sort.py +22 -10
- datapipeline/pipeline/utils/transform_utils.py +22 -0
- datapipeline/runtime.py +5 -2
- datapipeline/services/artifacts.py +12 -6
- datapipeline/services/bootstrap/config.py +25 -0
- datapipeline/services/bootstrap/core.py +52 -37
- datapipeline/services/constants.py +6 -5
- datapipeline/services/factories.py +123 -1
- datapipeline/services/project_paths.py +43 -16
- datapipeline/services/runs.py +208 -0
- datapipeline/services/scaffold/domain.py +3 -2
- datapipeline/services/scaffold/filter.py +3 -2
- datapipeline/services/scaffold/mappers.py +9 -6
- datapipeline/services/scaffold/plugin.py +54 -10
- datapipeline/services/scaffold/source.py +93 -56
- datapipeline/sources/{composed_loader.py → data_loader.py} +9 -9
- datapipeline/sources/decoders.py +83 -18
- datapipeline/sources/factory.py +26 -16
- datapipeline/sources/models/__init__.py +2 -2
- datapipeline/sources/models/generator.py +0 -7
- datapipeline/sources/models/loader.py +3 -3
- datapipeline/sources/models/parsing_error.py +24 -0
- datapipeline/sources/models/source.py +6 -6
- datapipeline/sources/synthetic/time/loader.py +14 -2
- datapipeline/sources/transports.py +74 -37
- datapipeline/templates/plugin_skeleton/README.md +76 -30
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +31 -0
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +30 -0
- datapipeline/templates/plugin_skeleton/example/dataset.yaml +18 -0
- datapipeline/templates/plugin_skeleton/example/postprocess.yaml +29 -0
- datapipeline/templates/plugin_skeleton/{config/datasets/default → example}/project.yaml +11 -8
- datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +12 -0
- datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +3 -0
- datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +9 -0
- datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +2 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +4 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +28 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +4 -0
- datapipeline/templates/plugin_skeleton/jerry.yaml +34 -0
- datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +31 -0
- datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +30 -0
- datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +18 -0
- datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +29 -0
- datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +22 -0
- datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +12 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +3 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +9 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +2 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +4 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +28 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +4 -0
- datapipeline/templates/stubs/dto.py.j2 +2 -0
- datapipeline/templates/stubs/mapper.py.j2 +5 -4
- datapipeline/templates/stubs/parser.py.j2 +2 -0
- datapipeline/templates/stubs/record.py.j2 +2 -0
- datapipeline/templates/stubs/source.yaml.j2 +2 -3
- datapipeline/transforms/debug/lint.py +26 -41
- datapipeline/transforms/feature/scaler.py +89 -13
- datapipeline/transforms/record/floor_time.py +4 -4
- datapipeline/transforms/sequence.py +2 -35
- datapipeline/transforms/stream/dedupe.py +24 -0
- datapipeline/transforms/stream/ensure_ticks.py +7 -6
- datapipeline/transforms/vector/__init__.py +5 -0
- datapipeline/transforms/vector/common.py +98 -0
- datapipeline/transforms/vector/drop/__init__.py +4 -0
- datapipeline/transforms/vector/drop/horizontal.py +79 -0
- datapipeline/transforms/vector/drop/orchestrator.py +59 -0
- datapipeline/transforms/vector/drop/vertical.py +182 -0
- datapipeline/transforms/vector/ensure_schema.py +184 -0
- datapipeline/transforms/vector/fill.py +87 -0
- datapipeline/transforms/vector/replace.py +62 -0
- datapipeline/utils/load.py +24 -3
- datapipeline/utils/rich_compat.py +38 -0
- datapipeline/utils/window.py +76 -0
- jerry_thomas-1.0.1.dist-info/METADATA +825 -0
- jerry_thomas-1.0.1.dist-info/RECORD +199 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/entry_points.txt +9 -8
- datapipeline/build/tasks.py +0 -186
- datapipeline/cli/commands/link.py +0 -128
- datapipeline/cli/commands/writers.py +0 -138
- datapipeline/config/build.py +0 -64
- datapipeline/config/run.py +0 -116
- datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.synthetic.yaml +0 -24
- datapipeline/templates/plugin_skeleton/config/contracts/time_linear.synthetic.yaml +0 -23
- datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +0 -9
- datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +0 -14
- datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +0 -13
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml +0 -11
- datapipeline/transforms/vector.py +0 -210
- jerry_thomas-0.3.0.dist-info/METADATA +0 -502
- jerry_thomas-0.3.0.dist-info/RECORD +0 -139
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/WHEEL +0 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/licenses/LICENSE +0 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,170 @@
+from __future__ import annotations
+
+import json
+from collections import defaultdict
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Dict, Tuple
+
+from datapipeline.config.dataset.loader import load_dataset
+from datapipeline.config.metadata import (
+    VectorMetadata,
+    Window,
+    FEATURE_VECTORS_COUNT_KEY,
+    TARGET_VECTORS_COUNT_KEY,
+)
+from datapipeline.config.tasks import MetadataTask
+from datapipeline.runtime import Runtime
+from datapipeline.utils.paths import ensure_parent
+from datapipeline.config.dataset.normalize import floor_time_to_bucket
+from datapipeline.utils.time import parse_timecode
+
+from .utils import collect_schema_entries, metadata_entries_from_stats
+
+
+def _entry_window(entry: dict) -> tuple[datetime | None, datetime | None]:
+    start = entry.get("first_ts")
+    end = entry.get("last_ts")
+    return (start if isinstance(start, datetime) else None, end if isinstance(end, datetime) else None)
+
+
+def _group_ranges(entries: list[dict], key_name: str) -> list[tuple[datetime, datetime]]:
+    grouped: dict[str, list[tuple[datetime, datetime]]] = defaultdict(list)
+    for entry in entries:
+        start, end = _entry_window(entry)
+        if start is None or end is None:
+            continue
+        group_key = entry.get(key_name) or entry.get("id")
+        if not isinstance(group_key, str):
+            continue
+        grouped[group_key].append((start, end))
+    ranges: list[tuple[datetime, datetime]] = []
+    for values in grouped.values():
+        group_start = min(start for start, _ in values)
+        group_end = max(end for _, end in values)
+        ranges.append((group_start, group_end))
+    return ranges
+
+
+def _range_union(ranges):
+    if not ranges:
+        return None, None
+    start = min(r[0] for r in ranges)
+    end = max(r[1] for r in ranges)
+    if start >= end:
+        return None, None
+    return start, end
+
+
+def _range_intersection(ranges):
+    if not ranges:
+        return None, None
+    start = max(r[0] for r in ranges)
+    end = min(r[1] for r in ranges)
+    if start >= end:
+        return None, None
+    return start, end
+
+
+def _window_bounds_from_stats(
+    feature_stats: list[dict],
+    target_stats: list[dict],
+    *,
+    mode: str,
+) -> tuple[datetime | None, datetime | None]:
+    base_ranges = _group_ranges(
+        feature_stats, "base_id") + _group_ranges(target_stats, "base_id")
+    partition_ranges = _group_ranges(
+        feature_stats, "id") + _group_ranges(target_stats, "id")
+
+    if mode == "intersection":
+        return _range_intersection(base_ranges)
+    if mode == "strict":
+        return _range_intersection(partition_ranges)
+    if mode == "relaxed":
+        return _range_union(partition_ranges)
+    # default to union
+    return _range_union(base_ranges if base_ranges else partition_ranges)
+
+
+def _window_size(start: datetime | None, end: datetime | None, cadence: str | None) -> int | None:
+    if start is None or end is None or cadence is None:
+        return None
+    try:
+        anchored_start = floor_time_to_bucket(start, cadence)
+        anchored_end = floor_time_to_bucket(end, cadence)
+        step = parse_timecode(cadence)
+        if anchored_end < anchored_start:
+            return None
+        return int(((anchored_end - anchored_start) / step)) + 1
+    except Exception:
+        return None
+
+
+def materialize_metadata(runtime: Runtime, task_cfg: MetadataTask) -> Tuple[str, Dict[str, object]] | None:
+    if not task_cfg.enabled:
+        return None
+    dataset = load_dataset(runtime.project_yaml, "vectors")
+    features_cfgs = list(dataset.features or [])
+    feature_stats, feature_vectors, feature_min, feature_max = collect_schema_entries(
+        runtime,
+        features_cfgs,
+        dataset.group_by,
+        cadence_strategy=task_cfg.cadence_strategy,
+        collect_metadata=True,
+    )
+    target_meta: list[dict] = []
+    target_vectors = 0
+    target_cfgs = list(dataset.targets or [])
+    target_stats: list[dict] = []
+    target_min = target_max = None
+    if target_cfgs:
+        target_stats, target_vectors, target_min, target_max = collect_schema_entries(
+            runtime,
+            target_cfgs,
+            dataset.group_by,
+            cadence_strategy=task_cfg.cadence_strategy,
+            collect_metadata=True,
+        )
+        target_meta = metadata_entries_from_stats(
+            target_stats, task_cfg.cadence_strategy)
+    feature_meta = metadata_entries_from_stats(
+        feature_stats, task_cfg.cadence_strategy)
+
+    generated_at = datetime.now(timezone.utc)
+    window_obj: Window | None = None
+    computed_start, computed_end = _window_bounds_from_stats(
+        feature_stats,
+        target_stats if target_cfgs else [],
+        mode=task_cfg.window_mode,
+    )
+    start = computed_start
+    end = computed_end
+    if start is not None and end is not None and start < end:
+        size = _window_size(start, end, dataset.group_by)
+        window_obj = Window(start=start, end=end,
+                            mode=task_cfg.window_mode, size=size)
+
+    doc = VectorMetadata(
+        schema_version=1,
+        generated_at=generated_at,
+        features=feature_meta,
+        targets=target_meta,
+        counts={
+            FEATURE_VECTORS_COUNT_KEY: feature_vectors,
+            TARGET_VECTORS_COUNT_KEY: target_vectors,
+        },
+        window=window_obj,
+    )
+
+    relative_path = Path(task_cfg.output)
+    destination = (runtime.artifacts_root / relative_path).resolve()
+    ensure_parent(destination)
+    with destination.open("w", encoding="utf-8") as fh:
+        json.dump(doc.model_dump(mode="json"), fh, indent=2)
+
+    meta: Dict[str, object] = {
+        "features": len(feature_meta),
+        "targets": len(target_meta),
+    }
+    return str(relative_path), meta
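Judging by the `+170 -0` entry in the file list, this first hunk appears to add `datapipeline/build/tasks/metadata.py`. Its least obvious part is the `window_mode` handling: `intersection` and `strict` take the overlap of coverage ranges (grouped by `base_id` or by full partition `id`), while `relaxed` and the default take their union, and degenerate spans collapse to `(None, None)`. A minimal, self-contained sketch of those two rules with hypothetical coverage ranges (no package imports):

```python
# Illustrative only: mirrors the window-mode rules from the hunk above using
# hypothetical per-feature coverage ranges.
from datetime import datetime

ranges = [
    (datetime(2024, 1, 1), datetime(2024, 1, 10)),  # feature A coverage
    (datetime(2024, 1, 3), datetime(2024, 1, 8)),   # feature B coverage
]

# "relaxed" / default union: earliest start to latest end across ranges.
union = (min(r[0] for r in ranges), max(r[1] for r in ranges))

# "intersection" / "strict": latest start to earliest end; empty overlaps collapse.
start, end = max(r[0] for r in ranges), min(r[1] for r in ranges)
intersection = (start, end) if start < end else (None, None)

print(union)         # 2024-01-01 .. 2024-01-10
print(intersection)  # 2024-01-03 .. 2024-01-08
```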
@@ -0,0 +1,73 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Dict, Iterator, Tuple
+
+from datapipeline.config.tasks import ScalerTask
+from datapipeline.config.dataset.loader import load_dataset
+from datapipeline.domain.sample import Sample
+from datapipeline.pipeline.context import PipelineContext
+from datapipeline.pipeline.pipelines import build_vector_pipeline
+from datapipeline.pipeline.split import build_labeler
+from datapipeline.runtime import Runtime
+from datapipeline.transforms.feature.scaler import StandardScaler
+from datapipeline.utils.paths import ensure_parent
+
+
+def materialize_scaler_statistics(runtime: Runtime, task_cfg: ScalerTask) -> Tuple[str, Dict[str, object]] | None:
+    if not task_cfg.enabled:
+        return None
+
+    dataset = load_dataset(runtime.project_yaml, "vectors")
+    feature_cfgs = list(dataset.features or [])
+    target_cfgs = list(dataset.targets or [])
+    if not feature_cfgs and not target_cfgs:
+        return None
+
+    sanitized_features = [cfg.model_copy(update={"scale": False}) for cfg in feature_cfgs]
+    sanitized_targets = [cfg.model_copy(update={"scale": False}) for cfg in target_cfgs]
+
+    context = PipelineContext(runtime)
+    vectors = build_vector_pipeline(
+        context,
+        sanitized_features,
+        dataset.group_by,
+        target_configs=sanitized_targets,
+        rectangular=False,
+    )
+
+    cfg = getattr(runtime, "split", None)
+    labeler = build_labeler(cfg) if cfg else None
+    if not labeler and task_cfg.split_label != "all":
+        raise RuntimeError(
+            f"Cannot compute scaler statistics for split '{task_cfg.split_label}' "
+            "when no split configuration is defined in the project."
+        )
+
+    def _train_stream() -> Iterator[Sample]:
+        for sample in vectors:
+            if labeler and labeler.label(sample.key, sample.features) != task_cfg.split_label:
+                continue
+            yield sample
+
+    scaler = StandardScaler()
+    total_observations = scaler.fit(_train_stream())
+
+    if not scaler.statistics:
+        raise RuntimeError(
+            f"No scaler statistics computed for split '{task_cfg.split_label}'."
+        )
+
+    relative_path = Path(task_cfg.output)
+    destination = (runtime.artifacts_root / relative_path).resolve()
+    ensure_parent(destination)
+
+    scaler.save(destination)
+
+    meta: Dict[str, object] = {
+        "features": len(scaler.statistics),
+        "split": task_cfg.split_label,
+        "observations": total_observations,
+    }
+
+    return str(relative_path), meta
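This second hunk matches `datapipeline/build/tasks/scaler.py +73` in the file list, as far as the counts suggest: it fits a `StandardScaler` over a split-filtered vector stream in a single pass. The standalone sketch below mimics that pattern with toy data and a naive per-feature mean/std accumulator; the sample shape, feature names, and the accumulator itself are illustrative assumptions, not the package's `StandardScaler`.

```python
# Illustrative only: filter a stream by split label, then accumulate
# per-feature statistics in one pass (hypothetical data and feature ids).
from collections import defaultdict
from math import sqrt

samples = [
    ("train", {"temp": 1.0, "load": 10.0}),
    ("val",   {"temp": 9.0, "load": 90.0}),
    ("train", {"temp": 3.0, "load": 30.0}),
]

def train_stream(split_label="train"):
    for label, features in samples:
        if label != split_label:
            continue  # skip samples outside the requested split
        yield features

acc = defaultdict(lambda: [0, 0.0, 0.0])  # count, sum, sum of squares per feature
for features in train_stream():
    for fid, value in features.items():
        entry = acc[fid]
        entry[0] += 1
        entry[1] += value
        entry[2] += value * value

stats = {
    fid: {"mean": s / n, "std": sqrt(max(ss / n - (s / n) ** 2, 0.0))}
    for fid, (n, s, ss) in acc.items()
}
print(stats)  # {'temp': {'mean': 2.0, 'std': 1.0}, 'load': {'mean': 20.0, 'std': 10.0}}
```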
@@ -0,0 +1,60 @@
+from __future__ import annotations
+
+import json
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Dict, Tuple
+
+from datapipeline.config.tasks import SchemaTask
+from datapipeline.config.dataset.loader import load_dataset
+from datapipeline.runtime import Runtime
+from datapipeline.utils.paths import ensure_parent
+from datapipeline.utils.window import resolve_window_bounds
+
+from .utils import collect_schema_entries, schema_entries_from_stats
+
+
+def materialize_vector_schema(runtime: Runtime, task_cfg: SchemaTask) -> Tuple[str, Dict[str, object]] | None:
+    if not task_cfg.enabled:
+        return None
+    dataset = load_dataset(runtime.project_yaml, "vectors")
+    features_cfgs = list(dataset.features or [])
+    feature_stats, feature_vectors, feature_min, feature_max = collect_schema_entries(
+        runtime,
+        features_cfgs,
+        dataset.group_by,
+        cadence_strategy=task_cfg.cadence_strategy,
+        collect_metadata=False,
+    )
+    target_entries: list[dict] = []
+    target_cfgs = list(dataset.targets or [])
+    target_min = target_max = None
+    if target_cfgs:
+        target_stats, _, target_min, target_max = collect_schema_entries(
+            runtime,
+            target_cfgs,
+            dataset.group_by,
+            cadence_strategy=task_cfg.cadence_strategy,
+            collect_metadata=False,
+        )
+        target_entries = schema_entries_from_stats(target_stats, task_cfg.cadence_strategy)
+    feature_entries = schema_entries_from_stats(feature_stats, task_cfg.cadence_strategy)
+
+    doc = {
+        "schema_version": 1,
+        "generated_at": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
+    }
+    doc["features"] = feature_entries
+    doc["targets"] = target_entries
+
+    relative_path = Path(task_cfg.output)
+    destination = (runtime.artifacts_root / relative_path).resolve()
+    ensure_parent(destination)
+    with destination.open("w", encoding="utf-8") as fh:
+        json.dump(doc, fh, indent=2)
+
+    meta: Dict[str, object] = {
+        "features": len(feature_entries),
+        "targets": len(target_entries),
+    }
+    return str(relative_path), meta
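The third hunk looks like `datapipeline/build/tasks/schema.py +60`: it writes a JSON schema document with `schema_version`, a UTC `generated_at` stamp, and per-feature/per-target entries produced by `schema_entries_from_stats`. A rough sketch of the resulting document shape, using one feature id taken from the skeleton contracts in the file list and one hypothetical list-valued feature:

```python
# Illustrative only: approximate shape of the schema document written by
# materialize_vector_schema above; the list-valued feature id is hypothetical.
import json
from datetime import datetime, timezone

doc = {
    "schema_version": 1,
    "generated_at": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
    "features": [
        {"id": "time.ticks.hour_sin", "base_id": "time.ticks.hour_sin", "kind": "scalar"},
        # list-valued features additionally carry a cadence target under the "max" strategy
        {"id": "time.ticks.window", "base_id": "time.ticks.window", "kind": "list",
         "cadence": {"strategy": "max", "target": 24}},
    ],
    "targets": [],
}
print(json.dumps(doc, indent=2))
```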
@@ -0,0 +1,169 @@
+from __future__ import annotations
+
+from collections import Counter, OrderedDict
+from datetime import datetime
+from typing import Any
+
+from datapipeline.pipeline.context import PipelineContext
+from datapipeline.pipeline.pipelines import build_vector_pipeline
+from datapipeline.runtime import Runtime
+from datapipeline.transforms.vector_utils import base_id as _base_feature_id
+from datapipeline.transforms.utils import is_missing
+
+
+def _type_name(value: object) -> str:
+    if value is None:
+        return "null"
+    return type(value).__name__
+
+
+def collect_schema_entries(
+    runtime: Runtime,
+    configs,
+    group_by: str,
+    *,
+    cadence_strategy: str,
+    collect_metadata: bool,
+) -> tuple[list[dict], int, datetime | None, datetime | None]:
+    configs = list(configs or [])
+    if not configs:
+        return [], 0, None, None
+    sanitized = [cfg.model_copy(update={"scale": False}) for cfg in configs]
+    context = PipelineContext(runtime)
+    vectors = build_vector_pipeline(
+        context,
+        sanitized,
+        group_by,
+        rectangular=False,
+    )
+
+    stats: OrderedDict[str, dict] = OrderedDict()
+    vector_count = 0
+    min_time: datetime | None = None
+    max_time: datetime | None = None
+    for sample in vectors:
+        vector_count += 1
+        ts = sample.key[0] if isinstance(sample.key, tuple) and sample.key else None
+        if isinstance(ts, datetime):
+            min_time = ts if min_time is None else min(min_time, ts)
+            max_time = ts if max_time is None else max(max_time, ts)
+        payload = sample.features
+        for fid, value in payload.values.items():
+            entry = stats.get(fid)
+            if not entry:
+                entry = stats[fid] = {
+                    "id": fid,
+                    "base_id": _base_feature_id(fid),
+                    "kind": None,
+                    "max_length": None,
+                    "present_count": 0,
+                    "null_count": 0,
+                    "scalar_types": set(),
+                    "element_types": set(),
+                    "min_length": None,
+                    "lengths": Counter(),
+                    "first_ts": None,
+                    "last_ts": None,
+                }
+            if isinstance(ts, datetime):
+                prev_start = entry.get("first_ts")
+                entry["first_ts"] = ts if prev_start is None else min(prev_start, ts)
+                prev_end = entry.get("last_ts")
+                entry["last_ts"] = ts if prev_end is None else max(prev_end, ts)
+            if collect_metadata:
+                entry["present_count"] += 1
+            if is_missing(value):
+                if collect_metadata:
+                    entry["null_count"] += 1
+                continue
+            if isinstance(value, list):
+                entry["kind"] = "list"
+                length = len(value)
+                entry["min_length"] = length if entry["min_length"] is None else min(
+                    entry["min_length"], length
+                )
+                entry["max_length"] = length if entry["max_length"] is None else max(
+                    entry["max_length"], length
+                )
+                if collect_metadata:
+                    entry["lengths"][length] += 1
+                    entry["observed_elements"] = entry.get("observed_elements", 0) + sum(
+                        1 for v in value if not is_missing(v)
+                    )
+                    if not value:
+                        entry["element_types"].add("empty")
+                    else:
+                        entry["element_types"].update(_type_name(v) for v in value)
+            else:
+                if entry["kind"] != "list":
+                    entry["kind"] = "scalar"
+                if collect_metadata:
+                    entry["scalar_types"].add(_type_name(value))
+
+    return list(stats.values()), vector_count, min_time, max_time
+
+
+def _resolve_cadence_target(stats: dict, strategy: str) -> int | None:
+    if strategy == "max":
+        max_len = stats.get("max_length")
+        if isinstance(max_len, (int, float)) and max_len > 0:
+            return int(max_len)
+    return None
+
+
+def schema_entries_from_stats(entries: list[dict], cadence_strategy: str) -> list[dict]:
+    doc: list[dict] = []
+    for entry in entries:
+        kind = entry.get("kind") or "scalar"
+        item = {
+            "id": entry["id"],
+            "base_id": entry["base_id"],
+            "kind": kind,
+        }
+        if kind == "list":
+            target = _resolve_cadence_target(entry, cadence_strategy)
+            if target is not None:
+                item["cadence"] = {"strategy": cadence_strategy, "target": target}
+        doc.append(item)
+    return doc
+
+
+def _to_iso(ts: datetime | None) -> str | None:
+    if isinstance(ts, datetime):
+        text = ts.isoformat()
+        if text.endswith("+00:00"):
+            return text[:-6] + "Z"
+        return text
+    return None
+
+
+def metadata_entries_from_stats(entries: list[dict], cadence_strategy: str) -> list[dict]:
+    meta_entries: list[dict] = []
+    for entry in entries:
+        kind = entry.get("kind") or "scalar"
+        item: dict[str, Any] = {
+            "id": entry["id"],
+            "base_id": entry["base_id"],
+            "kind": kind,
+            "present_count": entry.get("present_count", 0),
+            "null_count": entry.get("null_count", 0),
+        }
+        first_ts = _to_iso(entry.get("first_ts"))
+        last_ts = _to_iso(entry.get("last_ts"))
+        if first_ts:
+            item["first_observed"] = first_ts
+        if last_ts:
+            item["last_observed"] = last_ts
+        if kind == "list":
+            item["element_types"] = sorted(entry.get("element_types", []))
+            lengths = entry.get("lengths") or {}
+            item["lengths"] = {str(length): count for length, count in sorted(lengths.items())}
+            target = _resolve_cadence_target(entry, cadence_strategy)
+            if target is not None:
+                item["cadence"] = {"strategy": cadence_strategy, "target": target}
+            if "observed_elements" in entry:
+                item["observed_elements"] = int(entry.get("observed_elements", 0))
+        else:
+            item["value_types"] = sorted(entry.get("scalar_types", []))
+        meta_entries.append(item)
+    return meta_entries
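This final hunk appears to be `datapipeline/build/tasks/utils.py +169`, the shared statistics collector behind the schema and metadata tasks. The sketch below hand-builds one stats entry of the shape `collect_schema_entries` accumulates (all values hypothetical) and contrasts the schema view with the richer metadata view derived from it; it does not import the package.

```python
# Illustrative only: one hypothetical per-feature stats entry and the two
# derived views, mirroring schema_entries_from_stats vs metadata_entries_from_stats.
from collections import Counter

entry = {
    "id": "time.ticks.window",
    "base_id": "time.ticks.window",
    "kind": "list",
    "min_length": 23,
    "max_length": 24,
    "present_count": 100,
    "null_count": 2,
    "element_types": {"float"},
    "lengths": Counter({24: 97, 23: 1}),
    "observed_elements": 2351,
}

# Schema view: identity, kind, and the cadence target (max_length under "max").
schema_view = {
    "id": entry["id"], "base_id": entry["base_id"], "kind": "list",
    "cadence": {"strategy": "max", "target": entry["max_length"]},
}

# Metadata view: additionally reports counts, observed window, element types,
# and the per-length histogram.
metadata_view = dict(
    schema_view,
    present_count=entry["present_count"],
    null_count=entry["null_count"],
    first_observed="2024-01-01T00:00:00Z",   # hypothetical observed window
    last_observed="2024-01-31T00:00:00Z",
    element_types=sorted(entry["element_types"]),
    lengths={str(k): v for k, v in sorted(entry["lengths"].items())},
    observed_elements=entry["observed_elements"],
)
print(schema_view)
print(metadata_view)
```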