jerry-thomas 0.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datapipeline/__init__.py +0 -0
- datapipeline/analysis/__init__.py +0 -0
- datapipeline/analysis/vector_analyzer.py +49 -0
- datapipeline/cli/app.py +208 -0
- datapipeline/cli/commands/analyze.py +32 -0
- datapipeline/cli/commands/domain.py +9 -0
- datapipeline/cli/commands/filter.py +10 -0
- datapipeline/cli/commands/link.py +95 -0
- datapipeline/cli/commands/list_.py +22 -0
- datapipeline/cli/commands/plugin.py +10 -0
- datapipeline/cli/commands/run.py +151 -0
- datapipeline/cli/commands/source.py +17 -0
- datapipeline/cli/openers.py +11 -0
- datapipeline/cli/visuals.py +91 -0
- datapipeline/common/__init__.py +0 -0
- datapipeline/common/geo.py +13 -0
- datapipeline/config/__init__.py +0 -0
- datapipeline/config/catalog.py +22 -0
- datapipeline/config/dataset/dataset.py +19 -0
- datapipeline/config/dataset/feature.py +24 -0
- datapipeline/config/dataset/group_by.py +31 -0
- datapipeline/config/dataset/loader.py +19 -0
- datapipeline/config/dataset/normalize.py +10 -0
- datapipeline/config/project.py +24 -0
- datapipeline/domain/__init__.py +0 -0
- datapipeline/domain/feature.py +10 -0
- datapipeline/domain/record.py +20 -0
- datapipeline/domain/vector.py +44 -0
- datapipeline/filters/filters.py +88 -0
- datapipeline/mappers/noop.py +5 -0
- datapipeline/mappers/synthetic/time.py +19 -0
- datapipeline/parsers/identity.py +14 -0
- datapipeline/pipeline/__init__.py +0 -0
- datapipeline/pipeline/pipelines.py +46 -0
- datapipeline/pipeline/stages.py +64 -0
- datapipeline/pipeline/utils/keygen.py +20 -0
- datapipeline/pipeline/utils/memory_sort.py +27 -0
- datapipeline/pipeline/utils/ordering.py +52 -0
- datapipeline/pipeline/utils/transform_utils.py +120 -0
- datapipeline/plugins.py +7 -0
- datapipeline/services/bootstrap.py +158 -0
- datapipeline/services/constants.py +12 -0
- datapipeline/services/entrypoints.py +69 -0
- datapipeline/services/factories.py +18 -0
- datapipeline/services/paths.py +28 -0
- datapipeline/services/project_paths.py +35 -0
- datapipeline/services/scaffold/__init__.py +2 -0
- datapipeline/services/scaffold/domain.py +23 -0
- datapipeline/services/scaffold/filter.py +32 -0
- datapipeline/services/scaffold/mappers.py +52 -0
- datapipeline/services/scaffold/plugin.py +23 -0
- datapipeline/services/scaffold/source.py +165 -0
- datapipeline/services/scaffold/templates.py +32 -0
- datapipeline/sources/__init__.py +0 -0
- datapipeline/sources/composed_loader.py +38 -0
- datapipeline/sources/decoders.py +64 -0
- datapipeline/sources/factory.py +53 -0
- datapipeline/sources/models/__init__.py +18 -0
- datapipeline/sources/models/base.py +12 -0
- datapipeline/sources/models/generator.py +23 -0
- datapipeline/sources/models/loader.py +52 -0
- datapipeline/sources/models/parser.py +11 -0
- datapipeline/sources/models/source.py +28 -0
- datapipeline/sources/models/synthetic.py +11 -0
- datapipeline/sources/synthetic/__init__.py +0 -0
- datapipeline/sources/synthetic/time/__init__.py +0 -0
- datapipeline/sources/synthetic/time/loader.py +30 -0
- datapipeline/sources/synthetic/time/parser.py +9 -0
- datapipeline/sources/transports.py +66 -0
- datapipeline/streams/canonical.py +28 -0
- datapipeline/streams/raw.py +16 -0
- datapipeline/templates/plugin_skeleton/README.md +48 -0
- datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.yaml +4 -0
- datapipeline/templates/plugin_skeleton/config/contracts/time_linear.yaml +4 -0
- datapipeline/templates/plugin_skeleton/config/contracts/time_ticks.yaml +2 -0
- datapipeline/templates/plugin_skeleton/config/distilleries/time_ticks.yaml +9 -0
- datapipeline/templates/plugin_skeleton/config/project.yaml +8 -0
- datapipeline/templates/plugin_skeleton/config/recipe.yaml +17 -0
- datapipeline/templates/plugin_skeleton/pyproject.toml +11 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/__init__.py +0 -0
- datapipeline/templates/stubs/dto.py.j2 +24 -0
- datapipeline/templates/stubs/filter.py.j2 +16 -0
- datapipeline/templates/stubs/loader_synthetic.py.j2 +38 -0
- datapipeline/templates/stubs/mapper.py.j2 +20 -0
- datapipeline/templates/stubs/parser.py.j2 +18 -0
- datapipeline/templates/stubs/parser_custom.py.j2 +14 -0
- datapipeline/templates/stubs/record.py.j2 +18 -0
- datapipeline/templates/stubs/source.yaml.j2 +11 -0
- datapipeline/transforms/sequence.py +31 -0
- datapipeline/transforms/transforms.py +15 -0
- datapipeline/utils/__init__.py +0 -0
- datapipeline/utils/load.py +36 -0
- datapipeline/utils/time.py +32 -0
- jerry_thomas-0.0.2.dist-info/METADATA +301 -0
- jerry_thomas-0.0.2.dist-info/RECORD +99 -0
- jerry_thomas-0.0.2.dist-info/WHEEL +5 -0
- jerry_thomas-0.0.2.dist-info/entry_points.txt +40 -0
- jerry_thomas-0.0.2.dist-info/licenses/LICENSE +21 -0
- jerry_thomas-0.0.2.dist-info/top_level.txt +1 -0
datapipeline/__init__.py
ADDED
File without changes

datapipeline/analysis/__init__.py
ADDED
File without changes

datapipeline/analysis/vector_analyzer.py
ADDED
@@ -0,0 +1,49 @@
+from collections import defaultdict, Counter
+from typing import Any, Hashable, Iterable
+
+import numpy as np
+
+
+class VectorStatsCollector:
+    def __init__(self, expected_feature_ids: Iterable[str]):
+        self.expected_feature_ids = set(expected_feature_ids)
+        self.missing_features = Counter()
+        self.empty_vectors = 0
+        self.total_vectors = 0
+        self.per_group_missing = defaultdict(set)
+
+    def update(self, group_key: Hashable, feature_vector: dict[str, Any]):
+        self.total_vectors += 1
+
+        present_features = set(feature_vector.keys())
+
+        if not present_features:
+            self.empty_vectors += 1
+
+        missing = self.expected_feature_ids - present_features
+        for feature_id in missing:
+            self.missing_features[feature_id] += 1
+            self.per_group_missing[group_key].add(feature_id)
+
+        # Check for features present but with missing/invalid values
+        for fid in present_features & self.expected_feature_ids:
+            val = feature_vector[fid]
+            if val is None or (isinstance(val, float) and np.isnan(val)):
+                self.missing_features[fid] += 1
+                self.per_group_missing[group_key].add(fid)
+
+    def print_report(self):
+        print("\n=== Vector Quality Report ===")
+        print(f"Total vectors processed: {self.total_vectors}")
+        print(f"Empty vectors: {self.empty_vectors}")
+        print(
+            f"Features expected: {sorted(self.expected_feature_ids)[:10]}... (total {len(self.expected_feature_ids)})")
+
+        print("\nMissing feature counts:")
+        for fid, count in sorted(self.missing_features.items(), key=lambda x: -x[1]):
+            print(f" - {fid}: missing in {count} vectors")
+
+        if self.per_group_missing:
+            print("\nGroups with missing features (sample):")
+            for group_key, missing in list(self.per_group_missing.items())[:5]:
+                print(f" - Group {group_key}: {sorted(missing)}")
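
A minimal usage sketch of VectorStatsCollector, separate from the diff itself; the feature ids, group keys, and vectors below are made-up illustrations (the real driver is cli/commands/analyze.py further down):

from datapipeline.analysis.vector_analyzer import VectorStatsCollector

# Hypothetical expected features and per-group vectors, for illustration only.
collector = VectorStatsCollector(expected_feature_ids=["temp", "humidity"])
collector.update(("station_a", 0), {"temp": 21.5})                   # humidity missing
collector.update(("station_a", 1), {"temp": None, "humidity": 0.4})  # temp present but None
collector.update(("station_b", 0), {})                               # counted as an empty vector
collector.print_report()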
datapipeline/cli/app.py
ADDED
@@ -0,0 +1,208 @@
+import argparse
+
+from datapipeline.cli.commands.run import handle_prep, handle_serve
+from datapipeline.cli.commands.analyze import analyze as handle_analyze
+from datapipeline.cli.commands.plugin import station as handle_station
+from datapipeline.cli.commands.source import handle as handle_source
+from datapipeline.cli.commands.domain import handle as handle_domain
+from datapipeline.cli.commands.link import handle as handle_link
+from datapipeline.cli.commands.list_ import handle as handle_list
+from datapipeline.cli.commands.filter import handle as handle_filter
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        prog="jerry",
+        description="Mixology-themed CLI for building and serving data pipelines.",
+    )
+    sub = parser.add_subparsers(dest="cmd", required=True)
+
+    # prep (debug mode with visuals)
+    p_prep = sub.add_parser(
+        "prep",
+        help="run pipeline stages with visual progress",
+    )
+    prep_sub = p_prep.add_subparsers(dest="prep_cmd", required=True)
+    prep_steps = {
+        "pour": "preview record-stage output",
+        "build": "inspect feature-stage output",
+        "stir": "examine vector-stage output",
+    }
+    for step, help_text in prep_steps.items():
+        sp = prep_sub.add_parser(step, help=help_text)
+        sp.add_argument(
+            "--project", "-p", default="config/project.yaml", help="path to project.yaml"
+        )
+        sp.add_argument("--limit", "-n", type=int, default=20)
+
+    # serve (production run, no visuals)
+    p_serve = sub.add_parser(
+        "serve",
+        help="produce vectors without progress visuals",
+    )
+    p_serve.add_argument(
+        "--project", "-p", default="config/project.yaml", help="path to project.yaml"
+    )
+    p_serve.add_argument(
+        "--limit", "-n", type=int, default=None,
+        help="optional cap on the number of vectors to emit",
+    )
+    p_serve.add_argument(
+        "--output", "-o", default="print",
+        help="output destination: 'print', 'stream', or a file ending in .pt",
+    )
+
+    # taste (analysis)
+    p_taste = sub.add_parser(
+        "taste",
+        help="analyze vector completeness and feature stats",
+    )
+    p_taste.add_argument(
+        "--project", "-p", default="config/project.yaml", help="path to project.yaml"
+    )
+    p_taste.add_argument("--limit", "-n", type=int, default=None)
+
+    # distillery (sources)
+    p_dist = sub.add_parser(
+        "distillery",
+        help="add or list raw sources",
+    )
+    dist_sub = p_dist.add_subparsers(dest="dist_cmd", required=True)
+    p_dist_add = dist_sub.add_parser(
+        "add",
+        help="create a provider+dataset source",
+        description=(
+            "Scaffold a source using transport + format.\n\n"
+            "Examples:\n"
+            "  fs CSV: -t fs -f csv\n"
+            "  fs NDJSON: -t fs -f json-lines\n"
+            "  URL JSON: -t url -f json\n"
+            "  Synthetic: -t synthetic\n\n"
+            "Note: set 'glob: true' in the generated YAML if your 'path' contains wildcards."
+        ),
+    )
+    p_dist_add.add_argument("--provider", "-p", required=True)
+    p_dist_add.add_argument("--dataset", "-d", required=True)
+    p_dist_add.add_argument(
+        "--transport", "-t",
+        choices=["fs", "url", "synthetic"],
+        required=True,
+        help="how data is accessed: fs/url/synthetic",
+    )
+    p_dist_add.add_argument(
+        "--format", "-f",
+        choices=["csv", "json", "json-lines"],
+        help="data format for fs/url transports (ignored otherwise)",
+    )
+    dist_sub.add_parser("list", help="list known sources")
+
+    # spirit (domains)
+    p_spirit = sub.add_parser(
+        "spirit",
+        help="add or list domains",
+    )
+    spirit_sub = p_spirit.add_subparsers(dest="spirit_cmd", required=True)
+    p_spirit_add = spirit_sub.add_parser(
+        "add",
+        help="create a domain",
+        description=(
+            "Create a domain package. Defaults to Record base. "
+            "Use --time-aware to base on TimeFeatureRecord (adds 'time' and 'value' fields)."
+        ),
+    )
+    p_spirit_add.add_argument("--domain", "-d", required=True)
+    p_spirit_add.add_argument(
+        "--time-aware",
+        "-t",
+        action="store_true",
+        help="use TimeFeatureRecord base (UTC-aware 'time' + 'value' fields) instead of Record",
+    )
+    spirit_sub.add_parser("list", help="list known domains")
+
+    # contract (link source -> domain)
+    p_contract = sub.add_parser(
+        "contract",
+        help="link a distillery source to a spirit domain",
+    )
+    p_contract.add_argument("--time-aware", "-t", action="store_true")
+
+    # station (plugin scaffolding)
+    p_station = sub.add_parser(
+        "station",
+        help="scaffold plugin workspaces",
+    )
+    station_sub = p_station.add_subparsers(dest="station_cmd", required=True)
+    p_station_init = station_sub.add_parser(
+        "init", help="create a plugin skeleton")
+    p_station_init.add_argument("--name", "-n", required=True)
+    p_station_init.add_argument("--out", "-o", default=".")
+
+    # filter (unchanged helper)
+    p_filt = sub.add_parser("filter", help="manage filters")
+    filt_sub = p_filt.add_subparsers(dest="filter_cmd", required=True)
+    p_filt_create = filt_sub.add_parser(
+        "create", help="create a filter function")
+    p_filt_create.add_argument(
+        "--name", "-n", required=True,
+        help="filter entrypoint name and function/module name",
+    )
+
+    args = parser.parse_args()
+
+    if args.cmd == "prep":
+        handle_prep(action=args.prep_cmd,
+                    project=args.project, limit=args.limit)
+        return
+
+    if args.cmd == "serve":
+        handle_serve(
+            project=args.project,
+            limit=getattr(args, "limit", None),
+            output=args.output,
+        )
+        return
+
+    if args.cmd == "taste":
+        handle_analyze(project=args.project,
+                       limit=getattr(args, "limit", None))
+        return
+
+    if args.cmd == "distillery":
+        if args.dist_cmd == "list":
+            handle_list(subcmd="sources")
+        else:
+            handle_source(
+                subcmd="add",
+                provider=getattr(args, "provider", None),
+                dataset=getattr(args, "dataset", None),
+                transport=getattr(args, "transport", None),
+                format=getattr(args, "format", None),
+            )
+        return
+
+    if args.cmd == "spirit":
+        if args.spirit_cmd == "list":
+            handle_list(subcmd="domains")
+        else:
+            handle_domain(
+                subcmd="add",
+                domain=getattr(args, "domain", None),
+                time_aware=getattr(args, "time_aware", False),
+            )
+        return
+
+    if args.cmd == "contract":
+        handle_link(time_aware=getattr(args, "time_aware", False))
+        return
+
+    if args.cmd == "station":
+        handle_station(
+            subcmd=args.station_cmd,
+            name=getattr(args, "name", None),
+            out=getattr(args, "out", "."),
+        )
+        return
+
+    if args.cmd == "filter":
+        handle_filter(subcmd=args.filter_cmd, name=getattr(args, "name", None))
+        return
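
The subcommands above translate directly into console invocations; the sketch below drives main() by substituting sys.argv, which is equivalent to running the installed script (assumed here to be named jerry, per prog="jerry") from a shell. The project path and output file are hypothetical:

import sys
from datapipeline.cli.app import main

# Equivalent of: jerry prep pour --project config/project.yaml --limit 5
sys.argv = ["jerry", "prep", "pour", "--project", "config/project.yaml", "--limit", "5"]
main()

# Equivalent of: jerry serve --output vectors.pt
sys.argv = ["jerry", "serve", "--output", "vectors.pt"]
main()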

datapipeline/cli/commands/analyze.py
ADDED
@@ -0,0 +1,32 @@
+from pathlib import Path
+from typing import Optional
+
+from datapipeline.services.bootstrap import bootstrap
+from datapipeline.config.dataset.loader import load_dataset
+from datapipeline.pipeline.pipelines import build_vector_pipeline
+from datapipeline.cli.openers import open_canonical_stream_visual
+from datapipeline.analysis.vector_analyzer import VectorStatsCollector
+
+
+def analyze(project: str, limit: Optional[int] = None) -> None:
+    project_path = Path(project)
+    dataset = load_dataset(project_path, "vectors")
+    bootstrap(project_path)
+
+    expected_feature_ids = [cfg.feature_id for cfg in (dataset.features or [])]
+    if not expected_feature_ids:
+        print("(no features configured; nothing to analyze)")
+        return
+
+    collector = VectorStatsCollector(expected_feature_ids)
+
+    count = 0
+    for group_key, vector in build_vector_pipeline(
+        dataset.features, dataset.group_by, open_canonical_stream_visual
+    ):
+        collector.update(group_key, vector.values)
+        count += 1
+        if limit and count >= limit:
+            break
+
+    collector.print_report()

datapipeline/cli/commands/domain.py
ADDED
@@ -0,0 +1,9 @@
+from datapipeline.services.scaffold.domain import create_domain
+
+
+def handle(subcmd: str, domain: str | None, time_aware: bool) -> None:
+    if subcmd in {"create", "add"}:
+        if not domain:
+            print("--domain is required")
+            raise SystemExit(2)
+        create_domain(domain=domain, time_aware=time_aware, root=None)

datapipeline/cli/commands/filter.py
ADDED
@@ -0,0 +1,10 @@
+from datapipeline.services.scaffold.filter import create_filter
+
+
+def handle(subcmd: str, name: str | None) -> None:
+    if subcmd == "create":
+        if not name:
+            print("--name is required for filter create")
+            raise SystemExit(2)
+        create_filter(name=name, root=None)
+

datapipeline/cli/commands/link.py
ADDED
@@ -0,0 +1,95 @@
+import sys
+from datapipeline.services.paths import pkg_root, resolve_base_pkg_dir
+from datapipeline.services.entrypoints import read_group_entries
+import yaml
+from datapipeline.services.constants import FILTERS_GROUP, MAPPER_KEY, ENTRYPOINT_KEY, ARGS_KEY, SOURCE_KEY
+from datapipeline.services.project_paths import sources_dir as resolve_sources_dir, streams_dir as resolve_streams_dir
+from datapipeline.services.scaffold.mappers import attach_source_to_domain
+import re
+
+
+def _pick_from_list(prompt: str, options: list[str]) -> str:
+    print(prompt, file=sys.stderr)
+    for i, opt in enumerate(options, 1):
+        print(f" [{i}] {opt}", file=sys.stderr)
+    while True:
+        sel = input("> ").strip()
+        if sel.isdigit():
+            idx = int(sel)
+            if 1 <= idx <= len(options):
+                return options[idx - 1]
+        print("Please enter a number from the list.", file=sys.stderr)
+
+
+def handle(time_aware: bool) -> None:
+    root_dir, name, pyproject = pkg_root(None)
+
+    # Discover sources by scanning sources_dir YAMLs
+    proj_path = root_dir / "config" / "project.yaml"
+    sources_dir = resolve_sources_dir(proj_path)
+    source_options = []
+    if sources_dir.exists():
+        source_options = sorted(p.stem for p in sources_dir.glob("*.y*ml"))
+    if not source_options:
+        print("No sources found. Create one first (jerry distillery add ...)")
+        raise SystemExit(2)
+
+    src_key = _pick_from_list("Select a source to link:", source_options)
+    # Expect aliases from sources_dir filenames: provider_dataset.yaml
+    parts = src_key.split("_", 1)
+    if len(parts) != 2:
+        print("Source alias must be 'provider_dataset' (from sources/<alias>.yaml)", file=sys.stderr)
+        raise SystemExit(2)
+    provider, dataset = parts[0], parts[1]
+
+    # Discover domains by scanning the package, fallback to EPs if needed
+    base = resolve_base_pkg_dir(root_dir, name)
+    domain_options = []
+    for dirname in ("domains",):
+        dom_dir = base / dirname
+        if dom_dir.exists():
+            domain_options.extend(
+                [p.name for p in dom_dir.iterdir() if p.is_dir()
+                 and (p / "model.py").exists()]
+            )
+    domain_options = sorted(set(domain_options))
+    if not domain_options:
+        domain_options = sorted(
+            read_group_entries(pyproject, FILTERS_GROUP).keys())
+    if not domain_options:
+        print("No domains found. Create one first (jerry spirit add ...)")
+        raise SystemExit(2)
+
+    dom_name = _pick_from_list("Select a domain to link to:", domain_options)
+
+    # create mapper + EP (domain.origin)
+    attach_source_to_domain(domain=dom_name, provider=provider,
+                            dataset=dataset, time_aware=time_aware, root=None)
+
+    def _slug(s: str) -> str:
+        s = s.strip().lower()
+        s = re.sub(r"[^a-z0-9]+", "_", s)
+        return s.strip("_")
+    ep_key = f"{_slug(dom_name)}.{_slug(provider)}"
+    print(f"Registered mapper entry point as '{ep_key}'.")
+
+    # Inject per-file canonical stream into streams directory
+    streams_path = resolve_streams_dir(proj_path)
+
+    canonical_alias = src_key  # default canonical stream alias
+    mapper_ep = ep_key
+    # Write a single-file canonical spec into streams directory
+    try:
+        # Ensure streams_path is a directory path
+        streams_dir = streams_path if streams_path.is_dir() else streams_path.parent
+        streams_dir.mkdir(parents=True, exist_ok=True)
+        cfile = streams_dir / f"{canonical_alias}.yaml"
+        data = {
+            SOURCE_KEY: src_key,
+            MAPPER_KEY: {ENTRYPOINT_KEY: mapper_ep, ARGS_KEY: {}},
+        }
+        with cfile.open("w", encoding="utf-8") as f:
+            yaml.safe_dump(data, f, sort_keys=False, default_flow_style=False)
+        print(f"Created canonical spec: {cfile}")
+    except Exception as e:
+        print(f"Failed to write canonical spec: {e}", file=sys.stderr)
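
The nested _slug helper normalizes the chosen domain and provider names before composing the mapper entry-point key; a quick standalone illustration with made-up names (not part of the package diff):

import re

def _slug(s: str) -> str:
    # Same normalization as the helper inside handle(): lowercase, collapse
    # non-alphanumeric runs to underscores, trim leading/trailing underscores.
    s = s.strip().lower()
    s = re.sub(r"[^a-z0-9]+", "_", s)
    return s.strip("_")

print(_slug("Weather-API "))                  # -> weather_api
print(f"{_slug('Weather')}.{_slug('NOAA')}")  # -> weather.noaa (shape of ep_key)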

datapipeline/cli/commands/list_.py
ADDED
@@ -0,0 +1,22 @@
+from datapipeline.services.paths import pkg_root, resolve_base_pkg_dir
+from datapipeline.services.project_paths import sources_dir as sources_dir_from_project
+
+
+def handle(subcmd: str) -> None:
+    root_dir, name, pyproject = pkg_root(None)
+    if subcmd == "sources":
+        # Discover sources by scanning sources_dir for YAML files
+        proj_path = root_dir / "config" / "project.yaml"
+        sources_dir = sources_dir_from_project(proj_path)
+        if sources_dir.exists():
+            aliases = sorted(p.stem for p in sources_dir.glob("*.y*ml"))
+            for a in aliases:
+                print(a)
+    elif subcmd == "domains":
+        base = resolve_base_pkg_dir(root_dir, name)
+        dom_dir = base / "domains"
+        if dom_dir.exists():
+            names = sorted(p.name for p in dom_dir.iterdir()
+                           if p.is_dir() and (p / "model.py").exists())
+            for k in names:
+                print(k)

datapipeline/cli/commands/plugin.py
ADDED
@@ -0,0 +1,10 @@
+from pathlib import Path
+from datapipeline.services.scaffold.plugin import scaffold_plugin
+
+
+def station(subcmd: str, name: str | None, out: str) -> None:
+    if subcmd == "init":
+        if not name:
+            print("--name is required for station init")
+            raise SystemExit(2)
+        scaffold_plugin(name, Path(out))

datapipeline/cli/commands/run.py
ADDED
@@ -0,0 +1,151 @@
+from __future__ import annotations
+
+import json
+import pickle
+import sys
+from itertools import islice
+from pathlib import Path
+from typing import Iterator, Optional, Tuple
+
+from tqdm import tqdm
+
+from datapipeline.cli.openers import open_canonical_stream_visual
+from datapipeline.config.dataset.dataset import FeatureDatasetConfig, RecordDatasetConfig
+from datapipeline.config.dataset.loader import load_dataset
+from datapipeline.pipeline.pipelines import (
+    build_feature_pipeline,
+    build_record_pipeline,
+    build_vector_pipeline,
+)
+from datapipeline.services.bootstrap import bootstrap
+from datapipeline.streams.canonical import open_canonical_stream
+from datapipeline.domain.vector import Vector
+
+
+def _print_head(iterable: Iterator[object], limit: int) -> int:
+    count = 0
+    try:
+        for item in iterable:
+            tqdm.write(str(item))
+            count += 1
+            if count >= limit:
+                break
+    except KeyboardInterrupt:
+        pass
+    return count
+
+
+def _run_records(dataset: RecordDatasetConfig, limit: int) -> None:
+    for cfg in dataset.features:
+        print(f"\npouring records for {cfg.feature_id}")
+        records = build_record_pipeline(cfg, open_canonical_stream_visual)
+        printed = _print_head(records, limit)
+        print(f"(poured {printed} records)")
+
+
+def _run_features(dataset: FeatureDatasetConfig, limit: int) -> None:
+    group_by = dataset.group_by
+    for cfg in dataset.features:
+        feature_id = getattr(cfg, "feature_id", "?")
+        print(f"\nbuilding features for {feature_id}")
+        features = build_feature_pipeline(
+            cfg, group_by, open_canonical_stream_visual)
+        printed = _print_head(features, limit)
+        tqdm.write(f"(built {printed} feature records)")
+
+
+def _run_vectors(dataset: FeatureDatasetConfig, limit: int) -> None:
+    print("\nstirring vectors")
+    vectors = build_vector_pipeline(
+        dataset.features, dataset.group_by, open_canonical_stream_visual)
+    printed = _print_head(vectors, limit)
+    print(f"(stirred {printed} vectors)")
+
+
+def handle_prep(action: str, project: str, limit: int = 20) -> None:
+    stage_lookup = {"pour": "records", "build": "features", "stir": "vectors"}
+    if action not in stage_lookup:
+        raise ValueError(f"Unknown prep action: {action}")
+
+    project_path = Path(project)
+    dataset = load_dataset(project_path, stage_lookup[action])
+    bootstrap(project_path)
+
+    features = list(dataset.features or [])
+    if not features:
+        print("(no features configured; nothing to prep)")
+        return
+
+    if action == "pour":
+        _run_records(dataset, limit)
+    elif action == "build":
+        _run_features(dataset, limit)
+    else:
+        _run_vectors(dataset, limit)
+
+
+def _limit_vectors(vectors: Iterator[Tuple[object, Vector]], limit: Optional[int]) -> Iterator[Tuple[object, Vector]]:
+    if limit is None:
+        yield from vectors
+    else:
+        yield from islice(vectors, limit)
+
+
+def _serve_print(vectors: Iterator[Tuple[object, Vector]], limit: Optional[int]) -> None:
+    count = 0
+    try:
+        for group_key, vector in _limit_vectors(vectors, limit):
+            print(f"group={group_key}: {vector.values}")
+            count += 1
+    except KeyboardInterrupt:
+        pass
+    print(f"(served {count} vectors to stdout)")
+
+
+def _serve_stream(vectors: Iterator[Tuple[object, Vector]], limit: Optional[int]) -> None:
+    count = 0
+    try:
+        for group_key, vector in _limit_vectors(vectors, limit):
+            payload = {"group": list(group_key) if isinstance(group_key, tuple) else group_key,
+                       "values": vector.values}
+            print(json.dumps(payload, default=str))
+            count += 1
+    except KeyboardInterrupt:
+        pass
+    print(f"(streamed {count} vectors)", file=sys.stderr)
+
+
+def _serve_pt(vectors: Iterator[Tuple[object, Vector]], limit: Optional[int], destination: Path) -> None:
+    data = []
+    for group_key, vector in _limit_vectors(vectors, limit):
+        normalized_key = list(group_key) if isinstance(
+            group_key, tuple) else group_key
+        data.append((normalized_key, vector.values))
+    destination.parent.mkdir(parents=True, exist_ok=True)
+    with destination.open("wb") as fh:
+        pickle.dump(data, fh)
+    print(f"Saved {len(data)} vectors to {destination}")
+
+
+def handle_serve(project: str, limit: Optional[int], output: str) -> None:
+    project_path = Path(project)
+    dataset = load_dataset(project_path, "vectors")
+    bootstrap(project_path)
+
+    features = list(dataset.features or [])
+    if not features:
+        print("(no features configured; nothing to serve)")
+        return
+
+    vectors = build_vector_pipeline(
+        dataset.features, dataset.group_by, open_canonical_stream)
+
+    if output == "print":
+        _serve_print(vectors, limit)
+    elif output == "stream":
+        _serve_stream(vectors, limit)
+    elif output.endswith(".pt"):
+        _serve_pt(vectors, limit, Path(output))
+    else:
+        print("Unsupported output format. Use 'print', 'stream', or a .pt file path.")
+        raise SystemExit(2)
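
Note that _serve_pt serializes with the standard pickle module even though the destination uses a .pt extension, so the file is read back the same way; a minimal sketch, with vectors.pt as a hypothetical output path:

import pickle
from pathlib import Path

# Read back a file produced by something like `jerry serve --output vectors.pt`.
with Path("vectors.pt").open("rb") as fh:
    data = pickle.load(fh)  # list of (group_key, values) pairs, as appended in _serve_pt

for group_key, values in data[:5]:
    print(group_key, values)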

datapipeline/cli/commands/source.py
ADDED
@@ -0,0 +1,17 @@
+from datapipeline.services.scaffold.source import create_source
+
+
+def handle(subcmd: str, provider: str | None, dataset: str | None,
+           transport: str | None = None, format: str | None = None) -> None:
+    if subcmd in {"create", "add"}:
+        if not provider or not dataset:
+            print("--provider and --dataset are required")
+            raise SystemExit(2)
+        if not transport:
+            print("--transport is required (fs|url|synthetic)")
+            raise SystemExit(2)
+        if transport in {"fs", "url"} and not format:
+            print("--format is required for fs/url transports (csv|json|json-lines)")
+            raise SystemExit(2)
+        create_source(provider=provider, dataset=dataset,
+                      transport=transport, format=format, root=None)

datapipeline/cli/openers.py
ADDED
@@ -0,0 +1,11 @@
+from typing import Iterator, Any
+from datapipeline.streams.canonical import canonical_entry
+from datapipeline.streams.raw import open_raw_stream
+from datapipeline.cli.visuals import wrap_with_tqdm
+
+
+def open_canonical_stream_visual(alias: str, show: bool = True) -> Iterator[Any]:
+    entry = canonical_entry(alias)
+    raw = open_raw_stream(entry.source_alias)
+    wrapped = wrap_with_tqdm(raw, stream_alias=alias, show=show)
+    return entry.mapper(wrapped, **entry.mapper_args)
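
open_canonical_stream_visual is the opener that the prep and taste commands hand to the pipeline builders; a sketch of calling it directly, assuming a project has already been bootstrapped and that a canonical stream alias such as "time_ticks" (the name used by the bundled plugin skeleton config) exists:

from itertools import islice
from datapipeline.cli.openers import open_canonical_stream_visual

# Assumes bootstrap(project_path) has already registered the project's sources and streams
# (see cli/commands/run.py), and that "time_ticks" is a configured canonical stream alias.
records = open_canonical_stream_visual("time_ticks", show=False)
for record in islice(records, 3):
    print(record)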