jerry-thomas 1.0.3__py3-none-any.whl → 2.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datapipeline/analysis/vector/collector.py +0 -1
- datapipeline/build/tasks/config.py +0 -2
- datapipeline/build/tasks/metadata.py +0 -2
- datapipeline/build/tasks/scaler.py +0 -2
- datapipeline/build/tasks/schema.py +0 -2
- datapipeline/build/tasks/utils.py +0 -2
- datapipeline/cli/app.py +201 -81
- datapipeline/cli/commands/contract.py +145 -283
- datapipeline/cli/commands/demo.py +13 -0
- datapipeline/cli/commands/domain.py +4 -4
- datapipeline/cli/commands/dto.py +11 -0
- datapipeline/cli/commands/filter.py +2 -2
- datapipeline/cli/commands/inspect.py +0 -68
- datapipeline/cli/commands/list_.py +30 -13
- datapipeline/cli/commands/loader.py +11 -0
- datapipeline/cli/commands/mapper.py +82 -0
- datapipeline/cli/commands/parser.py +45 -0
- datapipeline/cli/commands/run_config.py +1 -3
- datapipeline/cli/commands/serve_pipeline.py +5 -7
- datapipeline/cli/commands/source.py +106 -18
- datapipeline/cli/commands/stream.py +292 -0
- datapipeline/cli/visuals/common.py +0 -2
- datapipeline/cli/visuals/sections.py +0 -2
- datapipeline/cli/workspace_utils.py +0 -3
- datapipeline/config/context.py +0 -2
- datapipeline/config/dataset/feature.py +1 -0
- datapipeline/config/metadata.py +0 -2
- datapipeline/config/project.py +0 -2
- datapipeline/config/resolution.py +10 -2
- datapipeline/config/tasks.py +9 -9
- datapipeline/domain/feature.py +3 -0
- datapipeline/domain/record.py +7 -7
- datapipeline/domain/sample.py +0 -2
- datapipeline/domain/vector.py +6 -8
- datapipeline/integrations/ml/adapter.py +0 -2
- datapipeline/integrations/ml/pandas_support.py +0 -2
- datapipeline/integrations/ml/rows.py +0 -2
- datapipeline/integrations/ml/torch_support.py +0 -2
- datapipeline/io/output.py +0 -2
- datapipeline/io/serializers.py +26 -16
- datapipeline/mappers/synthetic/time.py +9 -2
- datapipeline/pipeline/artifacts.py +3 -5
- datapipeline/pipeline/observability.py +0 -2
- datapipeline/pipeline/pipelines.py +118 -34
- datapipeline/pipeline/stages.py +54 -18
- datapipeline/pipeline/utils/spool_cache.py +142 -0
- datapipeline/pipeline/utils/transform_utils.py +27 -2
- datapipeline/services/artifacts.py +1 -4
- datapipeline/services/constants.py +1 -0
- datapipeline/services/factories.py +4 -6
- datapipeline/services/paths.py +10 -1
- datapipeline/services/project_paths.py +0 -2
- datapipeline/services/runs.py +0 -2
- datapipeline/services/scaffold/contract_yaml.py +76 -0
- datapipeline/services/scaffold/demo.py +141 -0
- datapipeline/services/scaffold/discovery.py +115 -0
- datapipeline/services/scaffold/domain.py +21 -13
- datapipeline/services/scaffold/dto.py +31 -0
- datapipeline/services/scaffold/filter.py +2 -1
- datapipeline/services/scaffold/layout.py +96 -0
- datapipeline/services/scaffold/loader.py +61 -0
- datapipeline/services/scaffold/mapper.py +116 -0
- datapipeline/services/scaffold/parser.py +56 -0
- datapipeline/services/scaffold/plugin.py +14 -2
- datapipeline/services/scaffold/source_yaml.py +91 -0
- datapipeline/services/scaffold/stream_plan.py +129 -0
- datapipeline/services/scaffold/utils.py +187 -0
- datapipeline/sources/data_loader.py +0 -2
- datapipeline/sources/decoders.py +49 -8
- datapipeline/sources/factory.py +9 -6
- datapipeline/sources/foreach.py +18 -3
- datapipeline/sources/synthetic/time/parser.py +1 -1
- datapipeline/sources/transports.py +10 -4
- datapipeline/templates/demo_skeleton/demo/contracts/equity.ohlcv.yaml +33 -0
- datapipeline/templates/demo_skeleton/demo/contracts/time.ticks.hour_sin.yaml +22 -0
- datapipeline/templates/demo_skeleton/demo/contracts/time.ticks.linear.yaml +22 -0
- datapipeline/templates/demo_skeleton/demo/data/APPL.jsonl +19 -0
- datapipeline/templates/demo_skeleton/demo/data/MSFT.jsonl +19 -0
- datapipeline/templates/demo_skeleton/demo/dataset.yaml +19 -0
- datapipeline/templates/demo_skeleton/demo/postprocess.yaml +19 -0
- datapipeline/templates/demo_skeleton/demo/project.yaml +19 -0
- datapipeline/templates/demo_skeleton/demo/sources/sandbox.ohlcv.yaml +17 -0
- datapipeline/templates/{plugin_skeleton/example → demo_skeleton/demo}/sources/synthetic.ticks.yaml +1 -1
- datapipeline/templates/demo_skeleton/demo/tasks/metadata.yaml +2 -0
- datapipeline/templates/demo_skeleton/demo/tasks/scaler.yaml +3 -0
- datapipeline/templates/demo_skeleton/demo/tasks/schema.yaml +2 -0
- datapipeline/templates/demo_skeleton/demo/tasks/serve.test.yaml +4 -0
- datapipeline/templates/demo_skeleton/demo/tasks/serve.train.yaml +4 -0
- datapipeline/templates/demo_skeleton/demo/tasks/serve.val.yaml +4 -0
- datapipeline/templates/demo_skeleton/scripts/run_dataframe.py +20 -0
- datapipeline/templates/demo_skeleton/scripts/run_torch.py +23 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/__init__.py +0 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/domains/equity/__init__.py +0 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/domains/equity/model.py +18 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/dtos/__init__.py +0 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/dtos/sandbox_ohlcv_dto.py +14 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/mappers/__init__.py +0 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/mappers/map_sandbox_ohlcv_dto_to_equity.py +26 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/parsers/__init__.py +0 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/parsers/sandbox_ohlcv_dto_parser.py +46 -0
- datapipeline/templates/plugin_skeleton/README.md +57 -136
- datapipeline/templates/plugin_skeleton/jerry.yaml +12 -24
- datapipeline/templates/plugin_skeleton/reference/jerry.yaml +28 -0
- datapipeline/templates/plugin_skeleton/reference/reference/contracts/composed.reference.yaml +29 -0
- datapipeline/templates/plugin_skeleton/reference/reference/contracts/ingest.reference.yaml +31 -0
- datapipeline/templates/plugin_skeleton/reference/reference/contracts/overview.reference.yaml +34 -0
- datapipeline/templates/plugin_skeleton/reference/reference/dataset.yaml +29 -0
- datapipeline/templates/plugin_skeleton/reference/reference/postprocess.yaml +25 -0
- datapipeline/templates/plugin_skeleton/reference/reference/project.yaml +32 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/foreach.http.reference.yaml +24 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/foreach.reference.yaml +21 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/fs.reference.yaml +16 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/http.reference.yaml +17 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/overview.reference.yaml +18 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/synthetic.reference.yaml +15 -0
- datapipeline/templates/plugin_skeleton/reference/reference/tasks/metadata.reference.yaml +11 -0
- datapipeline/templates/plugin_skeleton/reference/reference/tasks/scaler.reference.yaml +10 -0
- datapipeline/templates/plugin_skeleton/reference/reference/tasks/schema.reference.yaml +10 -0
- datapipeline/templates/plugin_skeleton/reference/reference/tasks/serve.reference.yaml +28 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/domains/__init__.py +2 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/dtos/__init__.py +0 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/loaders/__init__.py +0 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/mappers/__init__.py +1 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/parsers/__init__.py +0 -0
- datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +12 -11
- datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +4 -13
- datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +9 -11
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +1 -2
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +1 -7
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +1 -1
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +1 -1
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +1 -25
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +1 -1
- datapipeline/templates/plugin_skeleton/your-interim-data-builder/dataset.yaml +9 -0
- datapipeline/templates/plugin_skeleton/your-interim-data-builder/postprocess.yaml +1 -0
- datapipeline/templates/plugin_skeleton/your-interim-data-builder/project.yaml +15 -0
- datapipeline/templates/plugin_skeleton/your-interim-data-builder/tasks/serve.all.yaml +8 -0
- datapipeline/templates/stubs/contracts/composed.yaml.j2 +10 -0
- datapipeline/templates/stubs/contracts/ingest.yaml.j2 +25 -0
- datapipeline/templates/stubs/dto.py.j2 +2 -2
- datapipeline/templates/stubs/filter.py.j2 +1 -1
- datapipeline/templates/stubs/loaders/basic.py.j2 +11 -0
- datapipeline/templates/stubs/mappers/composed.py.j2 +13 -0
- datapipeline/templates/stubs/mappers/ingest.py.j2 +20 -0
- datapipeline/templates/stubs/parser.py.j2 +5 -1
- datapipeline/templates/stubs/record.py.j2 +1 -1
- datapipeline/templates/stubs/source.yaml.j2 +1 -1
- datapipeline/transforms/debug/identity.py +34 -16
- datapipeline/transforms/debug/lint.py +14 -11
- datapipeline/transforms/feature/scaler.py +5 -12
- datapipeline/transforms/filter.py +73 -17
- datapipeline/transforms/interfaces.py +58 -0
- datapipeline/transforms/record/floor_time.py +10 -7
- datapipeline/transforms/record/lag.py +8 -10
- datapipeline/transforms/sequence.py +2 -3
- datapipeline/transforms/stream/dedupe.py +5 -7
- datapipeline/transforms/stream/ensure_ticks.py +39 -24
- datapipeline/transforms/stream/fill.py +34 -25
- datapipeline/transforms/stream/filter.py +25 -0
- datapipeline/transforms/stream/floor_time.py +16 -0
- datapipeline/transforms/stream/granularity.py +52 -30
- datapipeline/transforms/stream/lag.py +17 -0
- datapipeline/transforms/stream/rolling.py +72 -0
- datapipeline/transforms/utils.py +42 -10
- datapipeline/transforms/vector/drop/horizontal.py +0 -3
- datapipeline/transforms/vector/drop/orchestrator.py +0 -3
- datapipeline/transforms/vector/drop/vertical.py +0 -2
- datapipeline/transforms/vector/ensure_schema.py +0 -2
- datapipeline/utils/paths.py +0 -2
- datapipeline/utils/placeholders.py +0 -2
- datapipeline/utils/rich_compat.py +0 -3
- datapipeline/utils/window.py +0 -2
- jerry_thomas-2.0.1.dist-info/METADATA +269 -0
- jerry_thomas-2.0.1.dist-info/RECORD +264 -0
- {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.1.dist-info}/WHEEL +1 -1
- {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.1.dist-info}/entry_points.txt +7 -3
- datapipeline/services/scaffold/mappers.py +0 -55
- datapipeline/services/scaffold/source.py +0 -191
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +0 -31
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +0 -30
- datapipeline/templates/plugin_skeleton/example/dataset.yaml +0 -18
- datapipeline/templates/plugin_skeleton/example/postprocess.yaml +0 -29
- datapipeline/templates/plugin_skeleton/example/project.yaml +0 -23
- datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +0 -3
- datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +0 -9
- datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +0 -2
- datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +0 -4
- datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +0 -28
- datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +0 -4
- datapipeline/templates/stubs/mapper.py.j2 +0 -22
- jerry_thomas-1.0.3.dist-info/METADATA +0 -827
- jerry_thomas-1.0.3.dist-info/RECORD +0 -198
- {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.1.dist-info}/licenses/LICENSE +0 -0
- {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.1.dist-info}/top_level.txt +0 -0
```diff
--- a/datapipeline/cli/commands/contract.py
+++ b/datapipeline/cli/commands/contract.py
@@ -3,30 +3,63 @@ from pathlib import Path
 
 from datapipeline.config.workspace import WorkspaceContext
 from datapipeline.cli.workspace_utils import resolve_default_project_yaml
-from datapipeline.services.paths import pkg_root
-from datapipeline.services.entrypoints import read_group_entries
-from datapipeline.services.constants import FILTERS_GROUP
-from datapipeline.services.project_paths import
-
-
-
-
+from datapipeline.services.paths import pkg_root
+from datapipeline.services.entrypoints import read_group_entries
+from datapipeline.services.constants import FILTERS_GROUP
+from datapipeline.services.project_paths import resolve_project_yaml_path
+from datapipeline.services.scaffold.contract_yaml import (
+    write_ingest_contract,
+    write_composed_contract,
+    compose_inputs,
 )
-from datapipeline.services.scaffold.
-
-
-
-
-
-
-
-
-
-
-
-
-
+from datapipeline.services.scaffold.discovery import (
+    list_domains,
+    list_mappers,
+    list_sources,
+    list_streams,
+)
+from datapipeline.services.scaffold.utils import (
+    info,
+    status,
+    error_exit,
+    pick_from_menu,
+    pick_from_list,
+    pick_multiple_from_list,
+    choose_name,
+)
+from datapipeline.services.scaffold.layout import default_stream_id
+from datapipeline.cli.commands.mapper import handle as handle_mapper
+from datapipeline.services.scaffold.mapper import create_composed_mapper
+
+
+def _select_mapper(*, allow_identity: bool, allow_create: bool, root: Path | None) -> str:
+    mappers = list_mappers(root=root)
+    options: list[tuple[str, str]] = []
+    if allow_create:
+        options.append(("create", "Create new mapper (default)"))
+    if mappers:
+        options.append(("existing", "Select existing mapper"))
+    if allow_identity:
+        options.append(("identity", "Identity mapper"))
+    options.append(("custom", "Custom mapper"))
+
+    if not options:
+        error_exit("No mapper options available")
+
+    choice = pick_from_menu("Mapper:", options)
+    if choice == "existing":
+        return pick_from_menu(
+            "Select mapper entrypoint:",
+            [(k, k) for k in sorted(mappers.keys())],
+        )
+    if choice == "create":
+        return handle_mapper(name=None, plugin_root=root)
+    if choice == "identity":
+        return "identity"
+    ep = input("Mapper entrypoint: ").strip()
+    if not ep:
+        error_exit("Mapper entrypoint is required")
+    return ep
 
 
 def handle(
```
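The new `_select_mapper` helper leans on prompt utilities centralized in the new `datapipeline/services/scaffold/utils.py` (+187 lines), whose bodies are not expanded in this diff. Below is a minimal sketch of the contracts implied by the call sites, assuming numbered menus, an empty-input default of the first option, and the exit code 2 used by the `raise SystemExit(2)` calls these helpers replace:

```python
# Hypothetical sketch of the scaffold.utils helpers, inferred from their call
# sites in this diff; the real implementations in
# datapipeline/services/scaffold/utils.py are not shown here.
import sys
from typing import NoReturn


def info(msg: str) -> None:
    # Plain informational line.
    print(msg)


def status(tag: str, msg: str) -> None:
    # Tagged status line, e.g. status("ok", ...) -> "[ok] ..."
    print(f"[{tag}] {msg}")


def error_exit(msg: str) -> NoReturn:
    # Replaces the repeated `print(..., file=sys.stderr); raise SystemExit(2)` pairs.
    print(f"[error] {msg}", file=sys.stderr)
    raise SystemExit(2)


def pick_from_menu(title: str, options: list[tuple[str, str]]) -> str:
    # Numbered menu over (key, label) pairs; returns the chosen key.
    info(title)
    for i, (_, label) in enumerate(options, 1):
        info(f"  [{i}] {label}")
    sel = input("> ").strip()
    if not sel:
        return options[0][0]  # first entry doubles as the default
    try:
        return options[int(sel) - 1][0]
    except (ValueError, IndexError):
        error_exit("Invalid selection.")


def pick_from_list(title: str, options: list[str]) -> str:
    # Same menu, but over plain strings.
    return pick_from_menu(title, [(o, o) for o in options])
```

Centralizing these removes the stderr-print-then-`SystemExit(2)` boilerplate scattered through the 1.0.3 command modules, as the remaining hunks below show.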
```diff
@@ -38,20 +71,19 @@ def handle(
     root_dir, name, pyproject = pkg_root(plugin_root)
     default_project = resolve_default_project_yaml(workspace)
     # Select contract type: Ingest (source->stream) or Composed (streams->stream)
-
-
-
+    info("Contract type:")
+    info("  [1] Ingest (source → stream)")
+    info("  [2] Composed (streams → stream)")
     sel = input("> ").strip()
     if sel == "2":
         if use_identity:
-
-            raise SystemExit(2)
+            error_exit("--identity is only supported for ingest contracts.")
         # Defer to composed scaffolder (fully interactive)
         scaffold_conflux(
             stream_id=None,
             inputs=None,
             mapper_path=None,
-            with_mapper_stub=
+            with_mapper_stub=False,
             plugin_root=plugin_root,
             project_yaml=default_project,
         )
```
```diff
@@ -60,129 +92,47 @@ def handle(
     # Discover sources by scanning sources_dir YAMLs
     # Default to dataset-scoped project config
     proj_path = default_project or resolve_project_yaml_path(root_dir)
-
-    ensure_project_scaffold(proj_path)
-    sources_dir = resolve_sources_dir(proj_path)
-    source_options: list[str] = []
-    if sources_dir.exists():
-        # Recursively scan YAMLs and read declared source id (alias)
-        from datapipeline.utils.load import load_yaml
-        from datapipeline.services.constants import PARSER_KEY, LOADER_KEY, SOURCE_ID_KEY
-        for p in sorted(sources_dir.rglob("*.y*ml")):
-            try:
-                data = load_yaml(p)
-            except Exception:
-                continue
-            if isinstance(data, dict) and isinstance(data.get(PARSER_KEY), dict) and isinstance(data.get(LOADER_KEY), dict):
-                alias = data.get(SOURCE_ID_KEY)
-                if isinstance(alias, str):
-                    source_options.append(alias)
-    source_options = sorted(set(source_options))
+    source_options = list_sources(proj_path)
     if not source_options:
-
-        raise SystemExit(2)
+        error_exit("No sources found. Create one first (jerry source create ...)")
 
-    src_key =
-        "Select a source for the contract:", source_options)
+    src_key = pick_from_list("Select source:", source_options)
     # Expect aliases as 'provider.dataset' (from source file's id)
     parts = src_key.split(".", 1)
     if len(parts) != 2:
-
-        raise SystemExit(2)
+        error_exit("Source alias must be 'provider.dataset' (from source file's id)")
     provider, dataset = parts[0], parts[1]
 
-
-    base = resolve_base_pkg_dir(root_dir, name)
-    domain_options = []
-    for dirname in ("domains",):
-        dom_dir = base / dirname
-        if dom_dir.exists():
-            domain_options.extend(
-                [p.name for p in dom_dir.iterdir() if p.is_dir()
-                 and (p / "model.py").exists()]
-            )
-    domain_options = sorted(set(domain_options))
+    domain_options = list_domains(root=plugin_root)
     if not domain_options:
         domain_options = sorted(
             read_group_entries(pyproject, FILTERS_GROUP).keys())
         if not domain_options:
-
-            raise SystemExit(2)
+            error_exit("No domains found. Create one first (jerry domain create ...)")
 
-    dom_name =
-        "Select a domain to contract with:", domain_options)
-
-    def _slug(s: str) -> str:
-        s = s.strip().lower()
-        s = re.sub(r"[^a-z0-9]+", "_", s)
-        return s.strip("_")
+    dom_name = pick_from_list("Select domain:", domain_options)
 
     if use_identity:
         mapper_ep = "identity"
-
+        status("ok", "Using built-in mapper entry point 'identity'.")
     else:
-
-
-
-            provider=provider,
-            dataset=dataset,
+        mapper_ep = _select_mapper(
+            allow_identity=True,
+            allow_create=True,
             root=plugin_root,
         )
-        ep_key = f"{_slug(dom_name)}.{_slug(dataset)}"
-        print(f"[ok] Registered mapper entry point as '{ep_key}'.")
-        mapper_ep = ep_key
 
     # Derive canonical stream id as domain.dataset[.variant]
-
+    info("Optional variant suffix (press Enter to skip):")
     variant = input("> ").strip()
-
-        canonical_alias = f"{_slug(dom_name)}.{_slug(dataset)}.{_slug(variant)}"
-    else:
-        canonical_alias = f"{_slug(dom_name)}.{_slug(dataset)}"
+    stream_id = choose_name("Stream id", default=default_stream_id(dom_name, dataset, variant or None))
 
-
-
-
-
-
-
-    try:
-        # Ensure streams_path is a directory path
-        streams_dir = streams_path if streams_path.is_dir() else streams_path.parent
-        streams_dir.mkdir(parents=True, exist_ok=True)
-        cfile = streams_dir / f"{canonical_alias}.yaml"
-        # Build a richer scaffold as YAML text to preserve comments
-        scaffold = f"""
-kind: ingest
-source: {src_key}
-id: {canonical_alias}  # format: domain.dataset.(variant)
-
-mapper:
-  entrypoint: {mapper_ep}
-  args: {{}}
-
-# partition_by: <field or [fields]>
-# sort_batch_size: 100000  # in-memory sort chunk size
-
-record:  # record-level transforms
-  - filter: {{ operator: ge, field: time, comparand: "${{start_time}}" }}
-  - filter: {{ operator: le, field: time, comparand: "${{end_time}}" }}
-  # - floor_time: {{ resolution: 10m }}
-  # - lag: {{ lag: 10m }}
-
-# stream:  # per-feature transforms (input sorted by id,time)
-#   - ensure_ticks: {{ tick: 10m }}
-#   - granularity: {{ mode: first }}
-#   - fill: {{ statistic: median, window: 6, min_samples: 1 }}
-
-# debug:  # optional validation-only checks
-#   - lint: {{ mode: warn, tick: 10m }}
-"""
-        with cfile.open("w", encoding="utf-8") as f:
-            f.write(scaffold)
-        print(f"[new] canonical spec: {cfile}")
-    except Exception as e:
-        print(f"[error] Failed to write canonical spec: {e}", file=sys.stderr)
+    write_ingest_contract(
+        project_yaml=proj_path,
+        stream_id=stream_id,
+        source=src_key,
+        mapper_entrypoint=mapper_ep,
+    )
 
 
 def scaffold_conflux(
```
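`default_stream_id` (from the new `datapipeline/services/scaffold/layout.py`) replaces the inline `_slug`/`canonical_alias` logic deleted above. A behavior-compatible sketch reconstructed from that removed code; the actual function body is not shown in this diff:

```python
# Sketch of what default_stream_id would compute to match the removed inline
# logic; the real implementation in scaffold/layout.py is not part of this diff.
import re


def _slug(s: str) -> str:
    # Same normalization the removed code used.
    s = s.strip().lower()
    s = re.sub(r"[^a-z0-9]+", "_", s)
    return s.strip("_")


def default_stream_id(domain: str, dataset: str, variant: str | None = None) -> str:
    # Stream ids follow domain.dataset[.variant].
    parts = [_slug(domain), _slug(dataset)]
    if variant:
        parts.append(_slug(variant))
    return ".".join(parts)


assert default_stream_id("Equity", "OHLCV") == "equity.ohlcv"
assert default_stream_id("time", "ticks", "hour sin") == "time.ticks.hour_sin"
```

The second assertion matches the contract filenames shipped in the new demo skeleton (`time.ticks.hour_sin.yaml`).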
```diff
@@ -200,174 +150,86 @@ def scaffold_conflux(
     mapper_path default: <pkg>.domains.<domain>:mapper where domain = stream_id.split('.')[0]
     """
     root_dir, name, _ = pkg_root(plugin_root)
-    # Resolve default project path early for interactive selections
     proj_path = project_yaml or resolve_project_yaml_path(root_dir)
-    ensure_project_scaffold(proj_path)
-    # Defer target domain selection until after choosing inputs
-
-    # We will write the contract after selecting inputs and target domain
-    # Build inputs string first: interactive select, then target domain
     if not inputs:
-
-        streams: list[str] = []
-        sdir = resolve_streams_dir(proj_path)
-        if sdir.exists():
-            from datapipeline.utils.load import load_yaml
-            from datapipeline.services.constants import STREAM_ID_KEY
-            for p in sorted(sdir.rglob("*.y*ml")):
-                try:
-                    data = load_yaml(p)
-                except Exception:
-                    continue
-                if isinstance(data, dict) and data.get("kind") in {"ingest", "composed"}:
-                    sid = data.get(STREAM_ID_KEY)
-                    if isinstance(sid, str) and sid:
-                        streams.append(sid)
-        streams = sorted(set(streams))
+        streams = list_streams(proj_path)
         if not streams:
-
-
-
-
-
-
-
-
-
-            idxs = [int(x) for x in sel.split(',') if x.strip()]
-        except ValueError:
-            print("[error] Invalid selection.", file=sys.stderr)
-            raise SystemExit(2)
-        picked = []
-        for i in idxs:
-            if 1 <= i <= len(streams):
-                picked.append(streams[i-1])
-        if not picked:
-            print("[error] No inputs selected.", file=sys.stderr)
-            raise SystemExit(2)
-        # Build default aliases using domain+variant to avoid collisions.
-        # Stream id format: domain.dataset.variant (variant optional)
-        built = []
-        for ref in picked:
-            parts = ref.split(".")
-            if len(parts) >= 3:
-                domain, variant = parts[0], parts[-1]
-                alias = f"{domain}_{variant}"
-            elif len(parts) == 2:
-                # No explicit variant -> use domain as alias
-                alias = parts[0]
-            else:
-                # Fallback to full ref if unexpected
-                alias = ref
-            built.append(f"{alias}={ref}")
-        inputs = ",".join(built)
-
-    # YAML list items do not need commas; avoid embedding commas in item text
-    inputs_list = "\n  - ".join(
-        s.strip() for s in inputs.split(",") if s.strip()
-    )
+            error_exit("No canonical streams found. Create them first via 'jerry contract' (ingest).")
+        picked = pick_multiple_from_list(
+            "Select one or more input streams (comma-separated numbers):",
+            streams,
+        )
+        inputs_list, driver_key = compose_inputs(picked)
+    else:
+        inputs_list = "\n  - ".join(s.strip() for s in inputs.split(",") if s.strip())
+        driver_key = inputs.split(",")[0].split("=")[0].strip()
 
     # If no stream_id, select target domain now and derive stream id (mirror ingest flow)
     if not stream_id:
-
-        domain_options: list[str] = []
-        dom_dir = base / "domains"
-        if dom_dir.exists():
-            domain_options.extend(
-                [p.name for p in dom_dir.iterdir() if p.is_dir()
-                 and (p / "model.py").exists()]
-            )
-        domain_options = sorted(set(domain_options))
+        domain_options = list_domains(root=plugin_root)
         if not domain_options:
-
-
-        print("Select a target domain for the composed stream:", file=sys.stderr)
+            error_exit("No domains found. Create one first (jerry domain create ...)")
+        info("Select domain:")
         for i, opt in enumerate(domain_options, 1):
-
+            info(f"  [{i}] {opt}")
         sel = input("> ").strip()
         try:
             idx = int(sel)
             if idx < 1 or idx > len(domain_options):
                 raise ValueError
         except Exception:
-
-            raise SystemExit(2)
+            error_exit("Invalid selection.")
        domain = domain_options[idx - 1]
         stream_id = f"{domain}.processed"
-        # Default mapper path uses import-safe package dir, not project name
-        pkg_base = resolve_base_pkg_dir(root_dir, name).name
-        mapper_path = mapper_path or f"{pkg_base}.mappers.{domain}:mapper"
     else:
-        domain = stream_id.split(
-
-
-
-
-
-
-
-
-
-
-
-
-"""
-from typing import Iterator, Mapping
-from datapipeline.domain.record import TemporalRecord
-
-
-def mapper(
-    inputs: Mapping[str, Iterator[TemporalRecord]],
-    *, driver: str | None = None, aux: Mapping[str, Iterator[TemporalRecord]] | None = None, context=None, **params
-) -> Iterator[TemporalRecord]:
-    # TODO: implement domain math; inputs are ordered/regularized; aux is raw
-    key = driver or next(iter(inputs.keys()))
-    for rec in inputs[key]:
-        yield rec  # replace with your dataclass and computation
-""".lstrip()
+        domain = stream_id.split(".")[0]
+
+    # Mapper selection for composed contracts (no identity)
+    if not mapper_path:
+        mappers = list_mappers(root=plugin_root)
+        if mappers:
+            choice = pick_from_menu(
+                "Mapper:",
+                [
+                    ("create", "Create new composed mapper (default)"),
+                    ("existing", "Select existing mapper"),
+                    ("custom", "Custom mapper"),
+                ],
             )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        "
-
-
-        #
-
-
-
-
-
-
-
-
-
-
-
-
-  - {inputs_list}
-
-mapper:
-  entrypoint: {mapper_path}
-  args: {{ driver: {(inputs.split(',')[0].split('=')[0].strip() if '=' in inputs.split(',')[0] else inputs.split(',')[0].strip())} }}
-"""
-    cfile.write_text(yaml_text.strip() + "\n", encoding="utf-8")
-    print(f"[new] composed contract: {cfile}")
+        else:
+            choice = pick_from_menu(
+                "Mapper:",
+                [
+                    ("create", "Create new composed mapper (default)"),
+                    ("custom", "Custom mapper"),
+                ],
+            )
+        if choice == "existing":
+            mapper_path = pick_from_menu(
+                "Select mapper entrypoint:",
+                [(k, k) for k in sorted(mappers.keys())],
+            )
+            with_mapper_stub = False
+        elif choice == "create":
+            with_mapper_stub = True
+        else:
+            mapper_path = input("Mapper entrypoint: ").strip()
+            if not mapper_path:
+                error_exit("Mapper entrypoint is required")
+            with_mapper_stub = False
+
+    # Optional mapper stub under mappers/ (composed signature)
+    if with_mapper_stub:
+        mapper_path = create_composed_mapper(
+            domain=domain,
+            stream_id=stream_id,
+            root=plugin_root,
+            mapper_path=mapper_path,
+        )
+    write_composed_contract(
+        project_yaml=proj_path,
+        stream_id=stream_id,
+        inputs_list=inputs_list,
+        mapper_entrypoint=mapper_path,
+        driver_key=driver_key,
+    )
```
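`compose_inputs` (imported from the new `contract_yaml` module) absorbs the alias-building loop deleted above. A sketch reconstructed from that removed logic, deriving the driver key the same way the new `else:` branch does; the real implementation is not shown in this diff:

```python
# Sketch of compose_inputs, reconstructed from the inline alias-building loop
# this release deletes; the actual function in
# datapipeline/services/scaffold/contract_yaml.py is not part of this diff.
def compose_inputs(picked: list[str]) -> tuple[str, str]:
    # Build default aliases using domain+variant to avoid collisions.
    # Stream id format: domain.dataset.variant (variant optional).
    built: list[str] = []
    for ref in picked:
        parts = ref.split(".")
        if len(parts) >= 3:
            domain, variant = parts[0], parts[-1]
            alias = f"{domain}_{variant}"
        elif len(parts) == 2:
            alias = parts[0]  # no explicit variant -> use domain as alias
        else:
            alias = ref  # fallback to full ref if unexpected
        built.append(f"{alias}={ref}")
    # YAML list items replace the old comma-joined string; the first alias
    # doubles as the mapper's driver key.
    inputs_list = "\n  - ".join(built)
    driver_key = built[0].split("=")[0]
    return inputs_list, driver_key
```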
```diff
--- /dev/null
+++ b/datapipeline/cli/commands/demo.py
@@ -0,0 +1,13 @@
+from pathlib import Path
+
+from datapipeline.services.scaffold.demo import scaffold_demo
+from datapipeline.services.scaffold.plugin import scaffold_plugin
+
+
+def handle(subcmd: str, *, out: str | None = None) -> None:
+    if subcmd != "init":
+        raise SystemExit(f"Unknown demo subcommand: {subcmd}")
+    demo_name = "demo"
+    target_root = Path(out or ".")
+    scaffold_plugin(demo_name, target_root)
+    scaffold_demo(target_root / demo_name)
```
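This new `demo` command composes two scaffolds: the plugin skeleton, then the demo workspace overlay shipped under `datapipeline/templates/demo_skeleton/`. Assuming the updated `entry_points.txt` wires it up as `jerry demo init` (the exact CLI spelling is not shown in this diff), the handler call amounts to:

```python
from datapipeline.cli.commands.demo import handle

# Scaffolds ./demo as a plugin skeleton, then overlays the demo workspace
# (contracts, sample JSONL data, tasks) from datapipeline/templates/demo_skeleton/.
handle("init", out=".")
```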
```diff
--- a/datapipeline/cli/commands/domain.py
+++ b/datapipeline/cli/commands/domain.py
@@ -1,14 +1,14 @@
 from pathlib import Path
 
 from datapipeline.services.scaffold.domain import create_domain
+from datapipeline.services.scaffold.utils import error_exit
 
 
 def handle(subcmd: str, domain: str | None, *, plugin_root: Path | None = None) -> None:
-    if subcmd
+    if subcmd == "create":
         if not domain:
-
-            "
+            error_exit(
+                "Domain name is required. Use 'jerry domain create <name>' "
                 "or pass -n/--name."
             )
-            raise SystemExit(2)
     create_domain(domain=domain, root=plugin_root)
```
```diff
--- /dev/null
+++ b/datapipeline/cli/commands/dto.py
@@ -0,0 +1,11 @@
+from pathlib import Path
+
+from datapipeline.services.scaffold.dto import create_dto
+from datapipeline.services.scaffold.utils import status, prompt_required
+
+
+def handle(name: str | None, *, plugin_root: Path | None = None) -> None:
+    if not name:
+        name = prompt_required("DTO class name")
+    create_dto(name=name, root=plugin_root)
+    status("ok", "DTO ready.")
```
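`prompt_required` is another helper from the new scaffold utils whose body is not part of this diff; the call site only requires that it return a non-empty string. A minimal re-prompting sketch (it could equally exit on empty input):

```python
# Hypothetical sketch of prompt_required; only its call signature above is
# confirmed by this diff.
def prompt_required(label: str) -> str:
    while True:
        value = input(f"{label}: ").strip()
        if value:
            return value
        print(f"[error] {label} is required.")
```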
```diff
--- a/datapipeline/cli/commands/filter.py
+++ b/datapipeline/cli/commands/filter.py
@@ -1,9 +1,9 @@
 from datapipeline.services.scaffold.filter import create_filter
+from datapipeline.services.scaffold.utils import error_exit
 
 
 def handle(subcmd: str, name: str | None) -> None:
     if subcmd == "create":
         if not name:
-
-            raise SystemExit(2)
+            error_exit("--name is required for filter create")
         create_filter(name=name, root=None)
```
```diff
--- a/datapipeline/cli/commands/inspect.py
+++ b/datapipeline/cli/commands/inspect.py
@@ -402,71 +402,3 @@ def partitions(
         work=_work,
     )
 
-
-def expected(
-    project: str,
-    *,
-    output: str | None = None,
-    visuals: str | None = None,
-    progress: str | None = None,
-    log_level: int | None = None,
-    workspace=None,
-) -> None:
-    """Discover complete set of observed full feature IDs and write a list.
-
-    Writes newline-separated ids to `<paths.artifacts>/expected.txt` by default.
-    """
-
-    _prepare_inspect_build(
-        project,
-        visuals=visuals,
-        progress=progress,
-        workspace=workspace,
-    )
-
-    def _work(dataset_ctx, progress_style):
-        project_path = dataset_ctx.project
-        dataset = dataset_ctx.dataset
-        feature_cfgs = list(dataset.features or [])
-        target_cfgs = list(dataset.targets or [])
-
-        context = dataset_ctx.pipeline_context
-        vectors = build_vector_pipeline(
-            context,
-            feature_cfgs,
-            dataset.group_by,
-            target_configs=target_cfgs,
-        )
-        vector_iter = _iter_with_progress(
-            vectors,
-            progress_style=progress_style,
-            label="Processing vectors",
-        )
-        ids: set[str] = set()
-        for sample in vector_iter:
-            ids.update(sample.features.values.keys())
-            if sample.targets:
-                ids.update(sample.targets.values.keys())
-
-        try:
-            default_path = artifacts_root(project_path) / "expected.txt"
-        except Exception as e:
-            raise RuntimeError(
-                f"{e}. Set `paths.artifacts` in your project.yaml to a writable directory."
-            )
-        output_path = Path(output) if output else default_path
-        ensure_parent(output_path)
-        with output_path.open("w", encoding="utf-8") as fh:
-            for fid in sorted(ids):
-                fh.write(f"{fid}\n")
-        print(f"[write] Saved expected feature list to {output_path} ({len(ids)} ids)")
-
-    _run_inspect_job(
-        project,
-        visuals=visuals,
-        progress=progress,
-        log_level=log_level,
-        label="Inspect expected ids",
-        section="expected",
-        work=_work,
-    )
```