jerry-thomas 1.0.3__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datapipeline/analysis/vector/collector.py +0 -1
- datapipeline/build/tasks/config.py +0 -2
- datapipeline/build/tasks/metadata.py +0 -2
- datapipeline/build/tasks/scaler.py +0 -2
- datapipeline/build/tasks/schema.py +0 -2
- datapipeline/build/tasks/utils.py +0 -2
- datapipeline/cli/app.py +201 -81
- datapipeline/cli/commands/contract.py +145 -283
- datapipeline/cli/commands/demo.py +13 -0
- datapipeline/cli/commands/domain.py +4 -4
- datapipeline/cli/commands/dto.py +11 -0
- datapipeline/cli/commands/filter.py +2 -2
- datapipeline/cli/commands/inspect.py +0 -68
- datapipeline/cli/commands/list_.py +30 -13
- datapipeline/cli/commands/loader.py +11 -0
- datapipeline/cli/commands/mapper.py +82 -0
- datapipeline/cli/commands/parser.py +45 -0
- datapipeline/cli/commands/run_config.py +1 -3
- datapipeline/cli/commands/serve_pipeline.py +5 -7
- datapipeline/cli/commands/source.py +106 -18
- datapipeline/cli/commands/stream.py +286 -0
- datapipeline/cli/visuals/common.py +0 -2
- datapipeline/cli/visuals/sections.py +0 -2
- datapipeline/cli/workspace_utils.py +0 -3
- datapipeline/config/context.py +0 -2
- datapipeline/config/dataset/feature.py +1 -0
- datapipeline/config/metadata.py +0 -2
- datapipeline/config/project.py +0 -2
- datapipeline/config/resolution.py +10 -2
- datapipeline/config/tasks.py +9 -9
- datapipeline/domain/feature.py +3 -0
- datapipeline/domain/record.py +7 -7
- datapipeline/domain/sample.py +0 -2
- datapipeline/domain/vector.py +6 -8
- datapipeline/integrations/ml/adapter.py +0 -2
- datapipeline/integrations/ml/pandas_support.py +0 -2
- datapipeline/integrations/ml/rows.py +0 -2
- datapipeline/integrations/ml/torch_support.py +0 -2
- datapipeline/io/output.py +0 -2
- datapipeline/io/serializers.py +26 -16
- datapipeline/mappers/synthetic/time.py +9 -2
- datapipeline/pipeline/artifacts.py +3 -5
- datapipeline/pipeline/observability.py +0 -2
- datapipeline/pipeline/pipelines.py +118 -34
- datapipeline/pipeline/stages.py +42 -17
- datapipeline/pipeline/utils/spool_cache.py +142 -0
- datapipeline/pipeline/utils/transform_utils.py +27 -2
- datapipeline/services/artifacts.py +1 -4
- datapipeline/services/constants.py +1 -0
- datapipeline/services/factories.py +4 -6
- datapipeline/services/project_paths.py +0 -2
- datapipeline/services/runs.py +0 -2
- datapipeline/services/scaffold/contract_yaml.py +76 -0
- datapipeline/services/scaffold/demo.py +141 -0
- datapipeline/services/scaffold/discovery.py +115 -0
- datapipeline/services/scaffold/domain.py +21 -13
- datapipeline/services/scaffold/dto.py +31 -0
- datapipeline/services/scaffold/filter.py +2 -1
- datapipeline/services/scaffold/layout.py +96 -0
- datapipeline/services/scaffold/loader.py +61 -0
- datapipeline/services/scaffold/mapper.py +116 -0
- datapipeline/services/scaffold/parser.py +56 -0
- datapipeline/services/scaffold/plugin.py +14 -2
- datapipeline/services/scaffold/source_yaml.py +91 -0
- datapipeline/services/scaffold/stream_plan.py +110 -0
- datapipeline/services/scaffold/utils.py +187 -0
- datapipeline/sources/data_loader.py +0 -2
- datapipeline/sources/decoders.py +49 -8
- datapipeline/sources/factory.py +9 -6
- datapipeline/sources/foreach.py +18 -3
- datapipeline/sources/synthetic/time/parser.py +1 -1
- datapipeline/sources/transports.py +10 -4
- datapipeline/templates/demo_skeleton/demo/contracts/equity.ohlcv.yaml +33 -0
- datapipeline/templates/demo_skeleton/demo/contracts/time.ticks.hour_sin.yaml +22 -0
- datapipeline/templates/demo_skeleton/demo/contracts/time.ticks.linear.yaml +22 -0
- datapipeline/templates/demo_skeleton/demo/data/APPL.jsonl +19 -0
- datapipeline/templates/demo_skeleton/demo/data/MSFT.jsonl +19 -0
- datapipeline/templates/demo_skeleton/demo/dataset.yaml +19 -0
- datapipeline/templates/demo_skeleton/demo/postprocess.yaml +19 -0
- datapipeline/templates/demo_skeleton/demo/project.yaml +19 -0
- datapipeline/templates/demo_skeleton/demo/sources/sandbox.ohlcv.yaml +17 -0
- datapipeline/templates/{plugin_skeleton/example → demo_skeleton/demo}/sources/synthetic.ticks.yaml +1 -1
- datapipeline/templates/demo_skeleton/demo/tasks/metadata.yaml +2 -0
- datapipeline/templates/demo_skeleton/demo/tasks/scaler.yaml +3 -0
- datapipeline/templates/demo_skeleton/demo/tasks/schema.yaml +2 -0
- datapipeline/templates/demo_skeleton/demo/tasks/serve.test.yaml +4 -0
- datapipeline/templates/demo_skeleton/demo/tasks/serve.train.yaml +4 -0
- datapipeline/templates/demo_skeleton/demo/tasks/serve.val.yaml +4 -0
- datapipeline/templates/demo_skeleton/scripts/run_dataframe.py +20 -0
- datapipeline/templates/demo_skeleton/scripts/run_torch.py +23 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/__init__.py +0 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/domains/equity/__init__.py +0 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/domains/equity/model.py +18 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/dtos/__init__.py +0 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/dtos/sandbox_ohlcv_dto.py +14 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/mappers/__init__.py +0 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/mappers/map_sandbox_ohlcv_dto_to_equity.py +26 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/parsers/__init__.py +0 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/parsers/sandbox_ohlcv_dto_parser.py +46 -0
- datapipeline/templates/plugin_skeleton/README.md +57 -136
- datapipeline/templates/plugin_skeleton/jerry.yaml +12 -24
- datapipeline/templates/plugin_skeleton/reference/jerry.yaml +28 -0
- datapipeline/templates/plugin_skeleton/reference/reference/contracts/composed.reference.yaml +29 -0
- datapipeline/templates/plugin_skeleton/reference/reference/contracts/ingest.reference.yaml +31 -0
- datapipeline/templates/plugin_skeleton/reference/reference/contracts/overview.reference.yaml +34 -0
- datapipeline/templates/plugin_skeleton/reference/reference/dataset.yaml +29 -0
- datapipeline/templates/plugin_skeleton/reference/reference/postprocess.yaml +25 -0
- datapipeline/templates/plugin_skeleton/reference/reference/project.yaml +32 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/foreach.http.reference.yaml +24 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/foreach.reference.yaml +21 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/fs.reference.yaml +16 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/http.reference.yaml +17 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/overview.reference.yaml +18 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/synthetic.reference.yaml +15 -0
- datapipeline/templates/plugin_skeleton/reference/reference/tasks/metadata.reference.yaml +11 -0
- datapipeline/templates/plugin_skeleton/reference/reference/tasks/scaler.reference.yaml +10 -0
- datapipeline/templates/plugin_skeleton/reference/reference/tasks/schema.reference.yaml +10 -0
- datapipeline/templates/plugin_skeleton/reference/reference/tasks/serve.reference.yaml +28 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/domains/__init__.py +2 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/dtos/__init__.py +0 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/loaders/__init__.py +0 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/mappers/__init__.py +1 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/parsers/__init__.py +0 -0
- datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +12 -11
- datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +4 -13
- datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +7 -10
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +1 -2
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +1 -7
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +1 -1
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +1 -1
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +1 -25
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +1 -1
- datapipeline/templates/plugin_skeleton/your-interim-data-builder/dataset.yaml +9 -0
- datapipeline/templates/plugin_skeleton/your-interim-data-builder/postprocess.yaml +1 -0
- datapipeline/templates/plugin_skeleton/your-interim-data-builder/project.yaml +14 -0
- datapipeline/templates/plugin_skeleton/your-interim-data-builder/tasks/serve.all.yaml +8 -0
- datapipeline/templates/stubs/contracts/composed.yaml.j2 +10 -0
- datapipeline/templates/stubs/contracts/ingest.yaml.j2 +25 -0
- datapipeline/templates/stubs/dto.py.j2 +1 -1
- datapipeline/templates/stubs/loaders/basic.py.j2 +11 -0
- datapipeline/templates/stubs/mappers/composed.py.j2 +13 -0
- datapipeline/templates/stubs/mappers/ingest.py.j2 +17 -0
- datapipeline/templates/stubs/parser.py.j2 +4 -0
- datapipeline/templates/stubs/record.py.j2 +0 -1
- datapipeline/templates/stubs/source.yaml.j2 +1 -1
- datapipeline/transforms/debug/identity.py +34 -16
- datapipeline/transforms/debug/lint.py +14 -11
- datapipeline/transforms/feature/scaler.py +5 -12
- datapipeline/transforms/filter.py +73 -17
- datapipeline/transforms/interfaces.py +58 -0
- datapipeline/transforms/record/floor_time.py +10 -7
- datapipeline/transforms/record/lag.py +8 -10
- datapipeline/transforms/sequence.py +2 -3
- datapipeline/transforms/stream/dedupe.py +5 -7
- datapipeline/transforms/stream/ensure_ticks.py +39 -24
- datapipeline/transforms/stream/fill.py +34 -25
- datapipeline/transforms/stream/filter.py +25 -0
- datapipeline/transforms/stream/floor_time.py +16 -0
- datapipeline/transforms/stream/granularity.py +52 -30
- datapipeline/transforms/stream/lag.py +17 -0
- datapipeline/transforms/stream/rolling.py +72 -0
- datapipeline/transforms/utils.py +42 -10
- datapipeline/transforms/vector/drop/horizontal.py +0 -3
- datapipeline/transforms/vector/drop/orchestrator.py +0 -3
- datapipeline/transforms/vector/drop/vertical.py +0 -2
- datapipeline/transforms/vector/ensure_schema.py +0 -2
- datapipeline/utils/paths.py +0 -2
- datapipeline/utils/placeholders.py +0 -2
- datapipeline/utils/rich_compat.py +0 -3
- datapipeline/utils/window.py +0 -2
- jerry_thomas-2.0.0.dist-info/METADATA +282 -0
- jerry_thomas-2.0.0.dist-info/RECORD +264 -0
- {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.0.dist-info}/WHEEL +1 -1
- {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.0.dist-info}/entry_points.txt +7 -3
- datapipeline/services/scaffold/mappers.py +0 -55
- datapipeline/services/scaffold/source.py +0 -191
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +0 -31
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +0 -30
- datapipeline/templates/plugin_skeleton/example/dataset.yaml +0 -18
- datapipeline/templates/plugin_skeleton/example/postprocess.yaml +0 -29
- datapipeline/templates/plugin_skeleton/example/project.yaml +0 -23
- datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +0 -3
- datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +0 -9
- datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +0 -2
- datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +0 -4
- datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +0 -28
- datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +0 -4
- datapipeline/templates/stubs/mapper.py.j2 +0 -22
- jerry_thomas-1.0.3.dist-info/METADATA +0 -827
- jerry_thomas-1.0.3.dist-info/RECORD +0 -198
- {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.0.dist-info}/licenses/LICENSE +0 -0
- {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
from datapipeline.services.project_paths import streams_dir as resolve_streams_dir, ensure_project_scaffold
|
|
4
|
+
from datapipeline.services.scaffold.templates import render
|
|
5
|
+
from datapipeline.services.scaffold.utils import status
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def write_ingest_contract(
    *,
    project_yaml: Path,
    stream_id: str,
    source: str,
    mapper_entrypoint: str,
) -> Path:
    """Render an ingest contract YAML for *stream_id* into the project's streams dir.

    Returns the path of the contract file that was written.
    """
    ensure_project_scaffold(project_yaml)
    resolved = resolve_streams_dir(project_yaml)
    # resolve_streams_dir may hand back the directory itself or a path inside
    # it; normalize to the containing directory either way.
    target_dir = resolved if resolved.is_dir() else resolved.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    contract_file = target_dir / f"{stream_id}.yaml"
    rendered = render(
        "contracts/ingest.yaml.j2",
        source=source,
        stream_id=stream_id,
        mapper_entrypoint=mapper_entrypoint,
    )
    contract_file.write_text(rendered, encoding="utf-8")
    status("new", f"canonical spec: {contract_file}")
    return contract_file
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def write_composed_contract(
    *,
    project_yaml: Path,
    stream_id: str,
    inputs_list: str,
    mapper_entrypoint: str,
    driver_key: str,
) -> Path:
    """Render a composed contract YAML for *stream_id* into the streams dir.

    Returns the path of the contract file that was written.
    """
    ensure_project_scaffold(project_yaml)
    resolved = resolve_streams_dir(project_yaml)
    # Normalize: resolve_streams_dir may return the dir or a path inside it.
    target_dir = resolved if resolved.is_dir() else resolved.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    contract_file = target_dir / f"{stream_id}.yaml"
    rendered = render(
        "contracts/composed.yaml.j2",
        stream_id=stream_id,
        inputs_list=inputs_list,
        mapper_entrypoint=mapper_entrypoint,
        driver_key=driver_key,
    )
    # Normalize trailing whitespace from the template to a single newline.
    contract_file.write_text(rendered.strip() + "\n", encoding="utf-8")
    status("new", f"composed contract: {contract_file}")
    return contract_file
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def compose_inputs(picked: list[str]) -> tuple[str, str]:
    """Build the composed-contract inputs list text and the driver key.

    Each stream ref in *picked* gets an alias:
      - "domain.dataset.variant" (3+ dot parts) -> "domain_variant"
      - "domain.dataset" (2 parts)              -> "domain"
      - anything else                           -> the ref itself

    Returns (inputs_list, driver_key): *inputs_list* is the "alias=ref"
    entries joined with "\n - " (YAML list continuation), *driver_key* is
    the first alias ("" when *picked* is empty).
    """
    entries: list[str] = []
    for ref in picked:
        parts = ref.split(".")
        if len(parts) >= 3:
            alias = f"{parts[0]}_{parts[-1]}"
        elif len(parts) == 2:
            alias = parts[0]
        else:
            alias = ref
        entries.append(f"{alias}={ref}")
    # Join the entries directly rather than serializing to a CSV string and
    # immediately splitting it back apart (the original round-trip also broke
    # on refs containing commas).
    inputs_list = "\n - ".join(entries)
    driver_key = entries[0].split("=", 1)[0] if entries else ""
    return inputs_list, driver_key
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
from importlib.resources import as_file, files
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
import shutil
|
|
6
|
+
|
|
7
|
+
import yaml
|
|
8
|
+
|
|
9
|
+
from datapipeline.services.entrypoints import inject_ep
|
|
10
|
+
from datapipeline.services.paths import pkg_root, resolve_base_pkg_dir
|
|
11
|
+
from datapipeline.utils.load import load_yaml
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
_DEMO_DATASET_ALIAS = "demo"
|
|
16
|
+
_DEMO_PARSER_EP = "sandbox_ohlcv_dto_parser"
|
|
17
|
+
_DEMO_MAPPER_EP = "map_sandbox_ohlcv_dto_to_equity"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _replace_placeholders(path: Path, replacements: dict[str, str]) -> None:
|
|
21
|
+
if not path.is_file():
|
|
22
|
+
return
|
|
23
|
+
if path.suffix not in {".py", ".toml", ".md", ".yaml", ".yml"}:
|
|
24
|
+
return
|
|
25
|
+
text = path.read_text()
|
|
26
|
+
for placeholder, value in replacements.items():
|
|
27
|
+
text = text.replace(placeholder, value)
|
|
28
|
+
path.write_text(text)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _inject_demo_entrypoints(pyproject: Path, pkg_name: str) -> None:
    """Register the demo parser and mapper entry points in pyproject.toml."""
    registrations = (
        (
            "parsers",
            _DEMO_PARSER_EP,
            f"{pkg_name}.parsers.sandbox_ohlcv_dto_parser:SandboxOhlcvDTOParser",
        ),
        (
            "mappers",
            _DEMO_MAPPER_EP,
            f"{pkg_name}.mappers.map_sandbox_ohlcv_dto_to_equity:map_sandbox_ohlcv_dto_to_equity",
        ),
    )
    toml = pyproject.read_text()
    for group, key, target in registrations:
        toml = inject_ep(toml, group, key, target)
    pyproject.write_text(toml)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _update_workspace_jerry(
    workspace_root: Path,
    plugin_root_rel: Path,
    dataset_path: Path,
) -> None:
    """Point the workspace-level jerry.yaml at the freshly scaffolded demo."""
    workspace_jerry = workspace_root / "jerry.yaml"
    if not workspace_jerry.exists():
        # Not running inside a jerry workspace; nothing to update.
        return
    config = load_yaml(workspace_jerry)
    registered = config.get("datasets") or {}
    registered[_DEMO_DATASET_ALIAS] = (plugin_root_rel / dataset_path).as_posix()
    # Drop skeleton placeholders that point into this demo plugin.
    plugin_prefix = plugin_root_rel.as_posix()
    for stale_key in ("your-dataset", "interim-builder"):
        value = registered.get(stale_key)
        if isinstance(value, str) and value.startswith(plugin_prefix):
            registered.pop(stale_key, None)
    config["datasets"] = registered
    config["default_dataset"] = _DEMO_DATASET_ALIAS
    workspace_jerry.write_text(
        yaml.safe_dump(config, sort_keys=False), encoding="utf-8"
    )
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _copy_tree(src: Path, dest: Path) -> None:
|
|
73
|
+
if src.is_dir():
|
|
74
|
+
shutil.copytree(src, dest, dirs_exist_ok=True)
|
|
75
|
+
else:
|
|
76
|
+
dest.parent.mkdir(parents=True, exist_ok=True)
|
|
77
|
+
shutil.copy2(src, dest)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def scaffold_demo(root: Path | None = None) -> None:
    """Materialize the bundled demo dataset and demo package into a plugin.

    Copies the packaged demo_skeleton template (demo/ data + demo package
    modules), substitutes the {{PACKAGE_NAME}} placeholder, registers the
    demo parser/mapper entry points in pyproject.toml, and wires the demo
    dataset into the workspace-level jerry.yaml.

    Raises SystemExit(1) if a `demo` directory already exists under the
    plugin root.
    """
    root_dir, pkg_name, pyproject = pkg_root(root)
    # The template ships inside the datapipeline package; as_file makes it
    # usable as a real filesystem path even from a zipped install.
    demo_ref = files("datapipeline") / "templates" / "demo_skeleton"
    with as_file(demo_ref) as demo_dir:
        demo_data = demo_dir / "demo"
        demo_pkg = demo_dir / "src" / "{{PACKAGE_NAME}}"

        target_demo = root_dir / "demo"
        if target_demo.exists():
            # Refuse to overwrite an existing demo scaffold.
            logger.error("`%s` already exists", target_demo)
            raise SystemExit(1)

        _copy_tree(demo_data, target_demo)

        # Merge the demo package modules into the plugin's own package dir.
        dest_pkg = resolve_base_pkg_dir(root_dir, pkg_name)
        for item in demo_pkg.iterdir():
            _copy_tree(item, dest_pkg / item.name)

        # Remove the generic skeleton datasets the demo supersedes.
        for cleanup in ("your-dataset", "your-interim-data-builder"):
            extra = root_dir / cleanup
            if extra.exists():
                shutil.rmtree(extra)

        replacements = {
            "{{PACKAGE_NAME}}": pkg_name,
        }
        for p in target_demo.rglob("*"):
            _replace_placeholders(p, replacements)
        for p in dest_pkg.rglob("*"):
            _replace_placeholders(p, replacements)

        _inject_demo_entrypoints(pyproject, pkg_name)

        # Express the plugin root relative to the workspace; fall back to
        # os.path.relpath when the plugin lives outside the cwd tree
        # (Path.relative_to raises ValueError in that case).
        workspace_root = Path.cwd().resolve()
        try:
            plugin_root_rel = root_dir.relative_to(workspace_root)
        except ValueError:
            plugin_root_rel = Path(os.path.relpath(root_dir, workspace_root))

        _update_workspace_jerry(
            workspace_root,
            plugin_root_rel,
            Path("demo/project.yaml"),
        )

        # Rewrite demo source YAMLs to use plugin-root-relative paths so
        # workspace-level runs resolve sample data correctly.
        sources_dir = target_demo / "sources"
        for src_file in sources_dir.glob("*.yaml"):
            src_data = load_yaml(src_file)
            loader = (src_data or {}).get("loader") or {}
            args = loader.get("args") or {}
            path = args.get("path")
            if isinstance(path, str) and path and not Path(path).is_absolute():
                args["path"] = (plugin_root_rel / path).as_posix()
                loader["args"] = args
                src_data["loader"] = loader
                src_file.write_text(
                    yaml.safe_dump(src_data, sort_keys=False), encoding="utf-8"
                )

    logger.info("demo dataset created at %s", target_demo)
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
import ast
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
from datapipeline.services.paths import pkg_root, resolve_base_pkg_dir
|
|
6
|
+
from datapipeline.services.entrypoints import read_group_entries
|
|
7
|
+
from datapipeline.services.constants import PARSERS_GROUP, LOADERS_GROUP, MAPPERS_GROUP
|
|
8
|
+
from datapipeline.services.project_paths import sources_dir as resolve_sources_dir, streams_dir as resolve_streams_dir
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def list_dtos(*, root: Optional[Path] = None) -> dict[str, str]:
    """Return mapping of DTO class name -> module path."""
    root_dir, pkg_name, _ = pkg_root(root)
    base = resolve_base_pkg_dir(root_dir, pkg_name)
    dtos_dir = base / "dtos"
    if not dtos_dir.exists():
        return {}

    package_name = base.name
    discovered: dict[str, str] = {}
    for source_file in sorted(dtos_dir.glob("*.py")):
        if source_file.name == "__init__.py":
            continue
        try:
            module_ast = ast.parse(source_file.read_text())
        except Exception:
            # Discovery is best-effort: skip files that do not parse.
            continue
        module = f"{package_name}.dtos.{source_file.stem}"
        for node in module_ast.body:
            if not isinstance(node, ast.ClassDef):
                continue
            if _is_dataclass(node):
                discovered[node.name] = module
    return discovered
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _is_dataclass(node: ast.ClassDef) -> bool:
|
|
36
|
+
for deco in node.decorator_list:
|
|
37
|
+
if isinstance(deco, ast.Name) and deco.id == "dataclass":
|
|
38
|
+
return True
|
|
39
|
+
if isinstance(deco, ast.Attribute) and deco.attr == "dataclass":
|
|
40
|
+
return True
|
|
41
|
+
return False
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def list_parsers(*, root: Optional[Path] = None) -> dict[str, str]:
    """Return parser entry points declared in pyproject.toml (key -> target)."""
    return _entry_points_for_group(PARSERS_GROUP, root)


def list_loaders(*, root: Optional[Path] = None) -> dict[str, str]:
    """Return loader entry points declared in pyproject.toml (key -> target)."""
    return _entry_points_for_group(LOADERS_GROUP, root)


def list_mappers(*, root: Optional[Path] = None) -> dict[str, str]:
    """Return mapper entry points declared in pyproject.toml (key -> target)."""
    return _entry_points_for_group(MAPPERS_GROUP, root)


def _entry_points_for_group(group: str, root: Optional[Path]) -> dict[str, str]:
    """Shared lookup for one entry-point group; {} when pyproject is absent.

    (Extracted from three copies of the same body; the original also bound
    an unused root_dir local.)
    """
    _, _, pyproject = pkg_root(root)
    if not pyproject.exists():
        return {}
    return read_group_entries(pyproject, group)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def list_domains(*, root: Optional[Path] = None) -> list[str]:
    """List domain names that ship a model.py under <pkg>/domains."""
    root_dir, pkg_name, _ = pkg_root(root)
    base = resolve_base_pkg_dir(root_dir, pkg_name)
    dom_dir = base / "domains"
    if not dom_dir.exists():
        return []
    names = [
        entry.name
        for entry in dom_dir.iterdir()
        if entry.is_dir() and (entry / "model.py").exists()
    ]
    return sorted(names)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def list_sources(project_yaml: Path) -> list[str]:
    """Collect distinct source aliases from the project's sources YAML files."""
    from datapipeline.utils.load import load_yaml
    from datapipeline.services.constants import PARSER_KEY, LOADER_KEY, SOURCE_ID_KEY

    sources_dir = resolve_sources_dir(project_yaml)
    if not sources_dir.exists():
        return []
    aliases: list[str] = []
    for candidate in sorted(sources_dir.rglob("*.y*ml")):
        try:
            data = load_yaml(candidate)
        except Exception:
            # Listing is best-effort: skip unparseable YAML files.
            continue
        if not isinstance(data, dict):
            continue
        # A source file must declare both a parser and a loader mapping.
        has_parser = isinstance(data.get(PARSER_KEY), dict)
        has_loader = isinstance(data.get(LOADER_KEY), dict)
        if not (has_parser and has_loader):
            continue
        alias = data.get(SOURCE_ID_KEY)
        if isinstance(alias, str):
            aliases.append(alias)
    return sorted(set(aliases))
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def list_streams(project_yaml: Path) -> list[str]:
    """Collect distinct stream ids from ingest/composed contract YAML files."""
    from datapipeline.utils.load import load_yaml
    from datapipeline.services.constants import STREAM_ID_KEY

    streams_dir = resolve_streams_dir(project_yaml)
    if not streams_dir.exists():
        return []
    stream_ids: list[str] = []
    for candidate in sorted(streams_dir.rglob("*.y*ml")):
        try:
            data = load_yaml(candidate)
        except Exception:
            # Listing is best-effort: skip unparseable YAML files.
            continue
        if not isinstance(data, dict):
            continue
        # Only contract files of a recognized kind count as streams.
        if data.get("kind") not in {"ingest", "composed"}:
            continue
        sid = data.get(STREAM_ID_KEY)
        if isinstance(sid, str) and sid:
            stream_ids.append(sid)
    return sorted(set(stream_ids))
|
|
@@ -2,25 +2,33 @@ from pathlib import Path
|
|
|
2
2
|
from typing import Optional
|
|
3
3
|
|
|
4
4
|
from datapipeline.services.scaffold.templates import render
|
|
5
|
+
from datapipeline.services.scaffold.utils import (
|
|
6
|
+
ensure_pkg_dir,
|
|
7
|
+
to_snake,
|
|
8
|
+
validate_identifier,
|
|
9
|
+
write_if_missing,
|
|
10
|
+
)
|
|
11
|
+
from datapipeline.services.scaffold.layout import DIR_DOMAINS, TPL_DOMAIN_RECORD, domain_record_class
|
|
5
12
|
|
|
6
13
|
from ..paths import pkg_root, resolve_base_pkg_dir
|
|
7
14
|
|
|
8
15
|
|
|
9
16
|
def create_domain(*, domain: str, root: Optional[Path]) -> None:
    """Scaffold a domain package with a TemporalRecord-based model.py."""
    validate_identifier(domain, "Domain name")
    root_dir, name, _ = pkg_root(root)
    base = resolve_base_pkg_dir(root_dir, name)
    package_name = base.name
    pkg_dir = ensure_pkg_dir(base / DIR_DOMAINS, domain)
    parent = "TemporalRecord"
    model_source = render(
        TPL_DOMAIN_RECORD,
        PACKAGE_NAME=package_name,
        DOMAIN=domain,
        CLASS_NAME=domain_record_class(domain),
        PARENT_CLASS=parent,
        time_aware=True,
    )
    write_if_missing(pkg_dir / "model.py", model_source, label="Domain")
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
from datapipeline.services.scaffold.templates import render
|
|
5
|
+
from datapipeline.services.paths import pkg_root, resolve_base_pkg_dir
|
|
6
|
+
from datapipeline.services.scaffold.utils import (
|
|
7
|
+
ensure_pkg_dir,
|
|
8
|
+
to_snake,
|
|
9
|
+
validate_identifier,
|
|
10
|
+
write_if_missing,
|
|
11
|
+
)
|
|
12
|
+
from datapipeline.services.scaffold.layout import DIR_DTOS, TPL_DTO
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def create_dto(*, name: str, root: Optional[Path]) -> None:
    """Scaffold a DTO module under <pkg>/dtos from the DTO template."""
    validate_identifier(name, "DTO name")

    root_dir, pkg_name, _ = pkg_root(root)
    base = resolve_base_pkg_dir(root_dir, pkg_name)
    dtos_dir = ensure_pkg_dir(base, DIR_DTOS)
    module_name = to_snake(name)
    target = dtos_dir / f"{module_name}.py"
    rendered = render(TPL_DTO, CLASS_NAME=name, DOMAIN=name)
    write_if_missing(target, rendered, label="DTO")
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from pathlib import Path
|
|
2
2
|
from typing import Optional
|
|
3
3
|
from datapipeline.services.scaffold.templates import render
|
|
4
|
+
from datapipeline.services.scaffold.utils import status
|
|
4
5
|
from ..constants import FILTERS_GROUP
|
|
5
6
|
from ..entrypoints import inject_ep
|
|
6
7
|
from ..paths import pkg_root, resolve_base_pkg_dir
|
|
@@ -19,7 +20,7 @@ def create_filter(*, name: str, root: Optional[Path]) -> None:
|
|
|
19
20
|
path = filters_dir / f"{module_name}.py"
|
|
20
21
|
if not path.exists():
|
|
21
22
|
path.write_text(render("filter.py.j2", FUNCTION_NAME=name))
|
|
22
|
-
|
|
23
|
+
status("new", str(path))
|
|
23
24
|
|
|
24
25
|
# Register entry point under datapipeline.filters
|
|
25
26
|
toml_path = root_dir / "pyproject.toml"
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
import re
|
|
3
|
+
|
|
4
|
+
from datapipeline.services.scaffold.templates import camel
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# Compiled once: the two classic camel->snake boundary patterns and the
# slug separator run.
_SNAKE_PASS_1 = re.compile(r"(.)([A-Z][a-z]+)")
_SNAKE_PASS_2 = re.compile(r"([a-z0-9])([A-Z])")
_SLUG_SEP = re.compile(r"[^a-z0-9]+")


def to_snake(name: str) -> str:
    """Convert CamelCase / mixedCase *name* to snake_case."""
    interim = _SNAKE_PASS_1.sub(r"\1_\2", name)
    return _SNAKE_PASS_2.sub(r"\1_\2", interim).lower()


def slugify(text: str) -> str:
    """Lowercase *text* and collapse non-alphanumeric runs to underscores."""
    return _SLUG_SEP.sub("_", text.strip().lower()).strip("_")


def ep_key_from_name(name: str) -> str:
    """Entry-point keys are simply the snake_case form of the name."""
    return to_snake(name)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# Directory names: canonical sub-package folders inside a plugin package.
DIR_DTOS = "dtos"
DIR_PARSERS = "parsers"
DIR_LOADERS = "loaders"
DIR_MAPPERS = "mappers"
DIR_DOMAINS = "domains"

# Template paths: stub templates rendered by the scaffold commands
# (paths are relative to the stubs template root).
TPL_DTO = "dto.py.j2"
TPL_PARSER = "parser.py.j2"
TPL_LOADER_BASIC = "loaders/basic.py.j2"
TPL_LOADER_SYNTHETIC = "loader_synthetic.py.j2"
TPL_MAPPER_INGEST = "mappers/ingest.py.j2"
TPL_MAPPER_COMPOSED = "mappers/composed.py.j2"
TPL_DOMAIN_RECORD = "record.py.j2"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def class_name_with_suffix(name: str, suffix: str) -> str:
    """CamelCase *name* and append *suffix* (e.g. "Loader", "Record")."""
    return f"{camel(name)}{suffix}"


def loader_class_name(name: str) -> str:
    """Class name for a generated loader."""
    return class_name_with_suffix(name, "Loader")


def domain_record_class(domain: str) -> str:
    """Class name for a generated domain record."""
    return class_name_with_suffix(domain, "Record")


def loader_template_name(template: str) -> str:
    """Map a loader template key to its stub path ("synthetic" or basic)."""
    return TPL_LOADER_SYNTHETIC if template == "synthetic" else TPL_LOADER_BASIC


def dto_class_name(base: str) -> str:
    """Class name for a generated DTO."""
    return class_name_with_suffix(base, "DTO")
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def dto_module_path(package: str, dto_class: str) -> str:
|
|
60
|
+
return f"{package}.{DIR_DTOS}.{to_snake(dto_class)}"
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def default_parser_name(dto_class: str) -> str:
|
|
64
|
+
return f"{dto_class}Parser"
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def default_mapper_name(input_module: str, domain: str) -> str:
|
|
68
|
+
input_mod = input_module.rsplit(".", 1)[-1]
|
|
69
|
+
return f"map_{input_mod}_to_{domain}"
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def default_stream_id(domain: str, dataset: str, variant: str | None = None) -> str:
|
|
73
|
+
base = f"{slugify(domain)}.{slugify(dataset)}"
|
|
74
|
+
return f"{base}.{slugify(variant)}" if variant else base
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
# Prompt labels (keep CLI wording consistent across scaffold commands).
LABEL_DTO_FOR_PARSER = "DTO for parser"
LABEL_DTO_FOR_MAPPER = "DTO for mapper"
LABEL_DOMAIN_TO_MAP = "Domain"
LABEL_MAPPER_INPUT = "Mapper input"
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def default_mapper_name_for_identity(domain: str) -> str:
    """Mapper name used when a stream maps a domain onto itself."""
    return f"map_identity_to_{slugify(domain)}"


def pyproject_path(root_dir: Path) -> Path:
    """Location of pyproject.toml under a plugin root."""
    return root_dir / "pyproject.toml"


def module_path(package: str, group: str, module: str) -> str:
    """Dotted path "<package>.<group>.<module>"."""
    return ".".join((package, group, module))


def entrypoint_target(package: str, group: str, module: str, attr: str) -> str:
    """Entry-point target string "<module path>:<attr>"."""
    return f"{module_path(package, group, module)}:{attr}"
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
from datapipeline.services.paths import pkg_root, resolve_base_pkg_dir
|
|
5
|
+
from datapipeline.services.scaffold.templates import render
|
|
6
|
+
from datapipeline.services.scaffold.utils import (
|
|
7
|
+
ensure_pkg_dir,
|
|
8
|
+
ep_key_from_name,
|
|
9
|
+
to_snake,
|
|
10
|
+
validate_identifier,
|
|
11
|
+
write_if_missing,
|
|
12
|
+
)
|
|
13
|
+
from datapipeline.services.scaffold.layout import (
|
|
14
|
+
DIR_LOADERS,
|
|
15
|
+
entrypoint_target,
|
|
16
|
+
loader_class_name,
|
|
17
|
+
loader_template_name,
|
|
18
|
+
pyproject_path,
|
|
19
|
+
)
|
|
20
|
+
from datapipeline.services.entrypoints import inject_ep
|
|
21
|
+
from datapipeline.services.constants import LOADERS_GROUP
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def create_loader(
    *,
    name: str,
    root: Optional[Path],
    template: str = "basic",
) -> str:
    """Scaffold a loader module and register its entry point.

    Returns the entry-point key under which the loader was registered.
    """
    validate_identifier(name, "Loader name")

    root_dir, pkg_name, _ = pkg_root(root)
    base = resolve_base_pkg_dir(root_dir, pkg_name)
    package_name = base.name

    loaders_dir = ensure_pkg_dir(base, DIR_LOADERS)
    module_name = to_snake(name)
    module_file = loaders_dir / f"{module_name}.py"

    class_name = loader_class_name(name)
    rendered = render(loader_template_name(template), CLASS_NAME=class_name)
    write_if_missing(module_file, rendered, label="Loader")

    # Register the new class under the loaders entry-point group.
    ep_key = ep_key_from_name(name)
    pyproject = pyproject_path(root_dir)
    target = entrypoint_target(package_name, "loaders", module_name, class_name)
    updated_toml = inject_ep(pyproject.read_text(), LOADERS_GROUP, ep_key, target)
    pyproject.write_text(updated_toml)
    return ep_key
|