jerry-thomas 1.0.3__py3-none-any.whl → 2.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datapipeline/analysis/vector/collector.py +0 -1
- datapipeline/build/tasks/config.py +0 -2
- datapipeline/build/tasks/metadata.py +0 -2
- datapipeline/build/tasks/scaler.py +0 -2
- datapipeline/build/tasks/schema.py +0 -2
- datapipeline/build/tasks/utils.py +0 -2
- datapipeline/cli/app.py +201 -81
- datapipeline/cli/commands/contract.py +145 -283
- datapipeline/cli/commands/demo.py +13 -0
- datapipeline/cli/commands/domain.py +4 -4
- datapipeline/cli/commands/dto.py +11 -0
- datapipeline/cli/commands/filter.py +2 -2
- datapipeline/cli/commands/inspect.py +0 -68
- datapipeline/cli/commands/list_.py +30 -13
- datapipeline/cli/commands/loader.py +11 -0
- datapipeline/cli/commands/mapper.py +82 -0
- datapipeline/cli/commands/parser.py +45 -0
- datapipeline/cli/commands/run_config.py +1 -3
- datapipeline/cli/commands/serve_pipeline.py +5 -7
- datapipeline/cli/commands/source.py +106 -18
- datapipeline/cli/commands/stream.py +292 -0
- datapipeline/cli/visuals/common.py +0 -2
- datapipeline/cli/visuals/sections.py +0 -2
- datapipeline/cli/workspace_utils.py +0 -3
- datapipeline/config/context.py +0 -2
- datapipeline/config/dataset/feature.py +1 -0
- datapipeline/config/metadata.py +0 -2
- datapipeline/config/project.py +0 -2
- datapipeline/config/resolution.py +10 -2
- datapipeline/config/tasks.py +9 -9
- datapipeline/domain/feature.py +3 -0
- datapipeline/domain/record.py +7 -7
- datapipeline/domain/sample.py +0 -2
- datapipeline/domain/vector.py +6 -8
- datapipeline/integrations/ml/adapter.py +0 -2
- datapipeline/integrations/ml/pandas_support.py +0 -2
- datapipeline/integrations/ml/rows.py +0 -2
- datapipeline/integrations/ml/torch_support.py +0 -2
- datapipeline/io/output.py +0 -2
- datapipeline/io/serializers.py +26 -16
- datapipeline/mappers/synthetic/time.py +9 -2
- datapipeline/pipeline/artifacts.py +3 -5
- datapipeline/pipeline/observability.py +0 -2
- datapipeline/pipeline/pipelines.py +118 -34
- datapipeline/pipeline/stages.py +54 -18
- datapipeline/pipeline/utils/spool_cache.py +142 -0
- datapipeline/pipeline/utils/transform_utils.py +27 -2
- datapipeline/services/artifacts.py +1 -4
- datapipeline/services/constants.py +1 -0
- datapipeline/services/factories.py +4 -6
- datapipeline/services/paths.py +10 -1
- datapipeline/services/project_paths.py +0 -2
- datapipeline/services/runs.py +0 -2
- datapipeline/services/scaffold/contract_yaml.py +76 -0
- datapipeline/services/scaffold/demo.py +141 -0
- datapipeline/services/scaffold/discovery.py +115 -0
- datapipeline/services/scaffold/domain.py +21 -13
- datapipeline/services/scaffold/dto.py +31 -0
- datapipeline/services/scaffold/filter.py +2 -1
- datapipeline/services/scaffold/layout.py +96 -0
- datapipeline/services/scaffold/loader.py +61 -0
- datapipeline/services/scaffold/mapper.py +116 -0
- datapipeline/services/scaffold/parser.py +56 -0
- datapipeline/services/scaffold/plugin.py +14 -2
- datapipeline/services/scaffold/source_yaml.py +91 -0
- datapipeline/services/scaffold/stream_plan.py +129 -0
- datapipeline/services/scaffold/utils.py +187 -0
- datapipeline/sources/data_loader.py +0 -2
- datapipeline/sources/decoders.py +49 -8
- datapipeline/sources/factory.py +9 -6
- datapipeline/sources/foreach.py +18 -3
- datapipeline/sources/synthetic/time/parser.py +1 -1
- datapipeline/sources/transports.py +10 -4
- datapipeline/templates/demo_skeleton/demo/contracts/equity.ohlcv.yaml +33 -0
- datapipeline/templates/demo_skeleton/demo/contracts/time.ticks.hour_sin.yaml +22 -0
- datapipeline/templates/demo_skeleton/demo/contracts/time.ticks.linear.yaml +22 -0
- datapipeline/templates/demo_skeleton/demo/data/APPL.jsonl +19 -0
- datapipeline/templates/demo_skeleton/demo/data/MSFT.jsonl +19 -0
- datapipeline/templates/demo_skeleton/demo/dataset.yaml +19 -0
- datapipeline/templates/demo_skeleton/demo/postprocess.yaml +19 -0
- datapipeline/templates/demo_skeleton/demo/project.yaml +19 -0
- datapipeline/templates/demo_skeleton/demo/sources/sandbox.ohlcv.yaml +17 -0
- datapipeline/templates/{plugin_skeleton/example → demo_skeleton/demo}/sources/synthetic.ticks.yaml +1 -1
- datapipeline/templates/demo_skeleton/demo/tasks/metadata.yaml +2 -0
- datapipeline/templates/demo_skeleton/demo/tasks/scaler.yaml +3 -0
- datapipeline/templates/demo_skeleton/demo/tasks/schema.yaml +2 -0
- datapipeline/templates/demo_skeleton/demo/tasks/serve.test.yaml +4 -0
- datapipeline/templates/demo_skeleton/demo/tasks/serve.train.yaml +4 -0
- datapipeline/templates/demo_skeleton/demo/tasks/serve.val.yaml +4 -0
- datapipeline/templates/demo_skeleton/scripts/run_dataframe.py +20 -0
- datapipeline/templates/demo_skeleton/scripts/run_torch.py +23 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/__init__.py +0 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/domains/equity/__init__.py +0 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/domains/equity/model.py +18 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/dtos/__init__.py +0 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/dtos/sandbox_ohlcv_dto.py +14 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/mappers/__init__.py +0 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/mappers/map_sandbox_ohlcv_dto_to_equity.py +26 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/parsers/__init__.py +0 -0
- datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/parsers/sandbox_ohlcv_dto_parser.py +46 -0
- datapipeline/templates/plugin_skeleton/README.md +57 -136
- datapipeline/templates/plugin_skeleton/jerry.yaml +12 -24
- datapipeline/templates/plugin_skeleton/reference/jerry.yaml +28 -0
- datapipeline/templates/plugin_skeleton/reference/reference/contracts/composed.reference.yaml +29 -0
- datapipeline/templates/plugin_skeleton/reference/reference/contracts/ingest.reference.yaml +31 -0
- datapipeline/templates/plugin_skeleton/reference/reference/contracts/overview.reference.yaml +34 -0
- datapipeline/templates/plugin_skeleton/reference/reference/dataset.yaml +29 -0
- datapipeline/templates/plugin_skeleton/reference/reference/postprocess.yaml +25 -0
- datapipeline/templates/plugin_skeleton/reference/reference/project.yaml +32 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/foreach.http.reference.yaml +24 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/foreach.reference.yaml +21 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/fs.reference.yaml +16 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/http.reference.yaml +17 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/overview.reference.yaml +18 -0
- datapipeline/templates/plugin_skeleton/reference/reference/sources/synthetic.reference.yaml +15 -0
- datapipeline/templates/plugin_skeleton/reference/reference/tasks/metadata.reference.yaml +11 -0
- datapipeline/templates/plugin_skeleton/reference/reference/tasks/scaler.reference.yaml +10 -0
- datapipeline/templates/plugin_skeleton/reference/reference/tasks/schema.reference.yaml +10 -0
- datapipeline/templates/plugin_skeleton/reference/reference/tasks/serve.reference.yaml +28 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/domains/__init__.py +2 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/dtos/__init__.py +0 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/loaders/__init__.py +0 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/mappers/__init__.py +1 -0
- datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/parsers/__init__.py +0 -0
- datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +12 -11
- datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +4 -13
- datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +9 -11
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +1 -2
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +1 -7
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +1 -1
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +1 -1
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +1 -25
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +1 -1
- datapipeline/templates/plugin_skeleton/your-interim-data-builder/dataset.yaml +9 -0
- datapipeline/templates/plugin_skeleton/your-interim-data-builder/postprocess.yaml +1 -0
- datapipeline/templates/plugin_skeleton/your-interim-data-builder/project.yaml +15 -0
- datapipeline/templates/plugin_skeleton/your-interim-data-builder/tasks/serve.all.yaml +8 -0
- datapipeline/templates/stubs/contracts/composed.yaml.j2 +10 -0
- datapipeline/templates/stubs/contracts/ingest.yaml.j2 +25 -0
- datapipeline/templates/stubs/dto.py.j2 +2 -2
- datapipeline/templates/stubs/filter.py.j2 +1 -1
- datapipeline/templates/stubs/loaders/basic.py.j2 +11 -0
- datapipeline/templates/stubs/mappers/composed.py.j2 +13 -0
- datapipeline/templates/stubs/mappers/ingest.py.j2 +20 -0
- datapipeline/templates/stubs/parser.py.j2 +5 -1
- datapipeline/templates/stubs/record.py.j2 +1 -1
- datapipeline/templates/stubs/source.yaml.j2 +1 -1
- datapipeline/transforms/debug/identity.py +34 -16
- datapipeline/transforms/debug/lint.py +14 -11
- datapipeline/transforms/feature/scaler.py +5 -12
- datapipeline/transforms/filter.py +73 -17
- datapipeline/transforms/interfaces.py +58 -0
- datapipeline/transforms/record/floor_time.py +10 -7
- datapipeline/transforms/record/lag.py +8 -10
- datapipeline/transforms/sequence.py +2 -3
- datapipeline/transforms/stream/dedupe.py +5 -7
- datapipeline/transforms/stream/ensure_ticks.py +39 -24
- datapipeline/transforms/stream/fill.py +34 -25
- datapipeline/transforms/stream/filter.py +25 -0
- datapipeline/transforms/stream/floor_time.py +16 -0
- datapipeline/transforms/stream/granularity.py +52 -30
- datapipeline/transforms/stream/lag.py +17 -0
- datapipeline/transforms/stream/rolling.py +72 -0
- datapipeline/transforms/utils.py +42 -10
- datapipeline/transforms/vector/drop/horizontal.py +0 -3
- datapipeline/transforms/vector/drop/orchestrator.py +0 -3
- datapipeline/transforms/vector/drop/vertical.py +0 -2
- datapipeline/transforms/vector/ensure_schema.py +0 -2
- datapipeline/utils/paths.py +0 -2
- datapipeline/utils/placeholders.py +0 -2
- datapipeline/utils/rich_compat.py +0 -3
- datapipeline/utils/window.py +0 -2
- jerry_thomas-2.0.1.dist-info/METADATA +269 -0
- jerry_thomas-2.0.1.dist-info/RECORD +264 -0
- {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.1.dist-info}/WHEEL +1 -1
- {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.1.dist-info}/entry_points.txt +7 -3
- datapipeline/services/scaffold/mappers.py +0 -55
- datapipeline/services/scaffold/source.py +0 -191
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +0 -31
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +0 -30
- datapipeline/templates/plugin_skeleton/example/dataset.yaml +0 -18
- datapipeline/templates/plugin_skeleton/example/postprocess.yaml +0 -29
- datapipeline/templates/plugin_skeleton/example/project.yaml +0 -23
- datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +0 -3
- datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +0 -9
- datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +0 -2
- datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +0 -4
- datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +0 -28
- datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +0 -4
- datapipeline/templates/stubs/mapper.py.j2 +0 -22
- jerry_thomas-1.0.3.dist-info/METADATA +0 -827
- jerry_thomas-1.0.3.dist-info/RECORD +0 -198
- {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.1.dist-info}/licenses/LICENSE +0 -0
- {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.1.dist-info}/top_level.txt +0 -0

datapipeline/services/scaffold/mapper.py
@@ -0,0 +1,116 @@
+from pathlib import Path
+from typing import Optional
+
+from datapipeline.services.paths import pkg_root, resolve_base_pkg_dir
+from datapipeline.services.scaffold.templates import render
+from datapipeline.services.scaffold.utils import (
+    ensure_pkg_dir,
+    ep_key_from_name,
+    info,
+    status,
+    to_snake,
+    validate_identifier,
+    write_if_missing,
+)
+from datapipeline.services.scaffold.layout import (
+    DIR_MAPPERS,
+    TPL_MAPPER_COMPOSED,
+    TPL_MAPPER_INGEST,
+    domain_record_class,
+    entrypoint_target,
+    pyproject_path,
+)
+from datapipeline.services.entrypoints import inject_ep
+from datapipeline.services.constants import MAPPERS_GROUP
+
+
+def create_mapper(
+    *,
+    name: str,
+    dto_class: str | None = None,
+    dto_module: str | None = None,
+    input_class: str | None = None,
+    input_module: str | None = None,
+    domain: str,
+    root: Optional[Path],
+) -> str:
+    validate_identifier(name, "Mapper name")
+
+    root_dir, pkg_name, _ = pkg_root(root)
+    base = resolve_base_pkg_dir(root_dir, pkg_name)
+    package_name = base.name
+
+    mappers_dir = ensure_pkg_dir(base, DIR_MAPPERS)
+    module_name = to_snake(name)
+    path = mappers_dir / f"{module_name}.py"
+
+    domain_module = f"{package_name}.domains.{domain}.model"
+    domain_record = domain_record_class(domain)
+
+    resolved_class = input_class or dto_class
+    resolved_module = input_module or dto_module
+    if not resolved_class or not resolved_module:
+        raise ValueError("Mapper input class/module is required")
+
+    write_if_missing(
+        path,
+        render(
+            TPL_MAPPER_INGEST,
+            FUNCTION_NAME=module_name,
+            INPUT_CLASS=resolved_class,
+            INPUT_IMPORT=resolved_module,
+            DOMAIN_MODULE=domain_module,
+            DOMAIN_RECORD=domain_record,
+        ),
+        label="Mapper",
+    )
+
+    ep_key = ep_key_from_name(name)
+    pyproject = pyproject_path(root_dir)
+    try:
+        toml_text = pyproject.read_text()
+        updated = inject_ep(
+            toml_text,
+            MAPPERS_GROUP,
+            ep_key,
+            entrypoint_target(package_name, "mappers", module_name, module_name),
+        )
+        if updated != toml_text:
+            pyproject.write_text(updated)
+            status("ok", f"Registered mapper entry point '{ep_key}'.")
+        else:
+            status("skip", f"Mapper entry point already registered: '{ep_key}'.")
+    except FileNotFoundError:
+        info("pyproject.toml not found; skipping entry point registration")
+    return ep_key
+
+
+def create_composed_mapper(
+    *,
+    domain: str,
+    stream_id: str,
+    root: Optional[Path],
+    mapper_path: str | None = None,
+) -> str:
+    root_dir, name, _ = pkg_root(root)
+    base = resolve_base_pkg_dir(root_dir, name)
+    map_pkg_dir = ensure_pkg_dir(base, DIR_MAPPERS)
+    mapper_file = map_pkg_dir / f"{domain}.py"
+    if not mapper_file.exists():
+        mapper_file.write_text(render(TPL_MAPPER_COMPOSED))
+        status("new", str(mapper_file))
+
+    ep_key = stream_id
+    package_name = base.name
+    default_target = f"{package_name}.mappers.{domain}:mapper"
+    ep_target = mapper_path if (mapper_path and ":" in mapper_path) else default_target
+    pyproj_path = pyproject_path(root_dir)
+    try:
+        toml_text = pyproj_path.read_text()
+        updated = inject_ep(toml_text, MAPPERS_GROUP, ep_key, ep_target)
+        if updated != toml_text:
+            pyproj_path.write_text(updated)
+            status("ok", f"Registered mapper entry point '{ep_key}' -> {ep_target}")
+    except FileNotFoundError:
+        info("pyproject.toml not found; skipping entry point registration")
+    return ep_key
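
Both helpers return the entry-point key they register, so callers can thread it into contract generation. A minimal usage sketch, assuming a plugin checkout with a pyproject.toml at its root (the my_plugin/WeatherDTO names are illustrative, not from this diff):

    from pathlib import Path
    from datapipeline.services.scaffold.mapper import create_mapper

    # Scaffolds src/my_plugin/mappers/map_weather_dto_to_weather.py from the
    # ingest stub and registers the entry point in pyproject.toml via inject_ep.
    ep_key = create_mapper(
        name="MapWeatherDtoToWeather",           # illustrative mapper name
        input_class="WeatherDTO",                # illustrative DTO class
        input_module="my_plugin.dtos.weather_dto",
        domain="weather",
        root=Path("."),
    )

Note the legacy dto_class/dto_module keywords are kept as fallbacks for the new input_class/input_module pair; passing neither raises ValueError.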

datapipeline/services/scaffold/parser.py
@@ -0,0 +1,56 @@
+from pathlib import Path
+from typing import Optional
+
+from datapipeline.services.scaffold.templates import render
+from datapipeline.services.paths import pkg_root, resolve_base_pkg_dir
+from datapipeline.services.scaffold.utils import (
+    ensure_pkg_dir,
+    ep_key_from_name,
+    to_snake,
+    validate_identifier,
+    write_if_missing,
+)
+from datapipeline.services.scaffold.layout import DIR_PARSERS, TPL_PARSER
+from datapipeline.services.scaffold.layout import entrypoint_target, pyproject_path
+from datapipeline.services.entrypoints import inject_ep
+from datapipeline.services.constants import PARSERS_GROUP
+
+
+def create_parser(
+    *,
+    name: str,
+    dto_class: str,
+    dto_module: str,
+    root: Optional[Path],
+) -> str:
+    validate_identifier(name, "Parser name")
+
+    root_dir, pkg_name, _ = pkg_root(root)
+    base = resolve_base_pkg_dir(root_dir, pkg_name)
+    package_name = base.name
+
+    parsers_dir = ensure_pkg_dir(base, DIR_PARSERS)
+    module_name = to_snake(name)
+    path = parsers_dir / f"{module_name}.py"
+
+    write_if_missing(
+        path,
+        render(
+            TPL_PARSER,
+            CLASS_NAME=name,
+            DTO_CLASS=dto_class,
+            DTO_IMPORT=dto_module,
+        ),
+        label="Parser",
+    )
+
+    ep_key = ep_key_from_name(name)
+    pyproject = pyproject_path(root_dir)
+    toml = inject_ep(
+        pyproject.read_text(),
+        PARSERS_GROUP,
+        ep_key,
+        entrypoint_target(package_name, "parsers", module_name, name),
+    )
+    pyproject.write_text(toml)
+    return ep_key
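
Unlike create_mapper, create_parser reads pyproject.toml unguarded, so running it outside a package root surfaces as FileNotFoundError rather than a skipped registration. A usage sketch under the same illustrative names as above:

    from pathlib import Path
    from datapipeline.services.scaffold.parser import create_parser

    # Renders the parser stub into src/my_plugin/parsers/weather_dto_parser.py
    # and registers it under PARSERS_GROUP; all names here are illustrative.
    ep_key = create_parser(
        name="WeatherDtoParser",
        dto_class="WeatherDTO",
        dto_module="my_plugin.dtos.weather_dto",
        root=Path("."),
    )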

datapipeline/services/scaffold/plugin.py
@@ -2,6 +2,7 @@ from importlib.resources import as_file, files
 from pathlib import Path
 import logging
 import os
+import sys
 
 import yaml
 
@@ -11,7 +12,8 @@ from ..constants import DEFAULT_IO_LOADER_EP
 
 logger = logging.getLogger(__name__)
 
-_RESERVED_PACKAGE_NAMES = {"datapipeline"}
+_RESERVED_PACKAGE_NAMES = {"datapipeline", "test", "tests"}
+_STDLIB_MODULE_NAMES = getattr(sys, "stdlib_module_names", set())
 
 
 def _normalized_package_name(dist_name: str) -> str:
@@ -21,6 +23,12 @@ def _normalized_package_name(dist_name: str) -> str:
             "`datapipeline` is reserved for the core package. Choose a different plugin name."
         )
         raise SystemExit(1)
+    if package_name in _STDLIB_MODULE_NAMES:
+        logger.error(
+            "Plugin name '%s' conflicts with a Python standard library module. Choose a different name.",
+            package_name,
+        )
+        raise SystemExit(1)
     if not package_name.isidentifier():
         logger.error(
             "Plugin names must be valid Python identifiers once hyphens are replaced with underscores."
@@ -47,7 +55,11 @@ def scaffold_plugin(name: str, outdir: Path) -> None:
         "{{DIST_NAME}}": name,
         "{{DEFAULT_IO_LOADER_EP}}": DEFAULT_IO_LOADER_EP,
     }
-    for p in
+    for p in target.rglob("*"):
+        if not p.is_file():
+            continue
+        if p.suffix not in {".py", ".toml", ".md", ".yaml", ".yml"}:
+            continue
         text = p.read_text()
         for placeholder, value in replacements.items():
             text = text.replace(placeholder, value)
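
The new stdlib guard depends on sys.stdlib_module_names, which exists only on Python 3.10+; the getattr fallback makes the check a no-op on older interpreters. The check in isolation:

    import sys

    def conflicts_with_stdlib(package_name: str) -> bool:
        # Frozenset of top-level stdlib module names on Python 3.10+,
        # empty set (guard disabled) on older interpreters.
        stdlib_names = getattr(sys, "stdlib_module_names", set())
        return package_name in stdlib_names

    assert conflicts_with_stdlib("json")           # True on 3.10+
    assert not conflicts_with_stdlib("my_plugin")  # illustrative plugin name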

datapipeline/services/scaffold/source_yaml.py
@@ -0,0 +1,91 @@
+from pathlib import Path
+from typing import Optional
+
+from datapipeline.services.paths import pkg_root
+from datapipeline.services.project_paths import (
+    sources_dir as resolve_sources_dir,
+    ensure_project_scaffold,
+    resolve_project_yaml_path,
+)
+from datapipeline.services.scaffold.templates import render
+from datapipeline.services.constants import (
+    DEFAULT_IO_LOADER_EP,
+    DEFAULT_SYNTHETIC_LOADER_EP,
+)
+from datapipeline.services.scaffold.utils import status
+
+
+def _loader_args(transport: str, fmt: Optional[str]) -> dict:
+    if transport == "fs":
+        args = {
+            "transport": "fs",
+            "format": fmt or "<FORMAT (csv|json|json-lines|pickle)>",
+            "path": "<PATH OR GLOB>",
+            "glob": False,
+            "encoding": "utf-8",
+        }
+        if fmt == "csv":
+            args["delimiter"] = ","
+        return args
+    if transport == "http":
+        args = {
+            "transport": "http",
+            "format": fmt or "<FORMAT (json|json-lines|csv)>",
+            "url": "<https://api.example.com/data.json>",
+            "headers": {},
+            "params": {},
+            "encoding": "utf-8",
+        }
+        if fmt == "csv":
+            args["delimiter"] = ","
+        return args
+    if transport == "synthetic":
+        return {"start": "<ISO8601>", "end": "<ISO8601>", "frequency": "1h"}
+    return {}
+
+
+def create_source_yaml(
+    *,
+    provider: str,
+    dataset: str,
+    loader_ep: str,
+    loader_args: dict,
+    parser_ep: str,
+    parser_args: dict | None = None,
+    root: Optional[Path],
+    project_yaml: Optional[Path] = None,
+) -> None:
+    root_dir, _, _ = pkg_root(root)
+    alias = f"{provider}.{dataset}"
+    parser_args = parser_args or {}
+
+    proj_yaml = project_yaml.resolve() if project_yaml is not None else resolve_project_yaml_path(root_dir)
+    ensure_project_scaffold(proj_yaml)
+    sources_dir = resolve_sources_dir(proj_yaml).resolve()
+    sources_dir.mkdir(parents=True, exist_ok=True)
+
+    src_cfg_path = sources_dir / f"{alias}.yaml"
+    if src_cfg_path.exists():
+        status("skip", f"Source YAML already exists: {src_cfg_path.resolve()}")
+        return
+
+    src_cfg_path.write_text(
+        render(
+            "source.yaml.j2",
+            id=alias,
+            parser_ep=parser_ep,
+            parser_args=parser_args,
+            loader_ep=loader_ep,
+            loader_args=loader_args,
+            default_io_loader_ep=DEFAULT_IO_LOADER_EP,
+        )
+    )
+    status("new", str(src_cfg_path.resolve()))
+
+
+def default_loader_config(transport: str, fmt: Optional[str]) -> tuple[str, dict]:
+    if transport in {"fs", "http"}:
+        return DEFAULT_IO_LOADER_EP, _loader_args(transport, fmt)
+    if transport == "synthetic":
+        return DEFAULT_SYNTHETIC_LOADER_EP, _loader_args(transport, fmt)
+    return DEFAULT_IO_LOADER_EP, {}
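
default_loader_config pairs a loader entry point with placeholder args that the generated source YAML surfaces for the user to fill in. The expected output can be read straight off _loader_args:

    from datapipeline.services.scaffold.source_yaml import default_loader_config

    ep, args = default_loader_config("fs", "csv")
    # ep   == DEFAULT_IO_LOADER_EP
    # args == {"transport": "fs", "format": "csv", "path": "<PATH OR GLOB>",
    #          "glob": False, "encoding": "utf-8", "delimiter": ","}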

datapipeline/services/scaffold/stream_plan.py
@@ -0,0 +1,129 @@
+from dataclasses import dataclass
+from pathlib import Path
+
+from datapipeline.services.scaffold.domain import create_domain
+from datapipeline.services.scaffold.dto import create_dto
+from datapipeline.services.scaffold.parser import create_parser
+from datapipeline.services.scaffold.mapper import create_mapper
+from datapipeline.services.scaffold.source_yaml import create_source_yaml
+from datapipeline.services.scaffold.contract_yaml import write_ingest_contract
+from datapipeline.services.scaffold.discovery import list_dtos
+from datapipeline.services.paths import pkg_root
+from datapipeline.services.scaffold.utils import error_exit, status
+
+
+@dataclass
+class ParserPlan:
+    create: bool
+    create_dto: bool = False
+    dto_class: str | None = None
+    dto_module: str | None = None
+    parser_name: str | None = None
+    parser_ep: str | None = None
+
+
+@dataclass
+class MapperPlan:
+    create: bool
+    create_dto: bool = False
+    input_class: str | None = None
+    input_module: str | None = None
+    mapper_name: str | None = None
+    mapper_ep: str | None = None
+    domain: str | None = None
+
+
+@dataclass
+class StreamPlan:
+    provider: str
+    dataset: str
+    source_id: str
+    project_yaml: Path
+    stream_id: str
+    root: Path | None
+    create_source: bool
+    loader_ep: str | None = None
+    loader_args: dict | None = None
+    parser: ParserPlan | None = None
+    mapper: MapperPlan | None = None
+    domain: str | None = None
+    create_domain: bool = False
+
+
+def execute_stream_plan(plan: StreamPlan) -> None:
+    pyproject_path = None
+    before_pyproject = None
+    try:
+        root_dir, _, pyproject = pkg_root(plan.root)
+        pyproject_path = pyproject
+        if pyproject_path.exists():
+            before_pyproject = pyproject_path.read_text()
+    except SystemExit:
+        pyproject_path = None
+        before_pyproject = None
+
+    if plan.create_domain and plan.domain:
+        create_domain(domain=plan.domain, root=plan.root)
+
+    parser_ep = None
+    if plan.parser:
+        if plan.parser.create:
+            if plan.parser.dto_class and plan.parser.create_dto:
+                create_dto(name=plan.parser.dto_class, root=plan.root)
+            dto_module = plan.parser.dto_module or list_dtos(root=plan.root).get(plan.parser.dto_class or "")
+            if not dto_module:
+                error_exit("Failed to resolve DTO module.")
+            parser_ep = create_parser(
+                name=plan.parser.parser_name or "parser",
+                dto_class=plan.parser.dto_class or "DTO",
+                dto_module=dto_module,
+                root=plan.root,
+            )
+        else:
+            parser_ep = plan.parser.parser_ep
+
+    mapper_ep = None
+    if plan.mapper:
+        if plan.mapper.create:
+            if plan.mapper.input_class and plan.mapper.create_dto:
+                create_dto(name=plan.mapper.input_class, root=plan.root)
+            input_module = plan.mapper.input_module
+            if not input_module and plan.mapper.input_class:
+                input_module = list_dtos(root=plan.root).get(plan.mapper.input_class)
+            if not input_module:
+                error_exit("Failed to resolve mapper input module.")
+            mapper_ep = create_mapper(
+                name=plan.mapper.mapper_name or "mapper",
+                input_class=plan.mapper.input_class or "Record",
+                input_module=input_module,
+                domain=plan.mapper.domain or plan.domain or "domain",
+                root=plan.root,
+            )
+        else:
+            mapper_ep = plan.mapper.mapper_ep
+
+    if plan.create_source and plan.loader_ep and plan.loader_args is not None:
+        create_source_yaml(
+            provider=plan.provider,
+            dataset=plan.dataset,
+            loader_ep=plan.loader_ep,
+            loader_args=plan.loader_args,
+            parser_ep=parser_ep or "identity",
+            root=plan.root,
+            project_yaml=plan.project_yaml,
+        )
+
+    write_ingest_contract(
+        project_yaml=plan.project_yaml,
+        stream_id=plan.stream_id,
+        source=plan.source_id,
+        mapper_entrypoint=mapper_ep or "identity",
+    )
+    status("ok", "Stream created.")
+    if pyproject_path and before_pyproject is not None:
+        after_pyproject = pyproject_path.read_text()
+        if after_pyproject != before_pyproject:
+            status(
+                "note",
+                f"Entry points updated; reinstall plugin: pip install -e {pyproject_path.parent}",
+            )
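
execute_stream_plan is the orchestration entry point that the new interactive stream command presumably drives, but it can also be called directly. A hedged sketch with illustrative field values (borrowing the demo skeleton's sandbox.ohlcv/equity naming, not code from this file):

    from pathlib import Path
    from datapipeline.services.scaffold.stream_plan import (
        MapperPlan, ParserPlan, StreamPlan, execute_stream_plan,
    )
    from datapipeline.services.scaffold.source_yaml import default_loader_config

    loader_ep, loader_args = default_loader_config("fs", "json-lines")
    plan = StreamPlan(
        provider="sandbox",                      # illustrative values throughout
        dataset="ohlcv",
        source_id="sandbox.ohlcv",
        project_yaml=Path("demo/project.yaml"),
        stream_id="equity.ohlcv",
        root=Path("."),
        create_source=True,
        loader_ep=loader_ep,
        loader_args=loader_args,
        parser=ParserPlan(create=False, parser_ep="identity"),
        mapper=MapperPlan(create=False, mapper_ep="identity"),
        domain="equity",
    )
    execute_stream_plan(plan)  # writes the source YAML and ingest contract

Because the pyproject snapshot is taken before any scaffolding runs, the trailing reinstall hint fires only when entry points actually changed.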

datapipeline/services/scaffold/utils.py
@@ -0,0 +1,187 @@
+from pathlib import Path
+import sys
+import logging
+
+from datapipeline.services.scaffold.layout import to_snake, slugify, ep_key_from_name
+
+_LOGGER = logging.getLogger("datapipeline.cli")
+
+
+def ensure_pkg_dir(base: Path, name: str) -> Path:
+    path = base / name
+    path.mkdir(parents=True, exist_ok=True)
+    (path / "__init__.py").touch(exist_ok=True)
+    return path
+
+
+__all__ = [
+    "ensure_pkg_dir",
+    "to_snake",
+    "slugify",
+    "ep_key_from_name",
+    "validate_identifier",
+    "write_if_missing",
+    "prompt_required",
+    "prompt_optional",
+    "choose_name",
+    "status",
+    "info",
+    "error_exit",
+    "pick_from_list",
+    "pick_from_menu",
+    "choose_existing_or_create",
+]
+
+
+def validate_identifier(name: str, label: str) -> None:
+    if not name or not name.isidentifier():
+        error_exit(f"{label} must be a valid Python identifier")
+
+
+def write_if_missing(path: Path, text: str, *, label: str | None = None) -> bool:
+    if path.exists():
+        status("skip", f"{label or 'File'} already exists: {path}")
+        return False
+    path.write_text(text)
+    status("new", str(path))
+    return True
+
+
+def prompt_required(prompt: str) -> str:
+    value = input(f"{prompt}: ").strip()
+    if not value:
+        error_exit(f"{prompt} is required")
+    return value
+
+
+def prompt_optional(prompt: str) -> str | None:
+    value = input(f"{prompt}: ").strip()
+    return value or None
+
+
+def choose_name(label: str, *, default: str | None = None) -> str:
+    if not default:
+        return prompt_required(label)
+    info(f"{label}:")
+    info(f" [1] {default} (default)")
+    info(" [2] Custom name")
+    while True:
+        sel = input("> ").strip()
+        if sel == "":
+            return default
+        if sel == "1":
+            return default
+        if sel == "2":
+            return prompt_required(label)
+        info("Please enter a number from the list.")
+
+
+def info(message: str) -> None:
+    _LOGGER.info(message)
+
+
+def status(tag: str, message: str) -> None:
+    _LOGGER.info("[%s] %s", tag, message)
+
+
+def error_exit(message: str, code: int = 2) -> None:
+    _LOGGER.error(message)
+    raise SystemExit(code)
+
+
+def pick_from_list(prompt: str, options: list[str]) -> str:
+    info(prompt)
+    for i, opt in enumerate(options, 1):
+        info(f" [{i}] {opt}")
+    while True:
+        sel = input("> ").strip()
+        if sel.isdigit():
+            idx = int(sel)
+            if 1 <= idx <= len(options):
+                return options[idx - 1]
+        info("Please enter a number from the list.")
+
+
+def pick_from_menu(prompt: str, options: list[tuple[str, str]], *, allow_default: bool = True) -> str:
+    info(prompt)
+    for i, (_, label) in enumerate(options, 1):
+        info(f" [{i}] {label}")
+    while True:
+        sel = input("> ").strip()
+        if sel == "" and allow_default:
+            return options[0][0]
+        if sel.isdigit():
+            idx = int(sel)
+            if 1 <= idx <= len(options):
+                return options[idx - 1][0]
+        info("Please enter a number from the list.")
+
+
+def pick_multiple_from_list(prompt: str, options: list[str]) -> list[str]:
+    info(prompt)
+    for i, opt in enumerate(options, 1):
+        info(f" [{i}] {opt}")
+    sel = input("> ").strip()
+    try:
+        idxs = [int(x) for x in sel.split(",") if x.strip()]
+    except ValueError:
+        error_exit("Invalid selection.")
+    picked = [options[i - 1] for i in idxs if 1 <= i <= len(options)]
+    if not picked:
+        error_exit("No inputs selected.")
+    return picked
+
+
+def choose_existing_or_create(
+    *,
+    label: str,
+    existing: list[str],
+    create_label: str,
+    create_fn,
+    prompt_new: str,
+    root: Path | None,
+    default_new: str | None = None,
+) -> str:
+    info(f"{label}:")
+    info(f" [1] {create_label} (default)")
+    info(f" [2] Select existing {label}")
+    while True:
+        sel = input("> ").strip()
+        if sel == "":
+            sel = "1"
+        if sel == "1":
+            name = choose_name(prompt_new, default=default_new)
+            create_fn(name=name, root=root)
+            return name
+        if sel == "2":
+            if not existing:
+                error_exit(f"No existing {label} found.")
+            return pick_from_list(f"Select {label}:", existing)
+        info("Please enter a number from the list.")
+
+
+def choose_existing_or_create_name(
+    *,
+    label: str,
+    existing: list[str],
+    create_label: str,
+    prompt_new: str,
+    default_new: str | None = None,
+) -> tuple[str, bool]:
+    """Return (name, created) without side effects."""
+    info(f"{label}:")
+    info(f" [1] {create_label} (default)")
+    info(f" [2] Select existing {label}")
+    while True:
+        sel = input("> ").strip()
+        if sel == "":
+            sel = "1"
+        if sel == "1":
+            name = choose_name(prompt_new, default=default_new)
+            return name, True
+        if sel == "2":
+            if not existing:
+                error_exit(f"No existing {label} found.")
+            name = pick_from_list(f"Select {label}:", existing)
+            return name, False
+        info("Please enter a number from the list.")
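
Most of these helpers are thin prompt-and-logging wrappers; write_if_missing is the one with filesystem semantics worth pinning down. A quick sketch (path illustrative; the parent directory must already exist, which ensure_pkg_dir normally guarantees):

    from pathlib import Path
    from datapipeline.services.scaffold.utils import write_if_missing

    target = Path("src/my_plugin/mappers/example.py")   # illustrative path
    target.parent.mkdir(parents=True, exist_ok=True)
    # First call logs "[new] ..." and returns True; a repeat call logs
    # "[skip] Mapper already exists: ..." and returns False.
    write_if_missing(target, "def mapper(r):\n    return r\n", label="Mapper")
    write_if_missing(target, "ignored", label="Mapper")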