jerry-thomas 1.0.3__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (192)
  1. datapipeline/analysis/vector/collector.py +0 -1
  2. datapipeline/build/tasks/config.py +0 -2
  3. datapipeline/build/tasks/metadata.py +0 -2
  4. datapipeline/build/tasks/scaler.py +0 -2
  5. datapipeline/build/tasks/schema.py +0 -2
  6. datapipeline/build/tasks/utils.py +0 -2
  7. datapipeline/cli/app.py +201 -81
  8. datapipeline/cli/commands/contract.py +145 -283
  9. datapipeline/cli/commands/demo.py +13 -0
  10. datapipeline/cli/commands/domain.py +4 -4
  11. datapipeline/cli/commands/dto.py +11 -0
  12. datapipeline/cli/commands/filter.py +2 -2
  13. datapipeline/cli/commands/inspect.py +0 -68
  14. datapipeline/cli/commands/list_.py +30 -13
  15. datapipeline/cli/commands/loader.py +11 -0
  16. datapipeline/cli/commands/mapper.py +82 -0
  17. datapipeline/cli/commands/parser.py +45 -0
  18. datapipeline/cli/commands/run_config.py +1 -3
  19. datapipeline/cli/commands/serve_pipeline.py +5 -7
  20. datapipeline/cli/commands/source.py +106 -18
  21. datapipeline/cli/commands/stream.py +286 -0
  22. datapipeline/cli/visuals/common.py +0 -2
  23. datapipeline/cli/visuals/sections.py +0 -2
  24. datapipeline/cli/workspace_utils.py +0 -3
  25. datapipeline/config/context.py +0 -2
  26. datapipeline/config/dataset/feature.py +1 -0
  27. datapipeline/config/metadata.py +0 -2
  28. datapipeline/config/project.py +0 -2
  29. datapipeline/config/resolution.py +10 -2
  30. datapipeline/config/tasks.py +9 -9
  31. datapipeline/domain/feature.py +3 -0
  32. datapipeline/domain/record.py +7 -7
  33. datapipeline/domain/sample.py +0 -2
  34. datapipeline/domain/vector.py +6 -8
  35. datapipeline/integrations/ml/adapter.py +0 -2
  36. datapipeline/integrations/ml/pandas_support.py +0 -2
  37. datapipeline/integrations/ml/rows.py +0 -2
  38. datapipeline/integrations/ml/torch_support.py +0 -2
  39. datapipeline/io/output.py +0 -2
  40. datapipeline/io/serializers.py +26 -16
  41. datapipeline/mappers/synthetic/time.py +9 -2
  42. datapipeline/pipeline/artifacts.py +3 -5
  43. datapipeline/pipeline/observability.py +0 -2
  44. datapipeline/pipeline/pipelines.py +118 -34
  45. datapipeline/pipeline/stages.py +42 -17
  46. datapipeline/pipeline/utils/spool_cache.py +142 -0
  47. datapipeline/pipeline/utils/transform_utils.py +27 -2
  48. datapipeline/services/artifacts.py +1 -4
  49. datapipeline/services/constants.py +1 -0
  50. datapipeline/services/factories.py +4 -6
  51. datapipeline/services/project_paths.py +0 -2
  52. datapipeline/services/runs.py +0 -2
  53. datapipeline/services/scaffold/contract_yaml.py +76 -0
  54. datapipeline/services/scaffold/demo.py +141 -0
  55. datapipeline/services/scaffold/discovery.py +115 -0
  56. datapipeline/services/scaffold/domain.py +21 -13
  57. datapipeline/services/scaffold/dto.py +31 -0
  58. datapipeline/services/scaffold/filter.py +2 -1
  59. datapipeline/services/scaffold/layout.py +96 -0
  60. datapipeline/services/scaffold/loader.py +61 -0
  61. datapipeline/services/scaffold/mapper.py +116 -0
  62. datapipeline/services/scaffold/parser.py +56 -0
  63. datapipeline/services/scaffold/plugin.py +14 -2
  64. datapipeline/services/scaffold/source_yaml.py +91 -0
  65. datapipeline/services/scaffold/stream_plan.py +110 -0
  66. datapipeline/services/scaffold/utils.py +187 -0
  67. datapipeline/sources/data_loader.py +0 -2
  68. datapipeline/sources/decoders.py +49 -8
  69. datapipeline/sources/factory.py +9 -6
  70. datapipeline/sources/foreach.py +18 -3
  71. datapipeline/sources/synthetic/time/parser.py +1 -1
  72. datapipeline/sources/transports.py +10 -4
  73. datapipeline/templates/demo_skeleton/demo/contracts/equity.ohlcv.yaml +33 -0
  74. datapipeline/templates/demo_skeleton/demo/contracts/time.ticks.hour_sin.yaml +22 -0
  75. datapipeline/templates/demo_skeleton/demo/contracts/time.ticks.linear.yaml +22 -0
  76. datapipeline/templates/demo_skeleton/demo/data/APPL.jsonl +19 -0
  77. datapipeline/templates/demo_skeleton/demo/data/MSFT.jsonl +19 -0
  78. datapipeline/templates/demo_skeleton/demo/dataset.yaml +19 -0
  79. datapipeline/templates/demo_skeleton/demo/postprocess.yaml +19 -0
  80. datapipeline/templates/demo_skeleton/demo/project.yaml +19 -0
  81. datapipeline/templates/demo_skeleton/demo/sources/sandbox.ohlcv.yaml +17 -0
  82. datapipeline/templates/{plugin_skeleton/example → demo_skeleton/demo}/sources/synthetic.ticks.yaml +1 -1
  83. datapipeline/templates/demo_skeleton/demo/tasks/metadata.yaml +2 -0
  84. datapipeline/templates/demo_skeleton/demo/tasks/scaler.yaml +3 -0
  85. datapipeline/templates/demo_skeleton/demo/tasks/schema.yaml +2 -0
  86. datapipeline/templates/demo_skeleton/demo/tasks/serve.test.yaml +4 -0
  87. datapipeline/templates/demo_skeleton/demo/tasks/serve.train.yaml +4 -0
  88. datapipeline/templates/demo_skeleton/demo/tasks/serve.val.yaml +4 -0
  89. datapipeline/templates/demo_skeleton/scripts/run_dataframe.py +20 -0
  90. datapipeline/templates/demo_skeleton/scripts/run_torch.py +23 -0
  91. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/__init__.py +0 -0
  92. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/domains/equity/__init__.py +0 -0
  93. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/domains/equity/model.py +18 -0
  94. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/dtos/__init__.py +0 -0
  95. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/dtos/sandbox_ohlcv_dto.py +14 -0
  96. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/mappers/__init__.py +0 -0
  97. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/mappers/map_sandbox_ohlcv_dto_to_equity.py +26 -0
  98. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/parsers/__init__.py +0 -0
  99. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/parsers/sandbox_ohlcv_dto_parser.py +46 -0
  100. datapipeline/templates/plugin_skeleton/README.md +57 -136
  101. datapipeline/templates/plugin_skeleton/jerry.yaml +12 -24
  102. datapipeline/templates/plugin_skeleton/reference/jerry.yaml +28 -0
  103. datapipeline/templates/plugin_skeleton/reference/reference/contracts/composed.reference.yaml +29 -0
  104. datapipeline/templates/plugin_skeleton/reference/reference/contracts/ingest.reference.yaml +31 -0
  105. datapipeline/templates/plugin_skeleton/reference/reference/contracts/overview.reference.yaml +34 -0
  106. datapipeline/templates/plugin_skeleton/reference/reference/dataset.yaml +29 -0
  107. datapipeline/templates/plugin_skeleton/reference/reference/postprocess.yaml +25 -0
  108. datapipeline/templates/plugin_skeleton/reference/reference/project.yaml +32 -0
  109. datapipeline/templates/plugin_skeleton/reference/reference/sources/foreach.http.reference.yaml +24 -0
  110. datapipeline/templates/plugin_skeleton/reference/reference/sources/foreach.reference.yaml +21 -0
  111. datapipeline/templates/plugin_skeleton/reference/reference/sources/fs.reference.yaml +16 -0
  112. datapipeline/templates/plugin_skeleton/reference/reference/sources/http.reference.yaml +17 -0
  113. datapipeline/templates/plugin_skeleton/reference/reference/sources/overview.reference.yaml +18 -0
  114. datapipeline/templates/plugin_skeleton/reference/reference/sources/synthetic.reference.yaml +15 -0
  115. datapipeline/templates/plugin_skeleton/reference/reference/tasks/metadata.reference.yaml +11 -0
  116. datapipeline/templates/plugin_skeleton/reference/reference/tasks/scaler.reference.yaml +10 -0
  117. datapipeline/templates/plugin_skeleton/reference/reference/tasks/schema.reference.yaml +10 -0
  118. datapipeline/templates/plugin_skeleton/reference/reference/tasks/serve.reference.yaml +28 -0
  119. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/domains/__init__.py +2 -0
  120. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/dtos/__init__.py +0 -0
  121. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/loaders/__init__.py +0 -0
  122. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/mappers/__init__.py +1 -0
  123. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/parsers/__init__.py +0 -0
  124. datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +12 -11
  125. datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +4 -13
  126. datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +7 -10
  127. datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +1 -2
  128. datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +1 -7
  129. datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +1 -1
  130. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +1 -1
  131. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +1 -25
  132. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +1 -1
  133. datapipeline/templates/plugin_skeleton/your-interim-data-builder/dataset.yaml +9 -0
  134. datapipeline/templates/plugin_skeleton/your-interim-data-builder/postprocess.yaml +1 -0
  135. datapipeline/templates/plugin_skeleton/your-interim-data-builder/project.yaml +14 -0
  136. datapipeline/templates/plugin_skeleton/your-interim-data-builder/tasks/serve.all.yaml +8 -0
  137. datapipeline/templates/stubs/contracts/composed.yaml.j2 +10 -0
  138. datapipeline/templates/stubs/contracts/ingest.yaml.j2 +25 -0
  139. datapipeline/templates/stubs/dto.py.j2 +1 -1
  140. datapipeline/templates/stubs/loaders/basic.py.j2 +11 -0
  141. datapipeline/templates/stubs/mappers/composed.py.j2 +13 -0
  142. datapipeline/templates/stubs/mappers/ingest.py.j2 +17 -0
  143. datapipeline/templates/stubs/parser.py.j2 +4 -0
  144. datapipeline/templates/stubs/record.py.j2 +0 -1
  145. datapipeline/templates/stubs/source.yaml.j2 +1 -1
  146. datapipeline/transforms/debug/identity.py +34 -16
  147. datapipeline/transforms/debug/lint.py +14 -11
  148. datapipeline/transforms/feature/scaler.py +5 -12
  149. datapipeline/transforms/filter.py +73 -17
  150. datapipeline/transforms/interfaces.py +58 -0
  151. datapipeline/transforms/record/floor_time.py +10 -7
  152. datapipeline/transforms/record/lag.py +8 -10
  153. datapipeline/transforms/sequence.py +2 -3
  154. datapipeline/transforms/stream/dedupe.py +5 -7
  155. datapipeline/transforms/stream/ensure_ticks.py +39 -24
  156. datapipeline/transforms/stream/fill.py +34 -25
  157. datapipeline/transforms/stream/filter.py +25 -0
  158. datapipeline/transforms/stream/floor_time.py +16 -0
  159. datapipeline/transforms/stream/granularity.py +52 -30
  160. datapipeline/transforms/stream/lag.py +17 -0
  161. datapipeline/transforms/stream/rolling.py +72 -0
  162. datapipeline/transforms/utils.py +42 -10
  163. datapipeline/transforms/vector/drop/horizontal.py +0 -3
  164. datapipeline/transforms/vector/drop/orchestrator.py +0 -3
  165. datapipeline/transforms/vector/drop/vertical.py +0 -2
  166. datapipeline/transforms/vector/ensure_schema.py +0 -2
  167. datapipeline/utils/paths.py +0 -2
  168. datapipeline/utils/placeholders.py +0 -2
  169. datapipeline/utils/rich_compat.py +0 -3
  170. datapipeline/utils/window.py +0 -2
  171. jerry_thomas-2.0.0.dist-info/METADATA +282 -0
  172. jerry_thomas-2.0.0.dist-info/RECORD +264 -0
  173. {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.0.dist-info}/WHEEL +1 -1
  174. {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.0.dist-info}/entry_points.txt +7 -3
  175. datapipeline/services/scaffold/mappers.py +0 -55
  176. datapipeline/services/scaffold/source.py +0 -191
  177. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +0 -31
  178. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +0 -30
  179. datapipeline/templates/plugin_skeleton/example/dataset.yaml +0 -18
  180. datapipeline/templates/plugin_skeleton/example/postprocess.yaml +0 -29
  181. datapipeline/templates/plugin_skeleton/example/project.yaml +0 -23
  182. datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +0 -3
  183. datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +0 -9
  184. datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +0 -2
  185. datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +0 -4
  186. datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +0 -28
  187. datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +0 -4
  188. datapipeline/templates/stubs/mapper.py.j2 +0 -22
  189. jerry_thomas-1.0.3.dist-info/METADATA +0 -827
  190. jerry_thomas-1.0.3.dist-info/RECORD +0 -198
  191. {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.0.dist-info}/licenses/LICENSE +0 -0
  192. {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.0.dist-info}/top_level.txt +0 -0
datapipeline/services/scaffold/contract_yaml.py
@@ -0,0 +1,76 @@
+from pathlib import Path
+
+from datapipeline.services.project_paths import streams_dir as resolve_streams_dir, ensure_project_scaffold
+from datapipeline.services.scaffold.templates import render
+from datapipeline.services.scaffold.utils import status
+
+
+def write_ingest_contract(
+    *,
+    project_yaml: Path,
+    stream_id: str,
+    source: str,
+    mapper_entrypoint: str,
+) -> Path:
+    ensure_project_scaffold(project_yaml)
+    streams_path = resolve_streams_dir(project_yaml)
+    streams_dir = streams_path if streams_path.is_dir() else streams_path.parent
+    streams_dir.mkdir(parents=True, exist_ok=True)
+    cfile = streams_dir / f"{stream_id}.yaml"
+    cfile.write_text(
+        render(
+            "contracts/ingest.yaml.j2",
+            source=source,
+            stream_id=stream_id,
+            mapper_entrypoint=mapper_entrypoint,
+        ),
+        encoding="utf-8",
+    )
+    status("new", f"canonical spec: {cfile}")
+    return cfile
+
+
+def write_composed_contract(
+    *,
+    project_yaml: Path,
+    stream_id: str,
+    inputs_list: str,
+    mapper_entrypoint: str,
+    driver_key: str,
+) -> Path:
+    ensure_project_scaffold(project_yaml)
+    streams_path = resolve_streams_dir(project_yaml)
+    streams_dir = streams_path if streams_path.is_dir() else streams_path.parent
+    streams_dir.mkdir(parents=True, exist_ok=True)
+    cfile = streams_dir / f"{stream_id}.yaml"
+    cfile.write_text(
+        render(
+            "contracts/composed.yaml.j2",
+            stream_id=stream_id,
+            inputs_list=inputs_list,
+            mapper_entrypoint=mapper_entrypoint,
+            driver_key=driver_key,
+        ).strip() + "\n",
+        encoding="utf-8",
+    )
+    status("new", f"composed contract: {cfile}")
+    return cfile
+
+
+def compose_inputs(picked: list[str]) -> tuple[str, str]:
+    """Build composed inputs list text and driver key."""
+    built: list[str] = []
+    for ref in picked:
+        parts = ref.split(".")
+        if len(parts) >= 3:
+            domain, variant = parts[0], parts[-1]
+            alias = f"{domain}_{variant}"
+        elif len(parts) == 2:
+            alias = parts[0]
+        else:
+            alias = ref
+        built.append(f"{alias}={ref}")
+    inputs_csv = ",".join(built)
+    inputs_list = "\n - ".join(s.strip() for s in inputs_csv.split(",") if s.strip())
+    driver_key = inputs_csv.split(",")[0].split("=")[0].strip()
+    return inputs_list, driver_key
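
A quick trace of compose_inputs, using hypothetical stream ids (the exact whitespace inside the "\n - " join string may differ from what is shown above, since the diff viewer collapses runs of spaces):

    >>> compose_inputs(["equity.ohlcv.daily", "time.ticks"])
    ('equity_daily=equity.ohlcv.daily\n - time=time.ticks', 'equity_daily')

A three-part id keeps its first and last segments as the alias (equity_daily), a two-part id falls back to its first segment (time), and the first alias becomes the driver key.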
datapipeline/services/scaffold/demo.py
@@ -0,0 +1,141 @@
+from importlib.resources import as_file, files
+from pathlib import Path
+import logging
+import os
+import shutil
+
+import yaml
+
+from datapipeline.services.entrypoints import inject_ep
+from datapipeline.services.paths import pkg_root, resolve_base_pkg_dir
+from datapipeline.utils.load import load_yaml
+
+logger = logging.getLogger(__name__)
+
+_DEMO_DATASET_ALIAS = "demo"
+_DEMO_PARSER_EP = "sandbox_ohlcv_dto_parser"
+_DEMO_MAPPER_EP = "map_sandbox_ohlcv_dto_to_equity"
+
+
+def _replace_placeholders(path: Path, replacements: dict[str, str]) -> None:
+    if not path.is_file():
+        return
+    if path.suffix not in {".py", ".toml", ".md", ".yaml", ".yml"}:
+        return
+    text = path.read_text()
+    for placeholder, value in replacements.items():
+        text = text.replace(placeholder, value)
+    path.write_text(text)
+
+
+def _inject_demo_entrypoints(pyproject: Path, pkg_name: str) -> None:
+    toml = pyproject.read_text()
+    toml = inject_ep(
+        toml,
+        "parsers",
+        _DEMO_PARSER_EP,
+        f"{pkg_name}.parsers.sandbox_ohlcv_dto_parser:SandboxOhlcvDTOParser",
+    )
+    toml = inject_ep(
+        toml,
+        "mappers",
+        _DEMO_MAPPER_EP,
+        f"{pkg_name}.mappers.map_sandbox_ohlcv_dto_to_equity:map_sandbox_ohlcv_dto_to_equity",
+    )
+    pyproject.write_text(toml)
+
+
+def _update_workspace_jerry(
+    workspace_root: Path,
+    plugin_root_rel: Path,
+    dataset_path: Path,
+) -> None:
+    workspace_jerry = workspace_root / "jerry.yaml"
+    if not workspace_jerry.exists():
+        return
+    data = load_yaml(workspace_jerry)
+    datasets = data.get("datasets") or {}
+    demo_path = (plugin_root_rel / dataset_path).as_posix()
+    datasets[_DEMO_DATASET_ALIAS] = demo_path
+    # Drop skeleton placeholders that point into this demo plugin.
+    for key in ("your-dataset", "interim-builder"):
+        path = datasets.get(key)
+        if isinstance(path, str) and path.startswith(plugin_root_rel.as_posix()):
+            datasets.pop(key, None)
+    data["datasets"] = datasets
+    data["default_dataset"] = _DEMO_DATASET_ALIAS
+    workspace_jerry.write_text(
+        yaml.safe_dump(data, sort_keys=False), encoding="utf-8"
+    )
+
+
+def _copy_tree(src: Path, dest: Path) -> None:
+    if src.is_dir():
+        shutil.copytree(src, dest, dirs_exist_ok=True)
+    else:
+        dest.parent.mkdir(parents=True, exist_ok=True)
+        shutil.copy2(src, dest)
+
+
+def scaffold_demo(root: Path | None = None) -> None:
+    root_dir, pkg_name, pyproject = pkg_root(root)
+    demo_ref = files("datapipeline") / "templates" / "demo_skeleton"
+    with as_file(demo_ref) as demo_dir:
+        demo_data = demo_dir / "demo"
+        demo_pkg = demo_dir / "src" / "{{PACKAGE_NAME}}"
+
+        target_demo = root_dir / "demo"
+        if target_demo.exists():
+            logger.error("`%s` already exists", target_demo)
+            raise SystemExit(1)
+
+        _copy_tree(demo_data, target_demo)
+
+        dest_pkg = resolve_base_pkg_dir(root_dir, pkg_name)
+        for item in demo_pkg.iterdir():
+            _copy_tree(item, dest_pkg / item.name)
+
+    for cleanup in ("your-dataset", "your-interim-data-builder"):
+        extra = root_dir / cleanup
+        if extra.exists():
+            shutil.rmtree(extra)
+
+    replacements = {
+        "{{PACKAGE_NAME}}": pkg_name,
+    }
+    for p in target_demo.rglob("*"):
+        _replace_placeholders(p, replacements)
+    for p in dest_pkg.rglob("*"):
+        _replace_placeholders(p, replacements)
+
+    _inject_demo_entrypoints(pyproject, pkg_name)
+
+    workspace_root = Path.cwd().resolve()
+    try:
+        plugin_root_rel = root_dir.relative_to(workspace_root)
+    except ValueError:
+        plugin_root_rel = Path(os.path.relpath(root_dir, workspace_root))
+
+    _update_workspace_jerry(
+        workspace_root,
+        plugin_root_rel,
+        Path("demo/project.yaml"),
+    )
+
+    # Rewrite demo source YAMLs to use plugin-root-relative paths so
+    # workspace-level runs resolve sample data correctly.
+    sources_dir = target_demo / "sources"
+    for src_file in sources_dir.glob("*.yaml"):
+        src_data = load_yaml(src_file)
+        loader = (src_data or {}).get("loader") or {}
+        args = loader.get("args") or {}
+        path = args.get("path")
+        if isinstance(path, str) and path and not Path(path).is_absolute():
+            args["path"] = (plugin_root_rel / path).as_posix()
+        loader["args"] = args
+        src_data["loader"] = loader
+        src_file.write_text(
+            yaml.safe_dump(src_data, sort_keys=False), encoding="utf-8"
+        )
+
+    logger.info("demo dataset created at %s", target_demo)
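
For orientation, a sketch of the workspace jerry.yaml that _update_workspace_jerry leaves behind, assuming a plugin checked out at the hypothetical path plugins/my_plugin (any other keys already in the file are preserved by load_yaml/safe_dump):

    datasets:
      demo: plugins/my_plugin/demo/project.yaml
    default_dataset: demo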
datapipeline/services/scaffold/discovery.py
@@ -0,0 +1,115 @@
+from pathlib import Path
+import ast
+from typing import Optional
+
+from datapipeline.services.paths import pkg_root, resolve_base_pkg_dir
+from datapipeline.services.entrypoints import read_group_entries
+from datapipeline.services.constants import PARSERS_GROUP, LOADERS_GROUP, MAPPERS_GROUP
+from datapipeline.services.project_paths import sources_dir as resolve_sources_dir, streams_dir as resolve_streams_dir
+
+
+def list_dtos(*, root: Optional[Path] = None) -> dict[str, str]:
+    """Return mapping of DTO class name -> module path."""
+    root_dir, pkg_name, _ = pkg_root(root)
+    base = resolve_base_pkg_dir(root_dir, pkg_name)
+    dtos_dir = base / "dtos"
+    if not dtos_dir.exists():
+        return {}
+
+    package_name = base.name
+    found: dict[str, str] = {}
+    for path in sorted(dtos_dir.glob("*.py")):
+        if path.name == "__init__.py":
+            continue
+        try:
+            tree = ast.parse(path.read_text())
+        except Exception:
+            continue
+        module = f"{package_name}.dtos.{path.stem}"
+        for node in tree.body:
+            if isinstance(node, ast.ClassDef) and _is_dataclass(node):
+                found[node.name] = module
+    return found
+
+
+def _is_dataclass(node: ast.ClassDef) -> bool:
+    for deco in node.decorator_list:
+        if isinstance(deco, ast.Name) and deco.id == "dataclass":
+            return True
+        if isinstance(deco, ast.Attribute) and deco.attr == "dataclass":
+            return True
+    return False
+
+
+def list_parsers(*, root: Optional[Path] = None) -> dict[str, str]:
+    root_dir, _, pyproject = pkg_root(root)
+    if not pyproject.exists():
+        return {}
+    return read_group_entries(pyproject, PARSERS_GROUP)
+
+
+def list_loaders(*, root: Optional[Path] = None) -> dict[str, str]:
+    root_dir, _, pyproject = pkg_root(root)
+    if not pyproject.exists():
+        return {}
+    return read_group_entries(pyproject, LOADERS_GROUP)
+
+
+def list_mappers(*, root: Optional[Path] = None) -> dict[str, str]:
+    root_dir, _, pyproject = pkg_root(root)
+    if not pyproject.exists():
+        return {}
+    return read_group_entries(pyproject, MAPPERS_GROUP)
+
+
+def list_domains(*, root: Optional[Path] = None) -> list[str]:
+    root_dir, pkg_name, _ = pkg_root(root)
+    base = resolve_base_pkg_dir(root_dir, pkg_name)
+    dom_dir = base / "domains"
+    if not dom_dir.exists():
+        return []
+    return sorted(
+        p.name
+        for p in dom_dir.iterdir()
+        if p.is_dir() and (p / "model.py").exists()
+    )
+
+
+def list_sources(project_yaml: Path) -> list[str]:
+    from datapipeline.utils.load import load_yaml
+    from datapipeline.services.constants import PARSER_KEY, LOADER_KEY, SOURCE_ID_KEY
+
+    sources_dir = resolve_sources_dir(project_yaml)
+    if not sources_dir.exists():
+        return []
+    out: list[str] = []
+    for p in sorted(sources_dir.rglob("*.y*ml")):
+        try:
+            data = load_yaml(p)
+        except Exception:
+            continue
+        if isinstance(data, dict) and isinstance(data.get(PARSER_KEY), dict) and isinstance(data.get(LOADER_KEY), dict):
+            alias = data.get(SOURCE_ID_KEY)
+            if isinstance(alias, str):
+                out.append(alias)
+    return sorted(set(out))
+
+
+def list_streams(project_yaml: Path) -> list[str]:
+    from datapipeline.utils.load import load_yaml
+    from datapipeline.services.constants import STREAM_ID_KEY
+
+    streams_dir = resolve_streams_dir(project_yaml)
+    if not streams_dir.exists():
+        return []
+    out: list[str] = []
+    for p in sorted(streams_dir.rglob("*.y*ml")):
+        try:
+            data = load_yaml(p)
+        except Exception:
+            continue
+        if isinstance(data, dict) and data.get("kind") in {"ingest", "composed"}:
+            sid = data.get(STREAM_ID_KEY)
+            if isinstance(sid, str) and sid:
+                out.append(sid)
+    return sorted(set(out))
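
Run against the demo plugin scaffolded above, these helpers would report roughly the following (my_pkg stands in for the real package name, and list_dtos only finds the DTO if it is declared with a @dataclass decorator):

    >>> list_dtos()
    {'SandboxOhlcvDTO': 'my_pkg.dtos.sandbox_ohlcv_dto'}
    >>> list_domains()
    ['equity']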
datapipeline/services/scaffold/domain.py
@@ -2,25 +2,33 @@ from pathlib import Path
 from typing import Optional
 
 from datapipeline.services.scaffold.templates import render
+from datapipeline.services.scaffold.utils import (
+    ensure_pkg_dir,
+    to_snake,
+    validate_identifier,
+    write_if_missing,
+)
+from datapipeline.services.scaffold.layout import DIR_DOMAINS, TPL_DOMAIN_RECORD, domain_record_class
 
 from ..paths import pkg_root, resolve_base_pkg_dir
 
 
 def create_domain(*, domain: str, root: Optional[Path]) -> None:
+    validate_identifier(domain, "Domain name")
     root_dir, name, _ = pkg_root(root)
     base = resolve_base_pkg_dir(root_dir, name)
     package_name = base.name
-    pkg_dir = base / "domains" / domain
-    pkg_dir.mkdir(parents=True, exist_ok=True)
-    (pkg_dir / "__init__.py").touch(exist_ok=True)
-
-    def write_missing(path: Path, tpl: str, **ctx):
-        if not path.exists():
-            path.write_text(render(tpl, **ctx))
-            print(f"[new] {path}")
-
-    cls_ = "".join(w.capitalize() for w in domain.split("_"))
+    pkg_dir = ensure_pkg_dir(base / DIR_DOMAINS, domain)
     parent = "TemporalRecord"
-    write_missing(pkg_dir / "model.py", "record.py.j2",
-                  PACKAGE_NAME=package_name, DOMAIN=domain, CLASS_NAME=f"{cls_}Record",
-                  PARENT_CLASS=parent, time_aware=True)
+    write_if_missing(
+        pkg_dir / "model.py",
+        render(
+            TPL_DOMAIN_RECORD,
+            PACKAGE_NAME=package_name,
+            DOMAIN=domain,
+            CLASS_NAME=domain_record_class(domain),
+            PARENT_CLASS=parent,
+            time_aware=True,
+        ),
+        label="Domain",
+    )
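
The refactor is meant to preserve the old naming rule: domain_record_class should reproduce what the inlined cls_ expression did, assuming camel() capitalizes each underscore-separated word. A hypothetical call:

    >>> create_domain(domain="fx_rate", root=None)
    # writes src/<pkg>/domains/fx_rate/model.py defining
    # class FxRateRecord(TemporalRecord): ...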
datapipeline/services/scaffold/dto.py
@@ -0,0 +1,31 @@
+from pathlib import Path
+from typing import Optional
+
+from datapipeline.services.scaffold.templates import render
+from datapipeline.services.paths import pkg_root, resolve_base_pkg_dir
+from datapipeline.services.scaffold.utils import (
+    ensure_pkg_dir,
+    to_snake,
+    validate_identifier,
+    write_if_missing,
+)
+from datapipeline.services.scaffold.layout import DIR_DTOS, TPL_DTO
+
+
+def create_dto(*, name: str, root: Optional[Path]) -> None:
+    validate_identifier(name, "DTO name")
+
+    root_dir, pkg_name, _ = pkg_root(root)
+    base = resolve_base_pkg_dir(root_dir, pkg_name)
+    dtos_dir = ensure_pkg_dir(base, DIR_DTOS)
+    module_name = to_snake(name)
+    path = dtos_dir / f"{module_name}.py"
+    write_if_missing(
+        path,
+        render(
+            TPL_DTO,
+            CLASS_NAME=name,
+            DOMAIN=name,
+        ),
+        label="DTO",
+    )
datapipeline/services/scaffold/filter.py
@@ -1,6 +1,7 @@
 from pathlib import Path
 from typing import Optional
 from datapipeline.services.scaffold.templates import render
+from datapipeline.services.scaffold.utils import status
 from ..constants import FILTERS_GROUP
 from ..entrypoints import inject_ep
 from ..paths import pkg_root, resolve_base_pkg_dir
@@ -19,7 +20,7 @@ def create_filter(*, name: str, root: Optional[Path]) -> None:
     path = filters_dir / f"{module_name}.py"
     if not path.exists():
         path.write_text(render("filter.py.j2", FUNCTION_NAME=name))
-        print(f"[new] {path}")
+        status("new", str(path))
 
     # Register entry point under datapipeline.filters
     toml_path = root_dir / "pyproject.toml"
datapipeline/services/scaffold/layout.py
@@ -0,0 +1,96 @@
+from pathlib import Path
+import re
+
+from datapipeline.services.scaffold.templates import camel
+
+
+def to_snake(name: str) -> str:
+    s1 = re.sub(r"(.)([A-Z][a-z]+)", r"\1_\2", name)
+    return re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", s1).lower()
+
+
+def slugify(text: str) -> str:
+    return re.sub(r"[^a-z0-9]+", "_", text.strip().lower()).strip("_")
+
+
+def ep_key_from_name(name: str) -> str:
+    return to_snake(name)
+
+
+# Directory names
+DIR_DTOS = "dtos"
+DIR_PARSERS = "parsers"
+DIR_LOADERS = "loaders"
+DIR_MAPPERS = "mappers"
+DIR_DOMAINS = "domains"
+
+# Template paths
+TPL_DTO = "dto.py.j2"
+TPL_PARSER = "parser.py.j2"
+TPL_LOADER_BASIC = "loaders/basic.py.j2"
+TPL_LOADER_SYNTHETIC = "loader_synthetic.py.j2"
+TPL_MAPPER_INGEST = "mappers/ingest.py.j2"
+TPL_MAPPER_COMPOSED = "mappers/composed.py.j2"
+TPL_DOMAIN_RECORD = "record.py.j2"
+
+
+def class_name_with_suffix(name: str, suffix: str) -> str:
+    return f"{camel(name)}{suffix}"
+
+
+def loader_class_name(name: str) -> str:
+    return class_name_with_suffix(name, "Loader")
+
+
+def domain_record_class(domain: str) -> str:
+    return class_name_with_suffix(domain, "Record")
+
+
+def loader_template_name(template: str) -> str:
+    if template == "synthetic":
+        return TPL_LOADER_SYNTHETIC
+    return TPL_LOADER_BASIC
+
+
+def dto_class_name(base: str) -> str:
+    return class_name_with_suffix(base, "DTO")
+
+
+def dto_module_path(package: str, dto_class: str) -> str:
+    return f"{package}.{DIR_DTOS}.{to_snake(dto_class)}"
+
+
+def default_parser_name(dto_class: str) -> str:
+    return f"{dto_class}Parser"
+
+
+def default_mapper_name(input_module: str, domain: str) -> str:
+    input_mod = input_module.rsplit(".", 1)[-1]
+    return f"map_{input_mod}_to_{domain}"
+
+
+def default_stream_id(domain: str, dataset: str, variant: str | None = None) -> str:
+    base = f"{slugify(domain)}.{slugify(dataset)}"
+    return f"{base}.{slugify(variant)}" if variant else base
+
+
+# Prompt labels (keep CLI wording consistent)
+LABEL_DTO_FOR_PARSER = "DTO for parser"
+LABEL_DTO_FOR_MAPPER = "DTO for mapper"
+LABEL_DOMAIN_TO_MAP = "Domain"
+LABEL_MAPPER_INPUT = "Mapper input"
+
+
+def default_mapper_name_for_identity(domain: str) -> str:
+    return f"map_identity_to_{slugify(domain)}"
+
+def pyproject_path(root_dir: Path) -> Path:
+    return root_dir / "pyproject.toml"
+
+
+def module_path(package: str, group: str, module: str) -> str:
+    return f"{package}.{group}.{module}"
+
+
+def entrypoint_target(package: str, group: str, module: str, attr: str) -> str:
+    return f"{module_path(package, group, module)}:{attr}"
datapipeline/services/scaffold/loader.py
@@ -0,0 +1,61 @@
+from pathlib import Path
+from typing import Optional
+
+from datapipeline.services.paths import pkg_root, resolve_base_pkg_dir
+from datapipeline.services.scaffold.templates import render
+from datapipeline.services.scaffold.utils import (
+    ensure_pkg_dir,
+    ep_key_from_name,
+    to_snake,
+    validate_identifier,
+    write_if_missing,
+)
+from datapipeline.services.scaffold.layout import (
+    DIR_LOADERS,
+    entrypoint_target,
+    loader_class_name,
+    loader_template_name,
+    pyproject_path,
+)
+from datapipeline.services.entrypoints import inject_ep
+from datapipeline.services.constants import LOADERS_GROUP
+
+
+def create_loader(
+    *,
+    name: str,
+    root: Optional[Path],
+    template: str = "basic",
+) -> str:
+    validate_identifier(name, "Loader name")
+
+    root_dir, pkg_name, _ = pkg_root(root)
+    base = resolve_base_pkg_dir(root_dir, pkg_name)
+    package_name = base.name
+
+    loaders_dir = ensure_pkg_dir(base, DIR_LOADERS)
+    module_name = to_snake(name)
+    path = loaders_dir / f"{module_name}.py"
+
+    class_name = loader_class_name(name)
+    template_name = loader_template_name(template)
+
+    write_if_missing(
+        path,
+        render(
+            template_name,
+            CLASS_NAME=class_name,
+        ),
+        label="Loader",
+    )
+
+    ep_key = ep_key_from_name(name)
+    pyproject = pyproject_path(root_dir)
+    toml = inject_ep(
+        pyproject.read_text(),
+        LOADERS_GROUP,
+        ep_key,
+        entrypoint_target(package_name, "loaders", module_name, class_name),
+    )
+    pyproject.write_text(toml)
+    return ep_key
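
End to end, a hypothetical create_loader(name="CsvBatch", root=None) would write loaders/csv_batch.py with a CsvBatchLoader class rendered from loaders/basic.py.j2, register the class under LOADERS_GROUP in pyproject.toml, and return the entry-point key:

    >>> create_loader(name="CsvBatch", root=None)
    'csv_batch'
    # pyproject.toml now carries (under LOADERS_GROUP):
    # csv_batch = "<pkg>.loaders.csv_batch:CsvBatchLoader"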