jerry-thomas 0.3.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (164)
  1. datapipeline/analysis/vector/collector.py +120 -17
  2. datapipeline/analysis/vector/matrix.py +33 -8
  3. datapipeline/analysis/vector/report.py +162 -32
  4. datapipeline/build/tasks/__init__.py +11 -0
  5. datapipeline/build/tasks/config.py +74 -0
  6. datapipeline/build/tasks/metadata.py +170 -0
  7. datapipeline/build/tasks/scaler.py +73 -0
  8. datapipeline/build/tasks/schema.py +60 -0
  9. datapipeline/build/tasks/utils.py +169 -0
  10. datapipeline/cli/app.py +304 -127
  11. datapipeline/cli/commands/build.py +240 -16
  12. datapipeline/cli/commands/contract.py +367 -0
  13. datapipeline/cli/commands/domain.py +8 -3
  14. datapipeline/cli/commands/inspect.py +401 -149
  15. datapipeline/cli/commands/list_.py +30 -7
  16. datapipeline/cli/commands/plugin.py +5 -1
  17. datapipeline/cli/commands/run.py +227 -241
  18. datapipeline/cli/commands/run_config.py +101 -0
  19. datapipeline/cli/commands/serve_pipeline.py +156 -0
  20. datapipeline/cli/commands/source.py +44 -8
  21. datapipeline/cli/visuals/__init__.py +4 -2
  22. datapipeline/cli/visuals/common.py +239 -0
  23. datapipeline/cli/visuals/labels.py +15 -15
  24. datapipeline/cli/visuals/runner.py +66 -0
  25. datapipeline/cli/visuals/sections.py +20 -0
  26. datapipeline/cli/visuals/sources.py +132 -119
  27. datapipeline/cli/visuals/sources_basic.py +260 -0
  28. datapipeline/cli/visuals/sources_off.py +76 -0
  29. datapipeline/cli/visuals/sources_rich.py +414 -0
  30. datapipeline/config/catalog.py +37 -3
  31. datapipeline/config/context.py +214 -0
  32. datapipeline/config/dataset/loader.py +21 -4
  33. datapipeline/config/dataset/normalize.py +4 -4
  34. datapipeline/config/metadata.py +43 -0
  35. datapipeline/config/postprocess.py +2 -2
  36. datapipeline/config/project.py +3 -2
  37. datapipeline/config/resolution.py +129 -0
  38. datapipeline/config/tasks.py +309 -0
  39. datapipeline/config/workspace.py +155 -0
  40. datapipeline/domain/__init__.py +12 -0
  41. datapipeline/domain/record.py +11 -0
  42. datapipeline/domain/sample.py +54 -0
  43. datapipeline/integrations/ml/adapter.py +34 -20
  44. datapipeline/integrations/ml/pandas_support.py +0 -2
  45. datapipeline/integrations/ml/rows.py +1 -6
  46. datapipeline/integrations/ml/torch_support.py +1 -3
  47. datapipeline/io/factory.py +112 -0
  48. datapipeline/io/output.py +132 -0
  49. datapipeline/io/protocols.py +21 -0
  50. datapipeline/io/serializers.py +219 -0
  51. datapipeline/io/sinks/__init__.py +23 -0
  52. datapipeline/io/sinks/base.py +2 -0
  53. datapipeline/io/sinks/files.py +79 -0
  54. datapipeline/io/sinks/rich.py +57 -0
  55. datapipeline/io/sinks/stdout.py +18 -0
  56. datapipeline/io/writers/__init__.py +14 -0
  57. datapipeline/io/writers/base.py +28 -0
  58. datapipeline/io/writers/csv_writer.py +25 -0
  59. datapipeline/io/writers/jsonl.py +52 -0
  60. datapipeline/io/writers/pickle_writer.py +30 -0
  61. datapipeline/pipeline/artifacts.py +58 -0
  62. datapipeline/pipeline/context.py +66 -7
  63. datapipeline/pipeline/observability.py +65 -0
  64. datapipeline/pipeline/pipelines.py +65 -13
  65. datapipeline/pipeline/split.py +11 -10
  66. datapipeline/pipeline/stages.py +127 -16
  67. datapipeline/pipeline/utils/keygen.py +20 -7
  68. datapipeline/pipeline/utils/memory_sort.py +22 -10
  69. datapipeline/pipeline/utils/transform_utils.py +22 -0
  70. datapipeline/runtime.py +5 -2
  71. datapipeline/services/artifacts.py +12 -6
  72. datapipeline/services/bootstrap/config.py +25 -0
  73. datapipeline/services/bootstrap/core.py +52 -37
  74. datapipeline/services/constants.py +6 -5
  75. datapipeline/services/factories.py +123 -1
  76. datapipeline/services/project_paths.py +43 -16
  77. datapipeline/services/runs.py +208 -0
  78. datapipeline/services/scaffold/domain.py +3 -2
  79. datapipeline/services/scaffold/filter.py +3 -2
  80. datapipeline/services/scaffold/mappers.py +9 -6
  81. datapipeline/services/scaffold/plugin.py +54 -10
  82. datapipeline/services/scaffold/source.py +93 -56
  83. datapipeline/sources/{composed_loader.py → data_loader.py} +9 -9
  84. datapipeline/sources/decoders.py +83 -18
  85. datapipeline/sources/factory.py +26 -16
  86. datapipeline/sources/models/__init__.py +2 -2
  87. datapipeline/sources/models/generator.py +0 -7
  88. datapipeline/sources/models/loader.py +3 -3
  89. datapipeline/sources/models/parsing_error.py +24 -0
  90. datapipeline/sources/models/source.py +6 -6
  91. datapipeline/sources/synthetic/time/loader.py +14 -2
  92. datapipeline/sources/transports.py +74 -37
  93. datapipeline/templates/plugin_skeleton/README.md +76 -30
  94. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +31 -0
  95. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +30 -0
  96. datapipeline/templates/plugin_skeleton/example/dataset.yaml +18 -0
  97. datapipeline/templates/plugin_skeleton/example/postprocess.yaml +29 -0
  98. datapipeline/templates/plugin_skeleton/{config/datasets/default → example}/project.yaml +11 -8
  99. datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +12 -0
  100. datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +3 -0
  101. datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +9 -0
  102. datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +2 -0
  103. datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +4 -0
  104. datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +28 -0
  105. datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +4 -0
  106. datapipeline/templates/plugin_skeleton/jerry.yaml +34 -0
  107. datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +31 -0
  108. datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +30 -0
  109. datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +18 -0
  110. datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +29 -0
  111. datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +22 -0
  112. datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +12 -0
  113. datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +3 -0
  114. datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +9 -0
  115. datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +2 -0
  116. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +4 -0
  117. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +28 -0
  118. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +4 -0
  119. datapipeline/templates/stubs/dto.py.j2 +2 -0
  120. datapipeline/templates/stubs/mapper.py.j2 +5 -4
  121. datapipeline/templates/stubs/parser.py.j2 +2 -0
  122. datapipeline/templates/stubs/record.py.j2 +2 -0
  123. datapipeline/templates/stubs/source.yaml.j2 +2 -3
  124. datapipeline/transforms/debug/lint.py +26 -41
  125. datapipeline/transforms/feature/scaler.py +89 -13
  126. datapipeline/transforms/record/floor_time.py +4 -4
  127. datapipeline/transforms/sequence.py +2 -35
  128. datapipeline/transforms/stream/dedupe.py +24 -0
  129. datapipeline/transforms/stream/ensure_ticks.py +7 -6
  130. datapipeline/transforms/vector/__init__.py +5 -0
  131. datapipeline/transforms/vector/common.py +98 -0
  132. datapipeline/transforms/vector/drop/__init__.py +4 -0
  133. datapipeline/transforms/vector/drop/horizontal.py +79 -0
  134. datapipeline/transforms/vector/drop/orchestrator.py +59 -0
  135. datapipeline/transforms/vector/drop/vertical.py +182 -0
  136. datapipeline/transforms/vector/ensure_schema.py +184 -0
  137. datapipeline/transforms/vector/fill.py +87 -0
  138. datapipeline/transforms/vector/replace.py +62 -0
  139. datapipeline/utils/load.py +24 -3
  140. datapipeline/utils/rich_compat.py +38 -0
  141. datapipeline/utils/window.py +76 -0
  142. jerry_thomas-1.0.1.dist-info/METADATA +825 -0
  143. jerry_thomas-1.0.1.dist-info/RECORD +199 -0
  144. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/entry_points.txt +9 -8
  145. datapipeline/build/tasks.py +0 -186
  146. datapipeline/cli/commands/link.py +0 -128
  147. datapipeline/cli/commands/writers.py +0 -138
  148. datapipeline/config/build.py +0 -64
  149. datapipeline/config/run.py +0 -116
  150. datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.synthetic.yaml +0 -24
  151. datapipeline/templates/plugin_skeleton/config/contracts/time_linear.synthetic.yaml +0 -23
  152. datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +0 -9
  153. datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +0 -14
  154. datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +0 -13
  155. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +0 -10
  156. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +0 -10
  157. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +0 -10
  158. datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml +0 -11
  159. datapipeline/transforms/vector.py +0 -210
  160. jerry_thomas-0.3.0.dist-info/METADATA +0 -502
  161. jerry_thomas-0.3.0.dist-info/RECORD +0 -139
  162. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/WHEEL +0 -0
  163. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/licenses/LICENSE +0 -0
  164. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/top_level.txt +0 -0
datapipeline/services/runs.py
@@ -0,0 +1,208 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass, asdict
+ from datetime import datetime, timezone
+ from pathlib import Path
+ from typing import Any, Tuple
+
+ import json
+ import shutil
+
+
+ @dataclass(frozen=True)
+ class RunPaths:
+     """Resolved filesystem paths for a single run rooted at a serve directory.
+
+     The serve directory is typically the user-configured `directory` for the
+     filesystem transport (e.g. `data/processed/...`).
+
+     Layout:
+
+         serve_root/
+             runs/
+                 <run_id>/
+                     dataset/          # main output for this run
+                     run.json          # metadata for this run
+             latest/                   # symlink or copy pointing at the current live run
+             current_run.json          # pointer to the run currently marked as "latest"
+     """
+
+     serve_root: Path
+     runs_root: Path
+     run_id: str
+     run_root: Path
+     dataset_dir: Path
+     metadata_path: Path
+
+
+ @dataclass
+ class RunMetadata:
+     """Metadata describing a single run."""
+
+     run_id: str
+     started_at: str
+     finished_at: str | None = None
+     status: str | None = None  # e.g. "running", "success", "failed"
+     notes: str | None = None
+     stage: int | None = None
+
+
+ def _now_utc_iso() -> str:
+     return datetime.now(timezone.utc).isoformat()
+
+
+ def make_run_id() -> str:
+     """Create a filesystem-safe, sortable run identifier."""
+     return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H-%M-%SZ")
+
+
+ def get_serve_root(directory: str | Path) -> Path:
+     """Resolve the user-configured serve directory to an absolute path."""
+     return Path(directory).expanduser().resolve()
+
+
+ def get_run_paths(serve_root: Path, run_id: str | None = None) -> RunPaths:
+     """Build RunPaths for a run rooted at the given serve directory."""
+     if run_id is None:
+         run_id = make_run_id()
+
+     runs_root = serve_root / "runs"
+     run_root = runs_root / run_id
+     dataset_dir = run_root / "dataset"
+     metadata_path = run_root / "run.json"
+
+     return RunPaths(
+         serve_root=serve_root,
+         runs_root=runs_root,
+         run_id=run_id,
+         run_root=run_root,
+         dataset_dir=dataset_dir,
+         metadata_path=metadata_path,
+     )
+
+
+ def _write_run_metadata(meta: RunMetadata, path: Path) -> None:
+     path.parent.mkdir(parents=True, exist_ok=True)
+     with path.open("w", encoding="utf-8") as f:
+         json.dump(asdict(meta), f, indent=2, sort_keys=True)
+
+
+ def _load_run_metadata(path: Path) -> RunMetadata:
+     with path.open("r", encoding="utf-8") as f:
+         data: dict[str, Any] = json.load(f)
+     return RunMetadata(**data)
+
+
+ def start_run_for_directory(
+     directory: str | Path,
+     run_id: str | None = None,
+     *,
+     stage: int | None = None,
+ ) -> Tuple[RunPaths, RunMetadata]:
+     """Initialise a new run rooted at the given directory.
+
+     This will create the run's dataset directory and an initial metadata file
+     with status set to "running".
+     """
+     serve_root = get_serve_root(directory)
+     paths = get_run_paths(serve_root, run_id)
+
+     # Ensure the run directories exist
+     paths.dataset_dir.mkdir(parents=True, exist_ok=True)
+
+     meta = RunMetadata(
+         run_id=paths.run_id,
+         started_at=_now_utc_iso(),
+         finished_at=None,
+         status="running",
+         notes=None,
+         stage=stage,
+     )
+     _write_run_metadata(meta, paths.metadata_path)
+     return paths, meta
+
+
+ def finish_run(paths: RunPaths, status: str, notes: str | None = None) -> RunMetadata:
+     """Mark an existing run as finished with the given status."""
+     if paths.metadata_path.exists():
+         meta = _load_run_metadata(paths.metadata_path)
+     else:
+         # Fallback: create a minimal metadata record if none exists yet
+         meta = RunMetadata(
+             run_id=paths.run_id,
+             started_at=_now_utc_iso(),
+         )
+
+     meta.finished_at = _now_utc_iso()
+     meta.status = status
+     if notes is not None:
+         meta.notes = notes
+
+     _write_run_metadata(meta, paths.metadata_path)
+     return meta
+
+
+ def finish_run_success(paths: RunPaths, notes: str | None = None) -> RunMetadata:
+     """Convenience wrapper to mark a run as successful."""
+     return finish_run(paths, status="success", notes=notes)
+
+
+ def finish_run_failed(paths: RunPaths, notes: str | None = None) -> RunMetadata:
+     """Convenience wrapper to mark a run as failed."""
+     return finish_run(paths, status="failed", notes=notes)
+
+
+ def set_latest_run(paths: RunPaths) -> None:
+     """Mark the given run as the latest/live run for its serve directory.
+
+     This updates two things under the serve root:
+
+     * `latest/` – a symlink (or copied directory as a fallback) pointing to
+       this run's root directory, so consumers can read from
+       `<directory>/latest/dataset`.
+
+     * `current_run.json` – a small pointer file recording which run is
+       currently live and when this pointer was updated.
+     """
+     serve_root = paths.serve_root
+     latest_root = serve_root / "latest"
+
+     # Ensure serve_root exists so that the layout is predictable
+     serve_root.mkdir(parents=True, exist_ok=True)
+
+     # Remove any existing "latest" pointer
+     if latest_root.is_symlink() or latest_root.is_file():
+         latest_root.unlink()
+     elif latest_root.is_dir():
+         shutil.rmtree(latest_root)
+
+     # Prefer a symlink for efficiency; fall back to copying if symlinks fail
+     try:
+         latest_root.symlink_to(paths.run_root, target_is_directory=True)
+     except OSError:
+         shutil.copytree(paths.run_root, latest_root)
+
+     # Write/update current_run.json with a simple pointer
+     current_meta_path = serve_root / "current_run.json"
+     current_data: dict[str, Any] = {
+         "run_id": paths.run_id,
+         "run_root": str(paths.run_root),
+         "dataset_dir": str(paths.dataset_dir),
+         "updated_at": _now_utc_iso(),
+     }
+     with current_meta_path.open("w", encoding="utf-8") as f:
+         json.dump(current_data, f, indent=2, sort_keys=True)
+
+
+ __all__ = [
+     "RunPaths",
+     "RunMetadata",
+     "make_run_id",
+     "get_serve_root",
+     "get_run_paths",
+     "start_run_for_directory",
+     "finish_run",
+     "finish_run_success",
+     "finish_run_failed",
+     "set_latest_run",
+ ]
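
Note: the new datapipeline/services/runs.py gives serving a simple lifecycle: create a timestamped run directory, write the dataset into it, record success or failure in run.json, then repoint latest/. A minimal usage sketch built only from the functions added above; the serve directory path and the dummy output file are illustrative, not part of the package:

    from datapipeline.services.runs import (
        start_run_for_directory,
        finish_run_failed,
        finish_run_success,
        set_latest_run,
    )

    serve_dir = "data/processed/ticks"  # illustrative serve directory

    # Creates <serve_dir>/runs/<run_id>/dataset/ and run.json with status "running".
    paths, meta = start_run_for_directory(serve_dir, stage=2)
    try:
        # ... write the dataset into paths.dataset_dir (dummy file for the sketch) ...
        (paths.dataset_dir / "part-000.jsonl").write_text("{}\n", encoding="utf-8")
    except Exception as exc:
        finish_run_failed(paths, notes=str(exc))
        raise
    else:
        finish_run_success(paths)
        set_latest_run(paths)  # consumers now read <serve_dir>/latest/dataset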
datapipeline/services/scaffold/domain.py
@@ -9,6 +9,7 @@ from ..paths import pkg_root, resolve_base_pkg_dir
  def create_domain(*, domain: str, root: Optional[Path]) -> None:
      root_dir, name, _ = pkg_root(root)
      base = resolve_base_pkg_dir(root_dir, name)
+     package_name = base.name
      pkg_dir = base / "domains" / domain
      pkg_dir.mkdir(parents=True, exist_ok=True)
      (pkg_dir / "__init__.py").touch(exist_ok=True)
@@ -16,10 +17,10 @@ def create_domain(*, domain: str, root: Optional[Path]) -> None:
      def write_missing(path: Path, tpl: str, **ctx):
          if not path.exists():
              path.write_text(render(tpl, **ctx))
-             print(f"[new] Created: {path}")
+             print(f"[new] {path}")

      cls_ = "".join(w.capitalize() for w in domain.split("_"))
      parent = "TemporalRecord"
      write_missing(pkg_dir / "model.py", "record.py.j2",
-                   PACKAGE_NAME=name, DOMAIN=domain, CLASS_NAME=f"{cls_}Record",
+                   PACKAGE_NAME=package_name, DOMAIN=domain, CLASS_NAME=f"{cls_}Record",
                    PARENT_CLASS=parent, time_aware=True)
datapipeline/services/scaffold/filter.py
@@ -9,6 +9,7 @@ from ..paths import pkg_root, resolve_base_pkg_dir
  def create_filter(*, name: str, root: Optional[Path]) -> None:
      root_dir, pkg_name, _ = pkg_root(root)
      base = resolve_base_pkg_dir(root_dir, pkg_name)
+     package_name = base.name
      filters_dir = base / FILTERS_GROUP
      filters_dir.mkdir(parents=True, exist_ok=True)
      (filters_dir / "__init__.py").touch(exist_ok=True)
@@ -18,7 +19,7 @@ def create_filter(*, name: str, root: Optional[Path]) -> None:
      path = filters_dir / f"{module_name}.py"
      if not path.exists():
          path.write_text(render("filter.py.j2", FUNCTION_NAME=name))
-         print(f"[new] Created: {path}")
+         print(f"[new] {path}")

      # Register entry point under datapipeline.filters
      toml_path = root_dir / "pyproject.toml"
@@ -26,6 +27,6 @@ def create_filter(*, name: str, root: Optional[Path]) -> None:
          toml_path.read_text(),
          FILTERS_GROUP,
          name,
-         f"{pkg_name}.filters.{module_name}:{name}",
+         f"{package_name}.filters.{module_name}:{name}",
      )
      toml_path.write_text(toml)
datapipeline/services/scaffold/mappers.py
@@ -16,8 +16,11 @@ def _slug(s: str) -> str:
  def attach_source_to_domain(*, domain: str, provider: str, dataset: str, root: Optional[Path]) -> None:
      root_dir, name, pyproject = pkg_root(root)
      base = resolve_base_pkg_dir(root_dir, name)
+     package_name = base.name
      mappers_root = base / MAPPERS_GROUP
-     prov = _slug(provider); ds = _slug(dataset); dom = _slug(domain)
+     _ = _slug(provider)
+     ds = _slug(dataset)
+     dom = _slug(domain)

      # Option B layout: mappers/{provider}/{dataset}/to_{domain}.py
      pkg_dir = mappers_root / provider / dataset
@@ -32,7 +35,7 @@ def attach_source_to_domain(*, domain: str, provider: str, dataset: str, root: O
          function_name = "map"
          path.write_text(render(
              "mapper.py.j2",
-             PACKAGE_NAME=name,
+             PACKAGE_NAME=package_name,
              ORIGIN=provider,
              DATASET=dataset,
              TARGET_DOMAIN=dom,
@@ -42,11 +45,11 @@ def attach_source_to_domain(*, domain: str, provider: str, dataset: str, root: O
              OriginDTO=f"{camel(provider)}{camel(dataset)}DTO",
              time_aware=True,
          ))
-         print(f"[new] Created: {path}")
+         print(f"[new] {path}")

-     # Register the mapper EP as domain.provider (fallback to domain.provider.dataset on collision handled elsewhere)
-     ep_key = f"{dom}.{prov}"
-     ep_target = f"{name}.mappers.{provider}.{dataset}.{module_name}:map"
+     # Register the mapper EP as domain.dataset
+     ep_key = f"{dom}.{ds}"
+     ep_target = f"{package_name}.mappers.{provider}.{dataset}.{module_name}:map"
      toml = (root_dir / "pyproject.toml").read_text()
      toml = inject_ep(toml, MAPPERS_GROUP, ep_key, ep_target)
      (root_dir / "pyproject.toml").write_text(toml)
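
Note: the mapper entry point is now keyed by domain.dataset instead of domain.provider, and its target uses the resolved package directory name. A hypothetical example of the strings this produces (provider/dataset/domain names are invented; the actual registration is performed by inject_ep into pyproject.toml as shown above):

    provider, dataset, domain = "noaa", "stations", "weather"  # hypothetical slugs
    package_name = "my_plugin"                                  # base package directory name
    module_name = f"to_{domain}"

    ep_key = f"{domain}.{dataset}"      # "weather.stations" (previously keyed as "weather.noaa")
    ep_target = f"{package_name}.mappers.{provider}.{dataset}.{module_name}:map"
    # ep_target == "my_plugin.mappers.noaa.stations.to_weather:map"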
datapipeline/services/scaffold/plugin.py
@@ -1,7 +1,15 @@
  from importlib.resources import as_file, files
  from pathlib import Path
+ import logging
+ import os

- from ..constants import COMPOSED_LOADER_EP
+ import yaml
+
+ from datapipeline.utils.load import load_yaml
+
+ from ..constants import DEFAULT_IO_LOADER_EP
+
+ logger = logging.getLogger(__name__)

  _RESERVED_PACKAGE_NAMES = {"datapipeline"}

@@ -9,15 +17,13 @@ _RESERVED_PACKAGE_NAMES = {"datapipeline"}
  def _normalized_package_name(dist_name: str) -> str:
      package_name = dist_name.replace("-", "_")
      if package_name in _RESERVED_PACKAGE_NAMES:
-         print(
-             "[error] `datapipeline` is reserved for the core package. "
-             "Choose a different plugin name."
+         logger.error(
+             "`datapipeline` is reserved for the core package. Choose a different plugin name."
          )
          raise SystemExit(1)
      if not package_name.isidentifier():
-         print(
-             "[error] Plugin names must be valid Python identifiers once hyphens are replaced "
-             "with underscores."
+         logger.error(
+             "Plugin names must be valid Python identifiers once hyphens are replaced with underscores."
          )
          raise SystemExit(1)
      return package_name
@@ -26,7 +32,7 @@ def _normalized_package_name(dist_name: str) -> str:
  def scaffold_plugin(name: str, outdir: Path) -> None:
      target = (outdir / name).absolute()
      if target.exists():
-         print(f"[error] `{target}` already exists")
+         logger.error("`%s` already exists", target)
          raise SystemExit(1)
      import shutil

@@ -39,11 +45,49 @@ def scaffold_plugin(name: str, outdir: Path) -> None:
      replacements = {
          "{{PACKAGE_NAME}}": package_name,
          "{{DIST_NAME}}": name,
-         "{{COMPOSED_LOADER_EP}}": COMPOSED_LOADER_EP,
+         "{{DEFAULT_IO_LOADER_EP}}": DEFAULT_IO_LOADER_EP,
      }
      for p in (target / "pyproject.toml", target / "README.md"):
          text = p.read_text()
          for placeholder, value in replacements.items():
              text = text.replace(placeholder, value)
          p.write_text(text)
-     print(f"[new] Created plugin skeleton at {target}")
+
+     # Move jerry.yaml up to the workspace root (current working directory) so
+     # users can run the CLI from the workspace without cd'ing into the plugin.
+     # We adjust plugin_root and dataset paths to point at the plugin directory
+     # relative to the workspace. Do not overwrite an existing workspace
+     # jerry.yaml.
+     plugin_jerry = target / "jerry.yaml"
+     workspace_root = Path.cwd().resolve()
+     workspace_jerry = workspace_root / "jerry.yaml"
+     if plugin_jerry.exists() and not workspace_jerry.exists():
+         try:
+             plugin_root_rel = target.relative_to(workspace_root)
+         except ValueError:
+             # Fall back to a relative path between arbitrary directories; this
+             # may include ".." segments.
+             try:
+                 plugin_root_rel = Path(os.path.relpath(target, workspace_root))
+             except Exception:
+                 plugin_root_rel = target
+
+         data = load_yaml(plugin_jerry)
+         data["plugin_root"] = plugin_root_rel.as_posix()
+         datasets = data.get("datasets") or {}
+         updated_datasets = {}
+         for alias, path in datasets.items():
+             p = Path(path)
+             if p.is_absolute():
+                 updated_datasets[alias] = p.as_posix()
+             else:
+                 updated_datasets[alias] = (plugin_root_rel / p).as_posix()
+         data["datasets"] = updated_datasets
+
+         workspace_jerry.write_text(
+             yaml.safe_dump(data, sort_keys=False), encoding="utf-8"
+         )
+         plugin_jerry.unlink()
+         logger.info("workspace jerry.yaml created at %s", workspace_jerry)
+
+     logger.info("plugin skeleton created at %s", target)
datapipeline/services/scaffold/source.py
@@ -4,12 +4,13 @@ from typing import Optional

  from datapipeline.services.scaffold.templates import camel, render

- from ..constants import COMPOSED_LOADER_EP
+ from ..constants import DEFAULT_IO_LOADER_EP
  from ..entrypoints import inject_ep
  from ..paths import pkg_root, resolve_base_pkg_dir
  from datapipeline.services.project_paths import (
      sources_dir as resolve_sources_dir,
      ensure_project_scaffold,
+     resolve_project_yaml_path,
  )


@@ -19,14 +20,14 @@ def _class_prefix(provider: str, dataset: str) -> str:


  def _source_alias(provider: str, dataset: str) -> str:
-     return f"{provider}_{dataset}"
+     return f"{provider}.{dataset}"


  def _write_if_missing(path: Path, text: str) -> None:
      """Write file only if it does not exist; echo a friendly message."""
      if not path.exists():
          path.write_text(text)
-         print(f"[new] Created: {path}")
+         print(f"[new] {path}")


  def _render_loader_stub(transport: str, loader_class: str,
@@ -55,86 +56,120 @@ def _update_ep(toml_text: str, provider: str, dataset: str, pkg_name: str,
      return toml_text, ep_key


- def _loader_ep_and_args(transport: str, fmt: Optional[str], ep_key: str) -> tuple[str, dict]:
+ def _loader_ep_and_args(transport: str, fmt: Optional[str], ep_key: Optional[str]) -> tuple[str, dict]:
      """Return (loader EP name, default args) for the YAML snippet."""
      if transport == "fs":
          args = {
              "transport": "fs",
-             "format": fmt or "<FORMAT (csv|json|json-lines)>",
+             "format": fmt or "<FORMAT (csv|json|json-lines|pickle)>",
              "path": "<PATH OR GLOB>",
              "glob": False,
              "encoding": "utf-8",
          }
          if fmt == "csv":
              args["delimiter"] = ","
-         return COMPOSED_LOADER_EP, args
+         return DEFAULT_IO_LOADER_EP, args
      if transport == "synthetic":
+         if ep_key is None:
+             raise ValueError("synthetic transport requires scaffolding a loader entrypoint")
          return ep_key, {"start": "<ISO8601>", "end": "<ISO8601>", "frequency": "1h"}
-     if transport == "url":
+     if transport == "http":
          args = {
-             "transport": "url",
+             "transport": "http",
              "format": fmt or "<FORMAT (json|json-lines|csv)>",
              "url": "<https://api.example.com/data.json>",
              "headers": {},
+             "params": {},
              "encoding": "utf-8",
          }
          if fmt == "csv":
              args["delimiter"] = ","
-         return COMPOSED_LOADER_EP, args
+         return DEFAULT_IO_LOADER_EP, args
+     if ep_key is None:
+         raise ValueError(f"unsupported transport '{transport}' for identity scaffold")
      return ep_key, {}


- def create_source(*, provider: str, dataset: str, transport: str,
-                   format: Optional[str], root: Optional[Path]) -> None:
+ def create_source(
+     *,
+     provider: str,
+     dataset: str,
+     transport: str,
+     format: Optional[str],
+     root: Optional[Path],
+     identity: bool = False,
+ ) -> None:
      root_dir, name, _ = pkg_root(root)
      base = resolve_base_pkg_dir(root_dir, name)
-     src_pkg_dir = base / "sources" / provider / dataset
-     src_pkg_dir.mkdir(parents=True, exist_ok=True)
-     (src_pkg_dir / "__init__.py").touch(exist_ok=True)
-
-     class_prefix = _class_prefix(provider, dataset)
-     dto_class = f"{class_prefix}DTO"
-     parser_class = f"{class_prefix}Parser"
-     loader_class = f"{class_prefix}DataLoader"
-
-     # DTO
-     dto_path = src_pkg_dir / "dto.py"
-     _write_if_missing(dto_path, render(
-         "dto.py.j2",
-         PACKAGE_NAME=name, ORIGIN=provider, DOMAIN=dataset,
-         CLASS_NAME=dto_class, time_aware=True
-     ))
-
-     # Parser
-     parser_path = src_pkg_dir / "parser.py"
-     _write_if_missing(parser_path, render(
-         "parser.py.j2",
-         PACKAGE_NAME=name, ORIGIN=provider, DOMAIN=dataset,
-         CLASS_NAME=parser_class, DTO_CLASS=dto_class, time_aware=True
-     ))
-
-     # Optional loader stub: synthetic (url uses composed loader by default)
-     if transport in {"synthetic"}:
-         loader_path = src_pkg_dir / "loader.py"
-         stub = _render_loader_stub(transport, loader_class, fmt=format)
-         if stub is not None:
-             _write_if_missing(loader_path, stub)
-
-     toml_path = root_dir / "pyproject.toml"
-     toml_text, ep_key = _update_ep(
-         toml_path.read_text(),
-         provider, dataset, name,
-         transport, parser_class, loader_class
-     )
-     toml_path.write_text(toml_text)
+     package_name = base.name

      alias = _source_alias(provider, dataset)
+     parser_ep: str
+     parser_args: dict
+     ep_key: Optional[str] = None
+
+     if identity:
+         if transport == "synthetic":
+             raise ValueError(
+                 "identity parser scaffold is not supported for synthetic sources; "
+                 "generate the standard parser instead."
+             )
+         parser_ep = "identity"
+         parser_args = {}
+     else:
+         src_pkg_dir = base / "sources" / provider / dataset
+         src_pkg_dir.mkdir(parents=True, exist_ok=True)
+         (src_pkg_dir / "__init__.py").touch(exist_ok=True)
+
+         class_prefix = _class_prefix(provider, dataset)
+         dto_class = f"{class_prefix}DTO"
+         parser_class = f"{class_prefix}Parser"
+         loader_class = f"{class_prefix}DataLoader"
+
+         # DTO
+         dto_path = src_pkg_dir / "dto.py"
+         _write_if_missing(dto_path, render(
+             "dto.py.j2",
+             PACKAGE_NAME=package_name, ORIGIN=provider, DOMAIN=dataset,
+             CLASS_NAME=dto_class, time_aware=True
+         ))
+
+         # Parser
+         parser_path = src_pkg_dir / "parser.py"
+         _write_if_missing(parser_path, render(
+             "parser.py.j2",
+             PACKAGE_NAME=package_name, ORIGIN=provider, DOMAIN=dataset,
+             CLASS_NAME=parser_class, DTO_CLASS=dto_class, time_aware=True
+         ))
+
+         # Optional loader stub: synthetic (http uses core IO loader by default)
+         if transport in {"synthetic"}:
+             loader_path = src_pkg_dir / "loader.py"
+             stub = _render_loader_stub(transport, loader_class, fmt=format)
+             if stub is not None:
+                 _write_if_missing(loader_path, stub)
+
+         toml_path = root_dir / "pyproject.toml"
+         toml_text, ep_key = _update_ep(
+             toml_path.read_text(),
+             provider,
+             dataset,
+             package_name,
+             transport,
+             parser_class,
+             loader_class,
+         )
+         toml_path.write_text(toml_text)
+
+         parser_ep = ep_key
+         parser_args = {}
+
      loader_ep, loader_args = _loader_ep_and_args(transport, format, ep_key)

      # Resolve sources directory from a single dataset-scoped project config.
      # If not present or invalid, let the exception bubble up to prompt the user
      # to provide a valid project path.
-     proj_yaml = root_dir / "config" / "datasets" / "default" / "project.yaml"
+     proj_yaml = resolve_project_yaml_path(root_dir)
      # Best-effort: create a minimal project scaffold if missing
      ensure_project_scaffold(proj_yaml)
      sources_dir = resolve_sources_dir(proj_yaml).resolve()
@@ -143,11 +178,13 @@ def create_source(*, provider: str, dataset: str, transport: str,
      if not src_cfg_path.exists():
          src_cfg_path.write_text(render(
              "source.yaml.j2",
-             source_id=alias,
-             parser_ep=ep_key,
-             parser_args={},
+             id=alias,
+             parser_ep=parser_ep,
+             parser_args=parser_args,
              loader_ep=loader_ep,
              loader_args=loader_args,
-             composed_loader_ep=COMPOSED_LOADER_EP,
+             default_io_loader_ep=DEFAULT_IO_LOADER_EP,
          ))
-         print(f"[new] Created: {src_cfg_path.resolve()}")
+         print(f"[new] {src_cfg_path.resolve()}")
+     elif identity:
+         print(f"[info] Source YAML already exists; skipped identity scaffold at {src_cfg_path.resolve()}")
datapipeline/sources/{composed_loader.py → data_loader.py}
@@ -1,21 +1,21 @@
  from __future__ import annotations

  from typing import Iterator, Any, Optional
- from .models.loader import RawDataLoader
- from .transports import TextSource, UrlSource
+ from .models.loader import BaseDataLoader
+ from .transports import Transport, HttpTransport
  from .decoders import Decoder


- class ComposedRawLoader(RawDataLoader):
-     """Compose a transport TextSource with a row Decoder."""
+ class DataLoader(BaseDataLoader):
+     """Compose a Transport with a row Decoder."""

-     def __init__(self, source: TextSource, decoder: Decoder, *, allow_network_count: bool = False):
-         self.source = source
+     def __init__(self, transport: Transport, decoder: Decoder, *, allow_network_count: bool = False):
+         self.transport = transport
          self.decoder = decoder
          self._allow_net_count = bool(allow_network_count)

      def load(self) -> Iterator[Any]:
-         for stream in self.source.streams():
+         for stream in self.transport.streams():
              for row in self.decoder.decode(stream):
                  yield row

@@ -23,11 +23,11 @@ class ComposedRawLoader(RawDataLoader):
          # Delegate counting to the decoder using the transport streams.
          # Avoid counting over network unless explicitly enabled.
          try:
-             if isinstance(self.source, UrlSource) and not self._allow_net_count:
+             if isinstance(self.transport, HttpTransport) and not self._allow_net_count:
                  return None
              total = 0
              any_stream = False
-             for stream in self.source.streams():
+             for stream in self.transport.streams():
                  any_stream = True
                  c = self.decoder.count(stream)
                  if c is None:
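
Note: after the rename, loading reads as transport → decoder → rows: a Transport yields streams, a Decoder turns each stream into rows, and DataLoader simply chains the two, mirroring the load() shown above. A toy, duck-typed illustration; ListTransport and LineDecoder are invented stand-ins, not the real implementations in datapipeline/sources/transports.py and decoders.py:

    import io
    from typing import Any, Iterator

    from datapipeline.sources.data_loader import DataLoader  # module path per the rename above


    class ListTransport:
        """Stand-in transport: serves a fixed list of in-memory text streams."""

        def __init__(self, payloads: list[str]) -> None:
            self.payloads = payloads

        def streams(self) -> Iterator[io.StringIO]:
            for payload in self.payloads:
                yield io.StringIO(payload)


    class LineDecoder:
        """Stand-in decoder: one row per non-empty line; counting unsupported."""

        def decode(self, stream: io.StringIO) -> Iterator[Any]:
            for line in stream:
                if line.strip():
                    yield line.strip()

        def count(self, stream: io.StringIO) -> None:
            return None


    loader = DataLoader(ListTransport(["a\nb\n", "c\n"]), LineDecoder())
    print(list(loader.load()))  # ['a', 'b', 'c']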