jerry-thomas 0.3.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (164)
  1. datapipeline/analysis/vector/collector.py +120 -17
  2. datapipeline/analysis/vector/matrix.py +33 -8
  3. datapipeline/analysis/vector/report.py +162 -32
  4. datapipeline/build/tasks/__init__.py +11 -0
  5. datapipeline/build/tasks/config.py +74 -0
  6. datapipeline/build/tasks/metadata.py +170 -0
  7. datapipeline/build/tasks/scaler.py +73 -0
  8. datapipeline/build/tasks/schema.py +60 -0
  9. datapipeline/build/tasks/utils.py +169 -0
  10. datapipeline/cli/app.py +304 -127
  11. datapipeline/cli/commands/build.py +240 -16
  12. datapipeline/cli/commands/contract.py +367 -0
  13. datapipeline/cli/commands/domain.py +8 -3
  14. datapipeline/cli/commands/inspect.py +401 -149
  15. datapipeline/cli/commands/list_.py +30 -7
  16. datapipeline/cli/commands/plugin.py +5 -1
  17. datapipeline/cli/commands/run.py +227 -241
  18. datapipeline/cli/commands/run_config.py +101 -0
  19. datapipeline/cli/commands/serve_pipeline.py +156 -0
  20. datapipeline/cli/commands/source.py +44 -8
  21. datapipeline/cli/visuals/__init__.py +4 -2
  22. datapipeline/cli/visuals/common.py +239 -0
  23. datapipeline/cli/visuals/labels.py +15 -15
  24. datapipeline/cli/visuals/runner.py +66 -0
  25. datapipeline/cli/visuals/sections.py +20 -0
  26. datapipeline/cli/visuals/sources.py +132 -119
  27. datapipeline/cli/visuals/sources_basic.py +260 -0
  28. datapipeline/cli/visuals/sources_off.py +76 -0
  29. datapipeline/cli/visuals/sources_rich.py +414 -0
  30. datapipeline/config/catalog.py +37 -3
  31. datapipeline/config/context.py +214 -0
  32. datapipeline/config/dataset/loader.py +21 -4
  33. datapipeline/config/dataset/normalize.py +4 -4
  34. datapipeline/config/metadata.py +43 -0
  35. datapipeline/config/postprocess.py +2 -2
  36. datapipeline/config/project.py +3 -2
  37. datapipeline/config/resolution.py +129 -0
  38. datapipeline/config/tasks.py +309 -0
  39. datapipeline/config/workspace.py +155 -0
  40. datapipeline/domain/__init__.py +12 -0
  41. datapipeline/domain/record.py +11 -0
  42. datapipeline/domain/sample.py +54 -0
  43. datapipeline/integrations/ml/adapter.py +34 -20
  44. datapipeline/integrations/ml/pandas_support.py +0 -2
  45. datapipeline/integrations/ml/rows.py +1 -6
  46. datapipeline/integrations/ml/torch_support.py +1 -3
  47. datapipeline/io/factory.py +112 -0
  48. datapipeline/io/output.py +132 -0
  49. datapipeline/io/protocols.py +21 -0
  50. datapipeline/io/serializers.py +219 -0
  51. datapipeline/io/sinks/__init__.py +23 -0
  52. datapipeline/io/sinks/base.py +2 -0
  53. datapipeline/io/sinks/files.py +79 -0
  54. datapipeline/io/sinks/rich.py +57 -0
  55. datapipeline/io/sinks/stdout.py +18 -0
  56. datapipeline/io/writers/__init__.py +14 -0
  57. datapipeline/io/writers/base.py +28 -0
  58. datapipeline/io/writers/csv_writer.py +25 -0
  59. datapipeline/io/writers/jsonl.py +52 -0
  60. datapipeline/io/writers/pickle_writer.py +30 -0
  61. datapipeline/pipeline/artifacts.py +58 -0
  62. datapipeline/pipeline/context.py +66 -7
  63. datapipeline/pipeline/observability.py +65 -0
  64. datapipeline/pipeline/pipelines.py +65 -13
  65. datapipeline/pipeline/split.py +11 -10
  66. datapipeline/pipeline/stages.py +127 -16
  67. datapipeline/pipeline/utils/keygen.py +20 -7
  68. datapipeline/pipeline/utils/memory_sort.py +22 -10
  69. datapipeline/pipeline/utils/transform_utils.py +22 -0
  70. datapipeline/runtime.py +5 -2
  71. datapipeline/services/artifacts.py +12 -6
  72. datapipeline/services/bootstrap/config.py +25 -0
  73. datapipeline/services/bootstrap/core.py +52 -37
  74. datapipeline/services/constants.py +6 -5
  75. datapipeline/services/factories.py +123 -1
  76. datapipeline/services/project_paths.py +43 -16
  77. datapipeline/services/runs.py +208 -0
  78. datapipeline/services/scaffold/domain.py +3 -2
  79. datapipeline/services/scaffold/filter.py +3 -2
  80. datapipeline/services/scaffold/mappers.py +9 -6
  81. datapipeline/services/scaffold/plugin.py +54 -10
  82. datapipeline/services/scaffold/source.py +93 -56
  83. datapipeline/sources/{composed_loader.py → data_loader.py} +9 -9
  84. datapipeline/sources/decoders.py +83 -18
  85. datapipeline/sources/factory.py +26 -16
  86. datapipeline/sources/models/__init__.py +2 -2
  87. datapipeline/sources/models/generator.py +0 -7
  88. datapipeline/sources/models/loader.py +3 -3
  89. datapipeline/sources/models/parsing_error.py +24 -0
  90. datapipeline/sources/models/source.py +6 -6
  91. datapipeline/sources/synthetic/time/loader.py +14 -2
  92. datapipeline/sources/transports.py +74 -37
  93. datapipeline/templates/plugin_skeleton/README.md +76 -30
  94. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +31 -0
  95. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +30 -0
  96. datapipeline/templates/plugin_skeleton/example/dataset.yaml +18 -0
  97. datapipeline/templates/plugin_skeleton/example/postprocess.yaml +29 -0
  98. datapipeline/templates/plugin_skeleton/{config/datasets/default → example}/project.yaml +11 -8
  99. datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +12 -0
  100. datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +3 -0
  101. datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +9 -0
  102. datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +2 -0
  103. datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +4 -0
  104. datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +28 -0
  105. datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +4 -0
  106. datapipeline/templates/plugin_skeleton/jerry.yaml +34 -0
  107. datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +31 -0
  108. datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +30 -0
  109. datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +18 -0
  110. datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +29 -0
  111. datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +22 -0
  112. datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +12 -0
  113. datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +3 -0
  114. datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +9 -0
  115. datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +2 -0
  116. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +4 -0
  117. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +28 -0
  118. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +4 -0
  119. datapipeline/templates/stubs/dto.py.j2 +2 -0
  120. datapipeline/templates/stubs/mapper.py.j2 +5 -4
  121. datapipeline/templates/stubs/parser.py.j2 +2 -0
  122. datapipeline/templates/stubs/record.py.j2 +2 -0
  123. datapipeline/templates/stubs/source.yaml.j2 +2 -3
  124. datapipeline/transforms/debug/lint.py +26 -41
  125. datapipeline/transforms/feature/scaler.py +89 -13
  126. datapipeline/transforms/record/floor_time.py +4 -4
  127. datapipeline/transforms/sequence.py +2 -35
  128. datapipeline/transforms/stream/dedupe.py +24 -0
  129. datapipeline/transforms/stream/ensure_ticks.py +7 -6
  130. datapipeline/transforms/vector/__init__.py +5 -0
  131. datapipeline/transforms/vector/common.py +98 -0
  132. datapipeline/transforms/vector/drop/__init__.py +4 -0
  133. datapipeline/transforms/vector/drop/horizontal.py +79 -0
  134. datapipeline/transforms/vector/drop/orchestrator.py +59 -0
  135. datapipeline/transforms/vector/drop/vertical.py +182 -0
  136. datapipeline/transforms/vector/ensure_schema.py +184 -0
  137. datapipeline/transforms/vector/fill.py +87 -0
  138. datapipeline/transforms/vector/replace.py +62 -0
  139. datapipeline/utils/load.py +24 -3
  140. datapipeline/utils/rich_compat.py +38 -0
  141. datapipeline/utils/window.py +76 -0
  142. jerry_thomas-1.0.1.dist-info/METADATA +825 -0
  143. jerry_thomas-1.0.1.dist-info/RECORD +199 -0
  144. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/entry_points.txt +9 -8
  145. datapipeline/build/tasks.py +0 -186
  146. datapipeline/cli/commands/link.py +0 -128
  147. datapipeline/cli/commands/writers.py +0 -138
  148. datapipeline/config/build.py +0 -64
  149. datapipeline/config/run.py +0 -116
  150. datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.synthetic.yaml +0 -24
  151. datapipeline/templates/plugin_skeleton/config/contracts/time_linear.synthetic.yaml +0 -23
  152. datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +0 -9
  153. datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +0 -14
  154. datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +0 -13
  155. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +0 -10
  156. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +0 -10
  157. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +0 -10
  158. datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml +0 -11
  159. datapipeline/transforms/vector.py +0 -210
  160. jerry_thomas-0.3.0.dist-info/METADATA +0 -502
  161. jerry_thomas-0.3.0.dist-info/RECORD +0 -139
  162. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/WHEEL +0 -0
  163. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/licenses/LICENSE +0 -0
  164. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.1.dist-info}/top_level.txt +0 -0
datapipeline/runtime.py CHANGED
@@ -1,8 +1,9 @@
  from dataclasses import dataclass, field
  from pathlib import Path
  from typing import Any, List, Mapping, Optional, Sequence, Union
+ from datetime import datetime

- from datapipeline.config.run import RunConfig
+ from datapipeline.config.tasks import ServeTask
  from datapipeline.config.split import SplitConfig

  from datapipeline.registries.registry import Registry
@@ -66,7 +67,9 @@ class Runtime:
  registries: Registries = field(default_factory=Registries)
  split: Optional[SplitConfig] = None
  split_keep: Optional[str] = None
- run: Optional[RunConfig] = None
+ run: Optional[ServeTask] = None
+ schema_required: bool = True
+ window_bounds: tuple[datetime | None, datetime | None] | None = None
  artifacts: ArtifactManager = field(init=False)

  def __post_init__(self) -> None:
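
For orientation: window_bounds is an optional (start, end) pair in which either side may be None for an open-ended bound. A minimal sketch of that semantics, assuming inclusive bounds; the helper below is illustrative and not part of the package:

from datetime import datetime, timezone

def in_window(ts: datetime, bounds: tuple[datetime | None, datetime | None] | None) -> bool:
    # No bounds configured -> accept everything; otherwise clamp on each side.
    if bounds is None:
        return True
    start, end = bounds
    return (start is None or ts >= start) and (end is None or ts <= end)

bounds = (datetime(2021, 1, 1, tzinfo=timezone.utc), None)  # open-ended upper bound
assert in_window(datetime(2021, 6, 1, tzinfo=timezone.utc), bounds)
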
datapipeline/services/artifacts.py CHANGED
@@ -1,10 +1,11 @@
  from __future__ import annotations

  from dataclasses import dataclass
+ import json
  from pathlib import Path
  from typing import Any, Callable, Dict, Generic, Mapping, Optional, TypeVar

- from datapipeline.services.constants import PARTIONED_IDS
+ from datapipeline.services.constants import VECTOR_SCHEMA, VECTOR_SCHEMA_METADATA

  ArtifactValue = TypeVar("ArtifactValue")

@@ -85,12 +86,17 @@ class ArtifactManager:
  raise RuntimeError(message) from exc


- def _read_expected_ids(path: Path) -> list[str]:
+ def _read_schema(path: Path) -> dict:
  with path.open("r", encoding="utf-8") as fh:
- return [line.strip() for line in fh if line.strip()]
+ return json.load(fh)


- PARTITIONED_IDS_SPEC = ArtifactSpec[list[str]](
- key=PARTIONED_IDS,
- loader=_read_expected_ids,
+ VECTOR_SCHEMA_SPEC = ArtifactSpec[dict](
+ key=VECTOR_SCHEMA,
+ loader=_read_schema,
+ )
+
+ VECTOR_METADATA_SPEC = ArtifactSpec[dict](
+ key=VECTOR_SCHEMA_METADATA,
+ loader=_read_schema,
  )
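
The spec pattern above pairs an artifact key with a loader callable that turns a file path into the loaded value. A sketch of declaring another dict-backed spec in the same shape; the key name and loader below are hypothetical, only the ArtifactSpec[dict](key=..., loader=...) form is taken from the hunk:

import json
from pathlib import Path

def _read_json(path: Path) -> dict:
    # Same loader shape as _read_schema above.
    with path.open("r", encoding="utf-8") as fh:
        return json.load(fh)

# MY_JSON_SPEC = ArtifactSpec[dict](key="my_artifact", loader=_read_json)
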
datapipeline/services/bootstrap/config.py CHANGED
@@ -41,6 +41,12 @@ def _project_vars(data: dict) -> dict[str, Any]:
  if name:
  vars_["project"] = str(name)
  vars_["project_name"] = str(name)
+
+ version = data.get("version")
+ if version is not None:
+ vars_["version"] = str(version)
+ vars_["project_version"] = str(version)
+
  globals_ = data.get("globals") or {}
  for k, v in globals_.items():
  vars_[str(k)] = _serialize_global_value(v)
@@ -64,6 +70,24 @@ def artifacts_root(project_yaml: Path) -> Path:
  return (pj.parent / ap).resolve() if not ap.is_absolute() else ap


+ def run_root(project_yaml: Path, run_id: str | None = None) -> Path:
+ """Return a per-run artifacts directory under the project artifacts root.
+
+ Example:
+ artifacts_root: /.../artifacts/my_dataset/v3
+ run_root: /.../artifacts/my_dataset/v3/runs/2025-11-29T14-15-23Z
+ """
+ base = artifacts_root(project_yaml)
+
+ if run_id is None:
+ ts = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H-%M-%SZ")
+ run_id = ts
+
+ root = (base / "runs" / run_id).resolve()
+ root.mkdir(parents=True, exist_ok=True)
+ return root
+
+
  def _load_by_key(
  project_yaml: Path,
  key: str,
@@ -131,6 +155,7 @@ def _interpolate(obj, vars_: dict[str, Any]):

  __all__ = [
  "artifacts_root",
+ "run_root",
  "_globals",
  "_interpolate",
  "_load_by_key",
datapipeline/services/bootstrap/core.py CHANGED
@@ -1,15 +1,14 @@
  from pathlib import Path
- from typing import Any, Mapping
+ from typing import Any

  from datapipeline.utils.load import load_yaml
  from datapipeline.config.catalog import StreamsConfig
- from datapipeline.config.run import load_run_config
+ from datapipeline.config.tasks import default_serve_task
  from datapipeline.services.project_paths import streams_dir, sources_dir
  from datapipeline.build.state import load_build_state
  from datapipeline.services.constants import (
  PARSER_KEY,
  LOADER_KEY,
- SOURCE_KEY,
  SOURCE_ID_KEY,
  MAPPER_KEY,
  ENTRYPOINT_KEY,
@@ -19,6 +18,7 @@ from datapipeline.services.constants import (
  from datapipeline.services.factories import (
  build_source_from_spec,
  build_mapper_from_spec,
+ build_composed_source,
  )

  from datapipeline.runtime import Runtime
@@ -28,9 +28,7 @@ from .config import (
  _globals,
  _interpolate,
  _load_by_key,
- _paths,
  _project,
- _project_vars,
  )


@@ -41,26 +39,28 @@ SRC_LOADER_KEY = LOADER_KEY
  def _load_sources_from_dir(project_yaml: Path, vars_: dict[str, Any]) -> dict:
  """Aggregate per-source YAML files into a raw-sources mapping.

- Expects each file to define a single source with top-level 'parser' and
- 'loader' keys. The source alias is inferred from the filename (without
- extension).
+ Scans for YAML files under the sources directory (recursing through
+ subfolders). Expects each file to define a single source with top-level
+ 'parser' and 'loader' keys. The top-level 'id' inside the file becomes the
+ runtime alias.
  """
- import os
  src_dir = sources_dir(project_yaml)
  if not src_dir.exists() or not src_dir.is_dir():
  return {}
  out: dict[str, dict] = {}
- for fname in sorted(os.listdir(src_dir)):
- if not (fname.endswith(".yaml") or fname.endswith(".yml")):
- continue
- data = load_yaml(src_dir / fname)
+ candidates = sorted(
+ (p for p in src_dir.rglob("*.y*ml") if p.is_file()),
+ key=lambda p: p.relative_to(src_dir).as_posix(),
+ )
+ for path in candidates:
+ data = load_yaml(path)
  if not isinstance(data, dict):
  continue
  if isinstance(data.get(SRC_PARSER_KEY), dict) and isinstance(data.get(SRC_LOADER_KEY), dict):
  alias = data.get(SOURCE_ID_KEY)
  if not alias:
  raise ValueError(
- f"Missing 'source_id' in source file: {fname}")
+ f"Missing 'id' in source file: {path.relative_to(src_dir)}")
  out[alias] = _interpolate(data, vars_)
  continue
  return out
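
For reference, the document shapes accepted by this source loader and by the contract loader in the next hunk, written as the dicts load_yaml would return; the ids and entrypoints are placeholders, not values shipped with the package:

# Per-source file under the sources directory: top-level 'parser' and 'loader'
# dicts are required, and 'id' becomes the runtime alias.
source_doc = {
    "id": "synthetic.ticks",
    "parser": {"entrypoint": "..."},
    "loader": {"entrypoint": "...", "args": {}},
}

# Contracts must declare kind: 'ingest' (requires 'source') or 'composed' (requires 'inputs').
ingest_contract = {
    "kind": "ingest",
    "id": "time.ticks.linear",
    "source": "synthetic.ticks",
    "mapper": {"entrypoint": "..."},
    "cadence": "${group_by}",  # optional; re-exposed to the same contract as ${cadence}
}
composed_contract = {
    "kind": "composed",
    "id": "time.ticks.hour_sin",
    "inputs": ["ticks=time.ticks.linear"],  # "[alias=]stream_id" grammar
    "mapper": {"entrypoint": "..."},        # composer entrypoint
}
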
@@ -81,13 +81,32 @@ def _load_canonical_streams(project_yaml: Path, vars_: dict[str, Any]) -> dict:
  if not p.is_file():
  continue
  data = load_yaml(p)
- # Require explicit ids: stream_id and source_id
- if isinstance(data, dict) and (SOURCE_ID_KEY in data) and (STREAM_ID_KEY in data):
- m = data.get(MAPPER_KEY)
- if (not isinstance(m, dict)) or (ENTRYPOINT_KEY not in (m or {})):
- data[MAPPER_KEY] = None
- alias = data.get(STREAM_ID_KEY)
- out[alias] = _interpolate(data, vars_)
+ # Contracts must declare kind: 'ingest' | 'composed'
+ if not isinstance(data, dict):
+ continue
+ kind = data.get("kind")
+ if kind not in {"ingest", "composed"}:
+ continue
+ if (STREAM_ID_KEY not in data):
+ continue
+ if kind == "ingest" and ("source" not in data):
+ continue
+ if kind == "composed" and ("inputs" not in data):
+ continue
+ m = data.get(MAPPER_KEY)
+ if (not isinstance(m, dict)) or (ENTRYPOINT_KEY not in (m or {})):
+ data[MAPPER_KEY] = None
+ # Support simple per-contract variables like 'cadence' while keeping
+ # project-level globals as the single source of truth for shared values.
+ local_vars = dict(vars_)
+ cadence_expr = data.get("cadence")
+ if cadence_expr is not None:
+ # Allow cadence to reference globals (e.g. ${group_by}) while also
+ # making ${cadence} usable elsewhere in the same contract.
+ resolved_cadence = _interpolate(cadence_expr, vars_)
+ local_vars["cadence"] = resolved_cadence
+ alias = data.get(STREAM_ID_KEY)
+ out[alias] = _interpolate(data, local_vars)
  return out


@@ -101,16 +120,7 @@ def load_streams(project_yaml: Path) -> StreamsConfig:

  def init_streams(cfg: StreamsConfig, runtime: Runtime) -> None:
  """Compile typed streams config into runtime registries."""
  regs = runtime.registries
- regs.stream_operations.clear()
- regs.debug_operations.clear()
- regs.partition_by.clear()
- regs.sort_batch_size.clear()
- regs.record_operations.clear()
- regs.feature_transforms.clear()
- regs.postprocesses.clear()
- regs.sources.clear()
- regs.mappers.clear()
- regs.stream_sources.clear()
+ regs.clear_all()

  # Register per-stream policies and record transforms for runtime lookups
  for alias, spec in (cfg.contracts or {}).items():
@@ -124,9 +134,16 @@ def init_streams(cfg: StreamsConfig, runtime: Runtime) -> None:
  for alias, spec in (cfg.raw or {}).items():
  regs.sources.register(alias, build_source_from_spec(spec))
  for alias, spec in (cfg.contracts or {}).items():
- mapper = build_mapper_from_spec(spec.mapper)
- regs.mappers.register(alias, mapper)
- regs.stream_sources.register(alias, regs.sources.get(spec.source_id))
+ if getattr(spec, "kind", None) == "composed":
+ # Composed stream: register virtual source and identity mapper
+ regs.stream_sources.register(
+ alias, build_composed_source(alias, spec, runtime)
+ )
+ regs.mappers.register(alias, build_mapper_from_spec(None))
+ else:
+ mapper = build_mapper_from_spec(spec.mapper)
+ regs.mappers.register(alias, mapper)
+ regs.stream_sources.register(alias, regs.sources.get(spec.source))


  def bootstrap(project_yaml: Path) -> Runtime:
@@ -146,9 +163,7 @@ def bootstrap(project_yaml: Path) -> Runtime:
  runtime.split = None

  try:
- runtime.run = load_run_config(project_yaml)
- except FileNotFoundError:
- runtime.run = None
+ runtime.run = default_serve_task(project_yaml)
  except Exception:
  runtime.run = None

datapipeline/services/constants.py CHANGED
@@ -1,19 +1,20 @@
  PARSER_KEY = "parser"
  LOADER_KEY = "loader"
  SOURCE_KEY = "source"
- SOURCE_ID_KEY = "source_id"
+ SOURCE_ID_KEY = "id"
  MAPPER_KEY = "mapper"
  ENTRYPOINT_KEY = "entrypoint"
  ARGS_KEY = "args"
- STREAM_ID_KEY = "stream_id"
+ STREAM_ID_KEY = "id"

  PARSERS_GROUP = "parsers"
  LOADERS_GROUP = "loaders"
  MAPPERS_GROUP = "mappers"
  FILTERS_GROUP = "filters"
- COMPOSED_LOADER_EP = "composed.loader"
+ DEFAULT_IO_LOADER_EP = "core.io"

- #POSTPROCESS_GLOBAL_KEY = "__global__"
+ # POSTPROCESS_GLOBAL_KEY = "__global__"
  POSTPROCESS_TRANSFORMS = "transforms"
- PARTIONED_IDS = "partitioned_ids"
  SCALER_STATISTICS = "scaler_statistics"
+ VECTOR_SCHEMA = "vector_schema"
+ VECTOR_SCHEMA_METADATA = "vector_schema_metadata"
datapipeline/services/factories.py CHANGED
@@ -1,9 +1,16 @@
  from datapipeline.utils.load import load_ep
  from datapipeline.plugins import PARSERS_EP, LOADERS_EP, MAPPERS_EP
  from datapipeline.sources.models.source import Source
- from datapipeline.config.catalog import SourceConfig, EPArgs
+ from datapipeline.config.catalog import SourceConfig, EPArgs, ContractConfig
  from datapipeline.mappers.noop import identity
  from datapipeline.utils.placeholders import normalize_args
+ from datapipeline.sources.models.base import SourceInterface
+ from datapipeline.pipeline.context import PipelineContext
+ from datapipeline.config.dataset.feature import FeatureRecordConfig
+ from datapipeline.pipeline.pipelines import build_feature_pipeline
+ from datapipeline.pipeline.utils.transform_utils import _supports_parameter
+ from inspect import isclass
+ from typing import Iterator, Any, Optional


  def build_source_from_spec(spec: SourceConfig) -> Source:
@@ -23,3 +30,118 @@ def build_mapper_from_spec(spec: EPArgs | None):
  if args:
  return lambda raw: fn(raw, **args)
  return fn
+
+
+ class _ComposedSource(SourceInterface):
+ def __init__(self, *, runtime, stream_id: str, spec: ContractConfig):
+ self._runtime = runtime
+ self._stream_id = stream_id
+ self._spec = spec
+
+ def stream(self):
+ context = PipelineContext(self._runtime)
+ raw_inputs = self._spec.inputs
+ input_specs = list(raw_inputs or [])
+ if not input_specs:
+ return iter(())
+
+ # Resolve inputs: "[alias=]stream_id" (streams only)
+ resolved = self._resolve_inputs(context, input_specs)
+ aligned = {k: v for k, v in resolved.items() if v["aligned"]}
+ aux = {k: v for k, v in resolved.items() if not v["aligned"]}
+
+ # Build aligned/aux iterators (unwrap FeatureRecord -> record for aligned)
+ aligned_iters: dict[str, Iterator[Any]] = {
+ k: (fr.record for fr in v["iter"])  # stage>=3 yields FeatureRecord
+ for k, v in aligned.items()
+ }
+ aux_iters: dict[str, Iterator[Any]] = {
+ k: v["iter"] for k, v in aux.items()}
+
+ # Load mapper (composer) from contract
+ mapper = self._spec.mapper
+ if not mapper or not mapper.entrypoint:
+ raise ValueError(
+ f"Composed stream '{self._stream_id}' requires mapper.entrypoint composer"
+ )
+ ep = load_ep(MAPPERS_EP, mapper.entrypoint)
+ kwargs = normalize_args(mapper.args)
+
+ # Choose driver among aligned inputs
+ aligned_keys = list(aligned_iters.keys())
+ if not aligned_keys:
+ driver_key = None
+ else:
+ driver_key = kwargs.pop("driver", None) or aligned_keys[0]
+
+ # Mapper adapters: Simple vs Advanced
+ if not isclass(ep) and not _supports_parameter(ep, "inputs"):
+ # Simple: expect a single iterator when exactly one aligned input and no aux
+ if len(aligned_iters) == 1 and not aux_iters:
+ single_iter = next(iter(aligned_iters.values()))
+ for rec in ep(single_iter):
+ yield getattr(rec, "record", rec)
+ return
+ raise TypeError(
+ "Mapper must accept inputs=... for multi-input or aux-enabled contracts"
+ )
+
+ # Advanced: pass inputs / aux / driver / context when supported
+ call_kwargs = dict(kwargs)
+ if _supports_parameter(ep, "context") and "context" not in call_kwargs:
+ call_kwargs["context"] = context
+ if _supports_parameter(ep, "aux"):
+ call_kwargs["aux"] = aux_iters
+ if driver_key and _supports_parameter(ep, "driver"):
+ call_kwargs["driver"] = driver_key
+
+ if isclass(ep):
+ inst = ep(**call_kwargs) if call_kwargs else ep()
+ binder = getattr(inst, "bind_context", None)
+ if callable(binder):
+ binder(context)
+ for rec in inst(inputs=aligned_iters):
+ yield getattr(rec, "record", rec)
+ return
+
+ for rec in ep(inputs=aligned_iters, **call_kwargs):
+ yield getattr(rec, "record", rec)
+
+ def _resolve_inputs(self, context: PipelineContext, specs: list[str]):
+ """Parse and resolve composed inputs into iterators.
+
+ Grammar: "[alias=]stream_id" only. All inputs are built to stage 4
+ and are alignable (FeatureRecord -> domain record unwrapped).
+ """
+ runtime = context.runtime
+ known_streams = set(runtime.registries.stream_sources.keys())
+
+ out: dict[str, dict] = {}
+ for spec in specs:
+ alias, ref = self._parse_input(spec)
+ if ref not in known_streams:
+ raise ValueError(
+ f"Unknown input stream '{ref}'. Known streams: {sorted(known_streams)}"
+ )
+ cfg = FeatureRecordConfig(record_stream=ref, id=alias)
+ it = build_feature_pipeline(context, cfg, stage=4)
+ out[alias] = {"iter": it, "aligned": True}
+
+ return out
+
+ @staticmethod
+ def _parse_input(text: str) -> tuple[str, str]:
+ # alias=stream_id
+ if "@" in text:
+ raise ValueError(
+ "composed inputs may not include '@stage'; streams align by default")
+ alias: Optional[str] = None
+ if "=" in text:
+ alias, text = text.split("=", 1)
+ ref = text
+ alias = alias or ref
+ return alias, ref
+
+
+ def build_composed_source(stream_id: str, spec: ContractConfig, runtime) -> SourceInterface:
+ return _ComposedSource(runtime=runtime, stream_id=stream_id, spec=spec)
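
A composer entrypoint compatible with the adapter logic above takes inputs= and may also accept aux, driver, and context, which _ComposedSource passes only when the callable supports them. The function below is a hypothetical sketch, not a mapper shipped with the package:

from typing import Any, Dict, Iterator, Optional

def my_composer(
    *,
    inputs: Dict[str, Iterator[Any]],
    aux: Optional[Dict[str, Iterator[Any]]] = None,
    driver: Optional[str] = None,
    context: Any = None,
) -> Iterator[Any]:
    # Iterate the driver stream (the first aligned input by default) and emit records.
    key = driver or next(iter(inputs))
    for record in inputs[key]:
        yield record  # a real composer would derive new fields here
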
datapipeline/services/project_paths.py CHANGED
@@ -1,6 +1,7 @@
  from __future__ import annotations

  from pathlib import Path
+ from typing import Optional

  from datapipeline.utils.load import load_yaml
  from datapipeline.config.project import ProjectConfig
@@ -35,21 +36,19 @@ def sources_dir(project_yaml: Path) -> Path:
  return p


- def build_config_path(project_yaml: Path) -> Path:
- """Return the resolved path to build.yaml declared in project.paths.build."""
+ def tasks_dir(project_yaml: Path) -> Path:
+ """Return the resolved path to the tasks directory (project.paths.tasks)."""

  cfg = read_project(project_yaml)
- build_path = getattr(cfg.paths, "build", None)
- if not build_path:
- raise FileNotFoundError(
- "project.paths.build must point to a build.yaml configuration file."
- )
- p = Path(build_path)
+ tasks_path = getattr(cfg.paths, "tasks", None)
+ if not tasks_path:
+ raise FileNotFoundError("project.paths.tasks must point to a tasks directory.")
+ p = Path(tasks_path)
  if not p.is_absolute():
  p = _project_root(project_yaml) / p
- if not p.exists():
- raise FileNotFoundError(f"build config not found: {p}")
- return p
+ if not p.exists() or not p.is_dir():
+ raise FileNotFoundError(f"tasks directory not found: {p}")
+ return p.resolve()


  def ensure_project_scaffold(project_yaml: Path) -> None:
@@ -64,14 +63,14 @@ def ensure_project_scaffold(project_yaml: Path) -> None:
  project_yaml.parent.mkdir(parents=True, exist_ok=True)
  default = (
  "version: 1\n"
+ "name: default\n"
  "paths:\n"
- " streams: ../../contracts\n"
- " sources: ../../sources\n"
+ " streams: ./contracts\n"
+ " sources: ./sources\n"
  " dataset: dataset.yaml\n"
  " postprocess: postprocess.yaml\n"
- " artifacts: ../../build/datasets/default\n"
- " build: build.yaml\n"
- " run: run.yaml\n"
+ " artifacts: ../artifacts/default\n"
+ " tasks: ./tasks\n"
  "globals:\n"
  " start_time: 2021-01-01T00:00:00Z\n"
  " end_time: 2021-12-31T23:00:00Z\n"
@@ -90,7 +89,35 @@ def ensure_project_scaffold(project_yaml: Path) -> None:
  if not sources.is_absolute():
  sources = _project_root(project_yaml) / sources
  sources.mkdir(parents=True, exist_ok=True)
+
+ tasks = getattr(cfg.paths, "tasks", None)
+ if tasks:
+ tasks_path = Path(tasks)
+ if not tasks_path.is_absolute():
+ tasks_path = _project_root(project_yaml) / tasks_path
+ tasks_path.mkdir(parents=True, exist_ok=True)
  except Exception:
  # If the file is malformed, leave it to callers to report; this helper
  # is best-effort to create a sensible starting point.
  pass
+
+
+ def resolve_project_yaml_path(plugin_root: Path) -> Path:
+ """Return a best-effort project.yaml path for scaffolding.
+
+ Resolution order:
+ 1) <plugin_root>/example/project.yaml
+ 2) <plugin_root>/config/project.yaml
+ 3) <plugin_root>/config/datasets/default/project.yaml
+ 4) Fallback: <plugin_root>/example/project.yaml
+ """
+ candidates = [
+ plugin_root / "example" / "project.yaml",
+ plugin_root / "config" / "project.yaml",
+ plugin_root / "config" / "datasets" / "default" / "project.yaml",
+ ]
+ for candidate in candidates:
+ if candidate.exists():
+ return candidate
+ # Default to the first candidate; callers may scaffold a new project there.
+ return candidates[0]
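
The net effect on project.yaml is that paths.build and paths.run give way to a paths.tasks directory. The scaffold default above corresponds to this mapping (values copied from the string literal, rendered here for readability):

default_paths = {
    "streams": "./contracts",
    "sources": "./sources",
    "dataset": "dataset.yaml",
    "postprocess": "postprocess.yaml",
    "artifacts": "../artifacts/default",
    "tasks": "./tasks",
}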