jerry-thomas 1.0.3__py3-none-any.whl → 2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194)
  1. datapipeline/analysis/vector/collector.py +0 -1
  2. datapipeline/build/tasks/config.py +0 -2
  3. datapipeline/build/tasks/metadata.py +0 -2
  4. datapipeline/build/tasks/scaler.py +0 -2
  5. datapipeline/build/tasks/schema.py +0 -2
  6. datapipeline/build/tasks/utils.py +0 -2
  7. datapipeline/cli/app.py +201 -81
  8. datapipeline/cli/commands/contract.py +145 -283
  9. datapipeline/cli/commands/demo.py +13 -0
  10. datapipeline/cli/commands/domain.py +4 -4
  11. datapipeline/cli/commands/dto.py +11 -0
  12. datapipeline/cli/commands/filter.py +2 -2
  13. datapipeline/cli/commands/inspect.py +0 -68
  14. datapipeline/cli/commands/list_.py +30 -13
  15. datapipeline/cli/commands/loader.py +11 -0
  16. datapipeline/cli/commands/mapper.py +82 -0
  17. datapipeline/cli/commands/parser.py +45 -0
  18. datapipeline/cli/commands/run_config.py +1 -3
  19. datapipeline/cli/commands/serve_pipeline.py +5 -7
  20. datapipeline/cli/commands/source.py +106 -18
  21. datapipeline/cli/commands/stream.py +292 -0
  22. datapipeline/cli/visuals/common.py +0 -2
  23. datapipeline/cli/visuals/sections.py +0 -2
  24. datapipeline/cli/workspace_utils.py +0 -3
  25. datapipeline/config/context.py +0 -2
  26. datapipeline/config/dataset/feature.py +1 -0
  27. datapipeline/config/metadata.py +0 -2
  28. datapipeline/config/project.py +0 -2
  29. datapipeline/config/resolution.py +10 -2
  30. datapipeline/config/tasks.py +9 -9
  31. datapipeline/domain/feature.py +3 -0
  32. datapipeline/domain/record.py +7 -7
  33. datapipeline/domain/sample.py +0 -2
  34. datapipeline/domain/vector.py +6 -8
  35. datapipeline/integrations/ml/adapter.py +0 -2
  36. datapipeline/integrations/ml/pandas_support.py +0 -2
  37. datapipeline/integrations/ml/rows.py +0 -2
  38. datapipeline/integrations/ml/torch_support.py +0 -2
  39. datapipeline/io/output.py +0 -2
  40. datapipeline/io/serializers.py +26 -16
  41. datapipeline/mappers/synthetic/time.py +9 -2
  42. datapipeline/pipeline/artifacts.py +3 -5
  43. datapipeline/pipeline/observability.py +0 -2
  44. datapipeline/pipeline/pipelines.py +118 -34
  45. datapipeline/pipeline/stages.py +54 -18
  46. datapipeline/pipeline/utils/spool_cache.py +142 -0
  47. datapipeline/pipeline/utils/transform_utils.py +27 -2
  48. datapipeline/services/artifacts.py +1 -4
  49. datapipeline/services/constants.py +1 -0
  50. datapipeline/services/factories.py +4 -6
  51. datapipeline/services/paths.py +10 -1
  52. datapipeline/services/project_paths.py +0 -2
  53. datapipeline/services/runs.py +0 -2
  54. datapipeline/services/scaffold/contract_yaml.py +76 -0
  55. datapipeline/services/scaffold/demo.py +141 -0
  56. datapipeline/services/scaffold/discovery.py +115 -0
  57. datapipeline/services/scaffold/domain.py +21 -13
  58. datapipeline/services/scaffold/dto.py +31 -0
  59. datapipeline/services/scaffold/filter.py +2 -1
  60. datapipeline/services/scaffold/layout.py +96 -0
  61. datapipeline/services/scaffold/loader.py +61 -0
  62. datapipeline/services/scaffold/mapper.py +116 -0
  63. datapipeline/services/scaffold/parser.py +56 -0
  64. datapipeline/services/scaffold/plugin.py +14 -2
  65. datapipeline/services/scaffold/source_yaml.py +91 -0
  66. datapipeline/services/scaffold/stream_plan.py +129 -0
  67. datapipeline/services/scaffold/utils.py +187 -0
  68. datapipeline/sources/data_loader.py +0 -2
  69. datapipeline/sources/decoders.py +49 -8
  70. datapipeline/sources/factory.py +9 -6
  71. datapipeline/sources/foreach.py +18 -3
  72. datapipeline/sources/synthetic/time/parser.py +1 -1
  73. datapipeline/sources/transports.py +10 -4
  74. datapipeline/templates/demo_skeleton/demo/contracts/equity.ohlcv.yaml +33 -0
  75. datapipeline/templates/demo_skeleton/demo/contracts/time.ticks.hour_sin.yaml +22 -0
  76. datapipeline/templates/demo_skeleton/demo/contracts/time.ticks.linear.yaml +22 -0
  77. datapipeline/templates/demo_skeleton/demo/data/APPL.jsonl +19 -0
  78. datapipeline/templates/demo_skeleton/demo/data/MSFT.jsonl +19 -0
  79. datapipeline/templates/demo_skeleton/demo/dataset.yaml +19 -0
  80. datapipeline/templates/demo_skeleton/demo/postprocess.yaml +19 -0
  81. datapipeline/templates/demo_skeleton/demo/project.yaml +19 -0
  82. datapipeline/templates/demo_skeleton/demo/sources/sandbox.ohlcv.yaml +17 -0
  83. datapipeline/templates/{plugin_skeleton/example → demo_skeleton/demo}/sources/synthetic.ticks.yaml +1 -1
  84. datapipeline/templates/demo_skeleton/demo/tasks/metadata.yaml +2 -0
  85. datapipeline/templates/demo_skeleton/demo/tasks/scaler.yaml +3 -0
  86. datapipeline/templates/demo_skeleton/demo/tasks/schema.yaml +2 -0
  87. datapipeline/templates/demo_skeleton/demo/tasks/serve.test.yaml +4 -0
  88. datapipeline/templates/demo_skeleton/demo/tasks/serve.train.yaml +4 -0
  89. datapipeline/templates/demo_skeleton/demo/tasks/serve.val.yaml +4 -0
  90. datapipeline/templates/demo_skeleton/scripts/run_dataframe.py +20 -0
  91. datapipeline/templates/demo_skeleton/scripts/run_torch.py +23 -0
  92. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/__init__.py +0 -0
  93. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/domains/equity/__init__.py +0 -0
  94. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/domains/equity/model.py +18 -0
  95. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/dtos/__init__.py +0 -0
  96. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/dtos/sandbox_ohlcv_dto.py +14 -0
  97. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/mappers/__init__.py +0 -0
  98. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/mappers/map_sandbox_ohlcv_dto_to_equity.py +26 -0
  99. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/parsers/__init__.py +0 -0
  100. datapipeline/templates/demo_skeleton/src/{{PACKAGE_NAME}}/parsers/sandbox_ohlcv_dto_parser.py +46 -0
  101. datapipeline/templates/plugin_skeleton/README.md +57 -136
  102. datapipeline/templates/plugin_skeleton/jerry.yaml +12 -24
  103. datapipeline/templates/plugin_skeleton/reference/jerry.yaml +28 -0
  104. datapipeline/templates/plugin_skeleton/reference/reference/contracts/composed.reference.yaml +29 -0
  105. datapipeline/templates/plugin_skeleton/reference/reference/contracts/ingest.reference.yaml +31 -0
  106. datapipeline/templates/plugin_skeleton/reference/reference/contracts/overview.reference.yaml +34 -0
  107. datapipeline/templates/plugin_skeleton/reference/reference/dataset.yaml +29 -0
  108. datapipeline/templates/plugin_skeleton/reference/reference/postprocess.yaml +25 -0
  109. datapipeline/templates/plugin_skeleton/reference/reference/project.yaml +32 -0
  110. datapipeline/templates/plugin_skeleton/reference/reference/sources/foreach.http.reference.yaml +24 -0
  111. datapipeline/templates/plugin_skeleton/reference/reference/sources/foreach.reference.yaml +21 -0
  112. datapipeline/templates/plugin_skeleton/reference/reference/sources/fs.reference.yaml +16 -0
  113. datapipeline/templates/plugin_skeleton/reference/reference/sources/http.reference.yaml +17 -0
  114. datapipeline/templates/plugin_skeleton/reference/reference/sources/overview.reference.yaml +18 -0
  115. datapipeline/templates/plugin_skeleton/reference/reference/sources/synthetic.reference.yaml +15 -0
  116. datapipeline/templates/plugin_skeleton/reference/reference/tasks/metadata.reference.yaml +11 -0
  117. datapipeline/templates/plugin_skeleton/reference/reference/tasks/scaler.reference.yaml +10 -0
  118. datapipeline/templates/plugin_skeleton/reference/reference/tasks/schema.reference.yaml +10 -0
  119. datapipeline/templates/plugin_skeleton/reference/reference/tasks/serve.reference.yaml +28 -0
  120. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/domains/__init__.py +2 -0
  121. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/dtos/__init__.py +0 -0
  122. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/loaders/__init__.py +0 -0
  123. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/mappers/__init__.py +1 -0
  124. datapipeline/templates/plugin_skeleton/src/{{PACKAGE_NAME}}/parsers/__init__.py +0 -0
  125. datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +12 -11
  126. datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +4 -13
  127. datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +9 -11
  128. datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +1 -2
  129. datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +1 -7
  130. datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +1 -1
  131. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +1 -1
  132. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +1 -25
  133. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +1 -1
  134. datapipeline/templates/plugin_skeleton/your-interim-data-builder/dataset.yaml +9 -0
  135. datapipeline/templates/plugin_skeleton/your-interim-data-builder/postprocess.yaml +1 -0
  136. datapipeline/templates/plugin_skeleton/your-interim-data-builder/project.yaml +15 -0
  137. datapipeline/templates/plugin_skeleton/your-interim-data-builder/tasks/serve.all.yaml +8 -0
  138. datapipeline/templates/stubs/contracts/composed.yaml.j2 +10 -0
  139. datapipeline/templates/stubs/contracts/ingest.yaml.j2 +25 -0
  140. datapipeline/templates/stubs/dto.py.j2 +2 -2
  141. datapipeline/templates/stubs/filter.py.j2 +1 -1
  142. datapipeline/templates/stubs/loaders/basic.py.j2 +11 -0
  143. datapipeline/templates/stubs/mappers/composed.py.j2 +13 -0
  144. datapipeline/templates/stubs/mappers/ingest.py.j2 +20 -0
  145. datapipeline/templates/stubs/parser.py.j2 +5 -1
  146. datapipeline/templates/stubs/record.py.j2 +1 -1
  147. datapipeline/templates/stubs/source.yaml.j2 +1 -1
  148. datapipeline/transforms/debug/identity.py +34 -16
  149. datapipeline/transforms/debug/lint.py +14 -11
  150. datapipeline/transforms/feature/scaler.py +5 -12
  151. datapipeline/transforms/filter.py +73 -17
  152. datapipeline/transforms/interfaces.py +58 -0
  153. datapipeline/transforms/record/floor_time.py +10 -7
  154. datapipeline/transforms/record/lag.py +8 -10
  155. datapipeline/transforms/sequence.py +2 -3
  156. datapipeline/transforms/stream/dedupe.py +5 -7
  157. datapipeline/transforms/stream/ensure_ticks.py +39 -24
  158. datapipeline/transforms/stream/fill.py +34 -25
  159. datapipeline/transforms/stream/filter.py +25 -0
  160. datapipeline/transforms/stream/floor_time.py +16 -0
  161. datapipeline/transforms/stream/granularity.py +52 -30
  162. datapipeline/transforms/stream/lag.py +17 -0
  163. datapipeline/transforms/stream/rolling.py +72 -0
  164. datapipeline/transforms/utils.py +42 -10
  165. datapipeline/transforms/vector/drop/horizontal.py +0 -3
  166. datapipeline/transforms/vector/drop/orchestrator.py +0 -3
  167. datapipeline/transforms/vector/drop/vertical.py +0 -2
  168. datapipeline/transforms/vector/ensure_schema.py +0 -2
  169. datapipeline/utils/paths.py +0 -2
  170. datapipeline/utils/placeholders.py +0 -2
  171. datapipeline/utils/rich_compat.py +0 -3
  172. datapipeline/utils/window.py +0 -2
  173. jerry_thomas-2.0.1.dist-info/METADATA +269 -0
  174. jerry_thomas-2.0.1.dist-info/RECORD +264 -0
  175. {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.1.dist-info}/WHEEL +1 -1
  176. {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.1.dist-info}/entry_points.txt +7 -3
  177. datapipeline/services/scaffold/mappers.py +0 -55
  178. datapipeline/services/scaffold/source.py +0 -191
  179. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +0 -31
  180. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +0 -30
  181. datapipeline/templates/plugin_skeleton/example/dataset.yaml +0 -18
  182. datapipeline/templates/plugin_skeleton/example/postprocess.yaml +0 -29
  183. datapipeline/templates/plugin_skeleton/example/project.yaml +0 -23
  184. datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +0 -3
  185. datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +0 -9
  186. datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +0 -2
  187. datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +0 -4
  188. datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +0 -28
  189. datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +0 -4
  190. datapipeline/templates/stubs/mapper.py.j2 +0 -22
  191. jerry_thomas-1.0.3.dist-info/METADATA +0 -827
  192. jerry_thomas-1.0.3.dist-info/RECORD +0 -198
  193. {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.1.dist-info}/licenses/LICENSE +0 -0
  194. {jerry_thomas-1.0.3.dist-info → jerry_thomas-2.0.1.dist-info}/top_level.txt +0 -0

datapipeline/pipeline/pipelines.py
@@ -1,9 +1,9 @@
 import heapq
+from collections import defaultdict
 from collections.abc import Iterator, Sequence
 from typing import Any
 from itertools import tee
 
-from datapipeline.domain.sample import Sample
 from datapipeline.domain.vector import Vector
 from datapipeline.pipeline.utils.keygen import group_key_for
 from datapipeline.pipeline.utils.memory_sort import batch_sort
@@ -12,8 +12,9 @@ from datapipeline.pipeline.stages import (
     open_source_stream,
     build_record_stream,
     apply_record_operations,
+    order_record_stream,
     build_feature_stream,
-    regularize_feature_stream,
+    apply_stream_operations,
     apply_feature_transforms,
     vector_assemble_stage,
     sample_assemble_stage,
@@ -21,15 +22,61 @@
     window_keys,
 )
 from datapipeline.pipeline.context import PipelineContext
+from datapipeline.pipeline.utils.spool_cache import SpoolCache
 
 
-def build_feature_pipeline(
+def _time_then_id(item: Any):
+    rec = getattr(item, "record", None)
+    if rec is not None:
+        t = getattr(rec, "time", None)
+    else:
+        recs = getattr(item, "records", None)
+        t = getattr(recs[0], "time", None) if recs else None
+    return (t, getattr(item, "id", None))
+
+
+def _build_feature_from_records(
     context: PipelineContext,
+    records: Iterator[Any],
     cfg: FeatureRecordConfig,
     stage: int | None = None,
+    batch_size: int | None = None,
+    partition_by: str | None = None,
 ) -> Iterator[Any]:
     runtime = context.runtime
-    record_stream_id = cfg.record_stream
+
+    if partition_by is None:
+        partition_by = runtime.registries.partition_by.get(cfg.record_stream)
+
+    features = build_feature_stream(
+        records,
+        base_feature_id=cfg.id,
+        field=cfg.field,
+        partition_by=partition_by,
+    )
+    if stage == 5:
+        return features
+
+    transformed = apply_feature_transforms(
+        context, features, cfg.scale, cfg.sequence)
+    if stage == 6:
+        return transformed
+
+    if batch_size is None:
+        batch_size = runtime.registries.sort_batch_size.get(cfg.record_stream)
+    sorted_for_grouping = batch_sort(
+        transformed, batch_size=batch_size, key=_time_then_id
+    )
+    return sorted_for_grouping
+
+
+def build_record_pipeline(
+    context: PipelineContext,
+    record_stream_id: str,
+    stage: int | None = None,
+) -> Iterator[Any]:
+    """Build a canonical record stream through stream transforms."""
+    runtime = context.runtime
 
     dtos = open_source_stream(context, record_stream_id)
     if stage == 0:
@@ -43,35 +90,41 @@ def build_feature_pipeline(
     if stage == 2:
         return records
 
-    partition_by = runtime.registries.partition_by.get(record_stream_id)
-    features = build_feature_stream(records, cfg.id, partition_by)
+    batch_size = runtime.registries.sort_batch_size.get(record_stream_id)
+    records = order_record_stream(
+        context, records, record_stream_id, batch_size)
     if stage == 3:
-        return features
+        return records
 
-    batch_size = runtime.registries.sort_batch_size.get(record_stream_id)
-    regularized = regularize_feature_stream(
-        context, features, record_stream_id, batch_size)
+    records = apply_stream_operations(context, records, record_stream_id)
     if stage == 4:
-        return regularized
+        return records
 
-    transformed = apply_feature_transforms(
-        context, regularized, cfg.scale, cfg.sequence)
-    if stage == 5:
-        return transformed
+    return records
 
-    def _time_then_id(item: Any):
-        rec = getattr(item, "record", None)
-        if rec is not None:
-            t = getattr(rec, "time", None)
-        else:
-            recs = getattr(item, "records", None)
-            t = getattr(recs[0], "time", None) if recs else None
-        return (t, getattr(item, "id", None))
 
-    sorted_for_grouping = batch_sort(
-        transformed, batch_size=batch_size, key=_time_then_id
+def build_feature_pipeline(
+    context: PipelineContext,
+    cfg: FeatureRecordConfig,
+    stage: int | None = None,
+) -> Iterator[Any]:
+    runtime = context.runtime
+    record_stream_id = cfg.record_stream
+
+    records = build_record_pipeline(context, record_stream_id, stage=stage)
+    if stage is not None and stage <= 4:
+        return records
+
+    batch_size = runtime.registries.sort_batch_size.get(record_stream_id)
+    partition_by = runtime.registries.partition_by.get(record_stream_id)
+    return _build_feature_from_records(
+        context,
+        records,
+        cfg,
+        stage=stage,
+        batch_size=batch_size,
+        partition_by=partition_by,
     )
-    return sorted_for_grouping
 
 
 def build_vector_pipeline(
@@ -130,14 +183,45 @@ def _assemble_vectors(
 ) -> Iterator[tuple[tuple, Vector]]:
     if not configs:
         return iter(())
-    streams = [
-        build_feature_pipeline(
-            context,
-            cfg,
-        )
-        for cfg in configs
-    ]
+
+    runtime = context.runtime
+    grouped: dict[str, list[FeatureRecordConfig]] = defaultdict(list)
+    for cfg in configs:
+        grouped[cfg.record_stream].append(cfg)
+
+    streams: list[Iterator[Any]] = []
+    caches: list[SpoolCache] = []
+    for record_stream_id, cfgs in grouped.items():
+        records = build_record_pipeline(context, record_stream_id, stage=4)
+        if len(cfgs) == 1:
+            record_iters = (records,)
+        else:
+            cache = SpoolCache(records, name=record_stream_id)
+            caches.append(cache)
+            record_iters = tuple(cache.reader() for _ in cfgs)
+        batch_size = runtime.registries.sort_batch_size.get(record_stream_id)
+        partition_by = runtime.registries.partition_by.get(record_stream_id)
+
+        for cfg, rec_iter in zip(cfgs, record_iters):
+            streams.append(
+                _build_feature_from_records(
+                    context,
+                    rec_iter,
+                    cfg,
+                    batch_size=batch_size,
+                    partition_by=partition_by,
+                )
+            )
+
     merged = heapq.merge(
         *streams, key=lambda fr: group_key_for(fr, group_by_cadence)
    )
-    return vector_assemble_stage(merged, group_by_cadence)
+
+    def _with_cleanup() -> Iterator[tuple[tuple, Vector]]:
+        try:
+            yield from vector_assemble_stage(merged, group_by_cadence)
+        finally:
+            for cache in caches:
+                cache.close()
+
+    return _with_cleanup()
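
For orientation: the rewritten `_assemble_vectors` above groups feature configs by record stream, builds each record pipeline once (spooling it through `SpoolCache` when several features share it), and then merges the per-feature streams with `heapq.merge` before grouping them into vectors. The stand-alone sketch below illustrates only that merge-then-group idea with plain tuples; the sample data and the `group_key` helper are hypothetical stand-ins, not package API.

import heapq
from itertools import groupby

# Two per-feature streams, already sorted by time (as batch_sort guarantees upstream).
# Tuples of (time, feature_id, value) stand in for FeatureRecord objects.
feature_a = [(1, "a", 10.0), (2, "a", 11.0), (3, "a", 12.0)]
feature_b = [(1, "b", 0.5), (3, "b", 0.7)]

def group_key(item):
    # Group by time only, mirroring group_key_for(fr, group_by_cadence).
    return item[0]

merged = heapq.merge(feature_a, feature_b, key=group_key)
for key, group in groupby(merged, key=group_key):
    vector = {feature_id: value for _, feature_id, value in group}
    print(key, vector)
# 1 {'a': 10.0, 'b': 0.5}
# 2 {'a': 11.0}
# 3 {'a': 12.0, 'b': 0.7}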

datapipeline/pipeline/stages.py
@@ -20,11 +20,23 @@ from datapipeline.sources.models.source import Source
 from datapipeline.transforms.vector import VectorEnsureSchemaTransform
 from datapipeline.config.dataset.normalize import floor_time_to_bucket
 from datapipeline.utils.time import parse_timecode
+from datapipeline.transforms.utils import get_field, partition_key
 
 
 def open_source_stream(context: PipelineContext, stream_alias: str) -> Source:
     runtime = context.runtime
-    return runtime.registries.stream_sources.get(stream_alias).stream()
+    registry = runtime.registries.stream_sources
+    try:
+        source = registry.get(stream_alias)
+    except KeyError as exc:
+        available = sorted(registry.keys())
+        available_text = ", ".join(available) if available else "(none)"
+        raise KeyError(
+            "Unknown record_stream "
+            f"'{stream_alias}'. Check dataset.yaml and contracts/ ids. "
+            f"Available streams: {available_text}"
+        ) from exc
+    return source.stream()
 
 
 def build_record_stream(
@@ -49,45 +61,66 @@ def apply_record_operations(
     return records
 
 
+def _record_has_field(record: Any, field: str) -> bool:
+    if isinstance(record, dict):
+        return field in record
+    return hasattr(record, field)
+
+
 def build_feature_stream(
     record_stream: Iterable[TemporalRecord],
     base_feature_id: str,
+    field: str,
     partition_by: Any | None = None,
 ) -> Iterator[FeatureRecord]:
-
     keygen = FeatureIdGenerator(partition_by)
 
     for rec in record_stream:
+        if not _record_has_field(rec, field):
+            raise KeyError(
+                f"Record field '{field}' not found on {type(rec).__name__}")
         yield FeatureRecord(
             record=rec,
             id=keygen.generate(base_feature_id, rec),
+            value=get_field(rec, field),
        )
 
 
-def regularize_feature_stream(
+def order_record_stream(
     context: PipelineContext,
-    feature_stream: Iterable[FeatureRecord],
+    record_stream: Iterable[TemporalRecord],
     stream_id: str,
     batch_size: int,
-) -> Iterator[FeatureRecord]:
-    """Apply feature transforms defined in contract policies in order."""
-    # Sort by (id, time) to satisfy stream transforms (ensure_cadence/fill)
-    sorted = batch_sort(
-        feature_stream,
+) -> Iterator[TemporalRecord]:
+    """Return records sorted by (partition_key, time)."""
+    partition_by = context.runtime.registries.partition_by.get(stream_id)
+    return batch_sort(
+        record_stream,
         batch_size=batch_size,
-        key=lambda fr: (fr.id, fr.record.time),
+        key=lambda rec: (partition_key(rec, partition_by), rec.time),
     )
+
+
+def apply_stream_operations(
+    context: PipelineContext,
+    record_stream: Iterable[TemporalRecord],
+    stream_id: str,
+) -> Iterator[TemporalRecord]:
+    """Apply stream/debug transforms (expects input sorted by partition_key + time)."""
+    partition_by = context.runtime.registries.partition_by.get(stream_id)
     transformed = apply_transforms(
-        sorted,
+        record_stream,
         STREAM_TRANFORMS_EP,
         context.runtime.registries.stream_operations.get(stream_id),
         context,
+        extra_kwargs={"partition_by": partition_by},
     )
     transformed = apply_transforms(
         transformed,
         DEBUG_TRANSFORMS_EP,
         context.runtime.registries.debug_operations.get(stream_id),
         context,
+        extra_kwargs={"partition_by": partition_by},
     )
     return transformed
 
@@ -135,10 +168,9 @@ def vector_assemble_stage(
         feature_map = defaultdict(list)
         for fr in group:
             if isinstance(fr, FeatureRecordSequence):
-                records = fr.records
+                feature_map[fr.id].extend(fr.values)
             else:
-                records = [fr.record]
-            feature_map[fr.id].extend(records)
+                feature_map[fr.id].append(fr.value)
         vector = vectorize_record_group(feature_map)
         yield group_key, vector
 
@@ -242,16 +274,19 @@ def _apply_vector_schema(
 
     if not feature_entries:
         if context.schema_required:
-            raise RuntimeError("Schema missing for payload 'features'. Run `jerry build` to materialize schema.json.")
+            raise RuntimeError(
+                "Schema missing for payload 'features'. Run `jerry build` to materialize schema.json.")
         feature_stream = stream
     else:
-        feature_schema = VectorEnsureSchemaTransform(on_missing="fill", on_extra="drop")
+        feature_schema = VectorEnsureSchemaTransform(
+            on_missing="fill", on_extra="drop")
         feature_schema.bind_context(context)
         feature_stream = feature_schema(stream)
 
     def _apply_targets(upstream: Iterator[Sample]) -> Iterator[Sample]:
         if target_entries:
-            target_schema = VectorEnsureSchemaTransform(payload="targets", on_missing="fill", on_extra="drop")
+            target_schema = VectorEnsureSchemaTransform(
+                payload="targets", on_missing="fill", on_extra="drop")
             target_schema.bind_context(context)
             return target_schema(upstream)
         if not context.schema_required:
@@ -264,6 +299,7 @@ def _apply_vector_schema(
            return iter(())
        if first.targets is None:
            return chain([first], iterator)
-        raise RuntimeError("Schema missing for payload 'targets'. Run `jerry build` to materialize schema.json.")
+        raise RuntimeError(
+            "Schema missing for payload 'targets'. Run `jerry build` to materialize schema.json.")
 
     return _apply_targets(feature_stream)
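
The new `order_record_stream` sorts records by `(partition_key, time)` before `apply_stream_operations` runs, so per-partition stream transforms (dedupe, fill, ensure_ticks) see each partition contiguously and in time order. A minimal illustration of that sort key, with plain dicts standing in for `TemporalRecord` (the field names are hypothetical):

records = [
    {"symbol": "MSFT", "time": 2, "close": 310.0},
    {"symbol": "AAPL", "time": 1, "close": 190.0},
    {"symbol": "MSFT", "time": 1, "close": 305.0},
    {"symbol": "AAPL", "time": 2, "close": 191.0},
]

# Mirrors key=lambda rec: (partition_key(rec, partition_by), rec.time) above.
ordered = sorted(records, key=lambda rec: (rec["symbol"], rec["time"]))
for rec in ordered:
    print(rec["symbol"], rec["time"], rec["close"])
# AAPL 1 190.0
# AAPL 2 191.0
# MSFT 1 305.0
# MSFT 2 310.0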

datapipeline/pipeline/utils/spool_cache.py (new file)
@@ -0,0 +1,142 @@
+import pickle
+import tempfile
+import weakref
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Iterator, Any
+
+
+_LEN_BYTES = 8
+
+
+def _encode_len(size: int) -> bytes:
+    return int(size).to_bytes(_LEN_BYTES, "little", signed=False)
+
+
+def _decode_len(raw: bytes) -> int:
+    return int.from_bytes(raw, "little", signed=False)
+
+
+@dataclass
+class _SpoolState:
+    writer: Any
+    path: Path
+    offsets: list[int]
+    source: Iterator[Any]
+    done: bool = False
+
+    def close(self) -> None:
+        try:
+            self.writer.close()
+        except Exception:
+            pass
+
+
+class SpoolCache:
+    """Disk-backed cache for iterators with multiple sequential readers."""
+
+    def __init__(self, source: Iterator[Any], *, name: str | None = None) -> None:
+        tmp = tempfile.NamedTemporaryFile(
+            prefix=f"dp-spool-{name or 'stream'}-",
+            suffix=".pkl",
+            delete=False,
+        )
+        path = Path(tmp.name)
+        self._state = _SpoolState(
+            writer=tmp,
+            path=path,
+            offsets=[],
+            source=iter(source),
+        )
+        self._finalizer = weakref.finalize(self, _cleanup, path, tmp)
+
+    @property
+    def path(self) -> Path:
+        return self._state.path
+
+    def close(self) -> None:
+        """Close writer and remove the spool file."""
+        if self._finalizer.alive:
+            self._finalizer()
+
+    def __enter__(self) -> "SpoolCache":
+        return self
+
+    def __exit__(self, exc_type, exc, tb) -> None:
+        self.close()
+
+    def reader(self) -> Iterator[Any]:
+        return _SpoolReader(self)
+
+    def _append_next(self) -> bool:
+        if self._state.done:
+            return False
+        try:
+            item = next(self._state.source)
+        except StopIteration:
+            self._state.done = True
+            self._state.writer.flush()
+            return False
+        try:
+            data = pickle.dumps(item, protocol=pickle.HIGHEST_PROTOCOL)
+        except Exception as exc:  # pragma: no cover - defensive
+            raise TypeError(
+                "SpoolCache requires picklable records for multi-feature fanout."
+            ) from exc
+        offset = self._state.writer.tell()
+        self._state.writer.write(_encode_len(len(data)))
+        self._state.writer.write(data)
+        self._state.writer.flush()
+        self._state.offsets.append(offset)
+        return True
+
+    def _ensure_index(self, index: int) -> None:
+        while len(self._state.offsets) <= index:
+            if not self._append_next():
+                break
+
+
+class _SpoolReader:
+    def __init__(self, cache: SpoolCache) -> None:
+        self._cache = cache
+        self._index = 0
+        self._fh = open(cache.path, "rb")
+
+    def __iter__(self) -> "_SpoolReader":
+        return self
+
+    def __next__(self) -> Any:
+        self._cache._ensure_index(self._index)
+        if self._index >= len(self._cache._state.offsets):
+            self._close()
+            raise StopIteration
+        offset = self._cache._state.offsets[self._index]
+        self._index += 1
+        self._fh.seek(offset)
+        raw = self._fh.read(_LEN_BYTES)
+        if not raw:
+            self._close()
+            raise StopIteration
+        size = _decode_len(raw)
+        payload = self._fh.read(size)
+        return pickle.loads(payload)
+
+    def _close(self) -> None:
+        try:
+            self._fh.close()
+        except Exception:
+            pass
+
+    def __del__(self) -> None:
+        self._close()
+
+
+def _cleanup(path: Path, writer: Any) -> None:
+    try:
+        writer.close()
+    except Exception:
+        pass
+    try:
+        path.unlink(missing_ok=True)
+    except Exception:
+        pass
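
A usage sketch for the new `SpoolCache`, assuming jerry-thomas 2.0.1 is installed and the module path shown above: a single upstream iterator is spooled to a temporary file so several readers can replay it sequentially.

from datapipeline.pipeline.utils.spool_cache import SpoolCache

upstream = iter(range(5))      # any iterator of picklable items
with SpoolCache(upstream, name="demo") as cache:
    first = cache.reader()
    second = cache.reader()
    print(list(first))         # [0, 1, 2, 3, 4] -- pulled from upstream and spooled to disk
    print(list(second))        # [0, 1, 2, 3, 4] -- replayed from the spool file
# Leaving the `with` block closes the writer and removes the temporary file.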

datapipeline/pipeline/utils/transform_utils.py
@@ -41,17 +41,35 @@ def _split_params(params: Any) -> Tuple[Tuple[Any, ...], dict[str, Any]]:
     return (params,), {}
 
 
+def _merge_extra_kwargs(
+    fn: Callable[..., Any],
+    kwargs: dict[str, Any],
+    extra_kwargs: Mapping[str, Any] | None,
+) -> dict[str, Any]:
+    if not extra_kwargs:
+        return kwargs
+    merged = dict(kwargs)
+    for key, value in extra_kwargs.items():
+        if key in merged:
+            continue
+        if _supports_parameter(fn, key):
+            merged[key] = value
+    return merged
+
+
 def _call_with_params(
     fn: Callable,
     stream: Iterator[Any],
     params: Any,
     context: Optional[PipelineContext],
+    extra_kwargs: Mapping[str, Any] | None = None,
 ) -> Iterator[Any]:
     """Invoke an entry-point callable with optional params semantics."""
 
     args, kwargs = _split_params(params)
     if context and _supports_parameter(fn, "context") and "context" not in kwargs:
         kwargs["context"] = context
+    kwargs = _merge_extra_kwargs(fn, kwargs, extra_kwargs)
     return fn(stream, *args, **kwargs)
 
 
@@ -59,12 +77,14 @@ def _instantiate_entry_point(
     cls: Callable[..., Any],
     params: Any,
     context: Optional[PipelineContext],
+    extra_kwargs: Mapping[str, Any] | None = None,
 ) -> Any:
     """Instantiate a transform class with parameters from the config."""
 
     args, kwargs = _split_params(params)
     if context and _supports_parameter(cls.__init__, "context") and "context" not in kwargs:
         kwargs["context"] = context
+    kwargs = _merge_extra_kwargs(cls.__init__, kwargs, extra_kwargs)
     return cls(*args, **kwargs)
 
 
@@ -83,6 +103,7 @@ def apply_transforms(
     context: Optional[PipelineContext] = None,
     observer: Callable[[TransformEvent], None] | None = None,
     observer_registry: ObserverRegistry | None = None,
+    extra_kwargs: Mapping[str, Any] | None = None,
 ) -> Iterator[Any]:
     """Instantiate and apply configured transforms in order."""
 
@@ -97,7 +118,9 @@
         name, params = _extract_single_pair(transform, "Transform")
         ep = load_ep(group=group, name=name)
         if isclass(ep):
-            inst = _instantiate_entry_point(ep, params, context)
+            inst = _instantiate_entry_point(
+                ep, params, context, extra_kwargs=extra_kwargs
+            )
             _bind_context(inst, context)
             eff_observer = observer
             if eff_observer is None and registry:
@@ -107,7 +130,9 @@
             _attach_observer(inst, eff_observer)
             stream = inst(stream)
         else:
-            stream = _call_with_params(ep, stream, params, context)
+            stream = _call_with_params(
+                ep, stream, params, context, extra_kwargs=extra_kwargs
+            )
     return stream
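
The `extra_kwargs` plumbing above injects values such as `partition_by` only when the transform actually accepts that parameter and the configuration has not already set it. A stand-alone sketch of that rule, checking the signature with `inspect` rather than the project's internal `_supports_parameter` helper; the `fill` and `dedupe` functions are hypothetical examples.

import inspect
from collections.abc import Mapping
from typing import Any, Callable

def merge_extra_kwargs(fn: Callable[..., Any],
                       kwargs: dict[str, Any],
                       extra_kwargs: Mapping[str, Any] | None) -> dict[str, Any]:
    if not extra_kwargs:
        return kwargs
    accepted = inspect.signature(fn).parameters
    merged = dict(kwargs)
    for key, value in extra_kwargs.items():
        # Never override an explicitly configured value; only add what fn accepts.
        if key not in merged and key in accepted:
            merged[key] = value
    return merged

def fill(stream, value=0.0, partition_by=None): ...
def dedupe(stream): ...

print(merge_extra_kwargs(fill, {"value": 1.0}, {"partition_by": "symbol"}))
# {'value': 1.0, 'partition_by': 'symbol'}
print(merge_extra_kwargs(dedupe, {}, {"partition_by": "symbol"}))
# {} -- dedupe does not accept partition_by, so nothing is injected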
@@ -1,5 +1,3 @@
1
- from __future__ import annotations
2
-
3
1
  from dataclasses import dataclass
4
2
  import json
5
3
  from pathlib import Path
@@ -80,8 +78,7 @@ class ArtifactManager:
80
78
  except FileNotFoundError as exc:
81
79
  message = (
82
80
  f"Artifact file not found: {path}. "
83
- "Run `jerry build --project <project.yaml>` (preferred) or "
84
- "`jerry inspect expected --project <project.yaml>` to regenerate it."
81
+ "Run `jerry build --project <project.yaml>` to regenerate it."
85
82
  )
86
83
  raise RuntimeError(message) from exc
87
84
 

datapipeline/services/constants.py
@@ -12,6 +12,7 @@ LOADERS_GROUP = "loaders"
 MAPPERS_GROUP = "mappers"
 FILTERS_GROUP = "filters"
 DEFAULT_IO_LOADER_EP = "core.io"
+DEFAULT_SYNTHETIC_LOADER_EP = "core.synthetic.ticks"
 
 # POSTPROCESS_GLOBAL_KEY = "__global__"
 POSTPROCESS_TRANSFORMS = "transforms"

datapipeline/services/factories.py
@@ -6,8 +6,7 @@ from datapipeline.mappers.noop import identity
 from datapipeline.utils.placeholders import normalize_args
 from datapipeline.sources.models.base import SourceInterface
 from datapipeline.pipeline.context import PipelineContext
-from datapipeline.config.dataset.feature import FeatureRecordConfig
-from datapipeline.pipeline.pipelines import build_feature_pipeline
+from datapipeline.pipeline.pipelines import build_record_pipeline
 from datapipeline.pipeline.utils.transform_utils import _supports_parameter
 from inspect import isclass
 from typing import Iterator, Any, Optional
@@ -52,7 +51,7 @@ class _ComposedSource(SourceInterface):
 
         # Build aligned/aux iterators (unwrap FeatureRecord -> record for aligned)
         aligned_iters: dict[str, Iterator[Any]] = {
-            k: (fr.record for fr in v["iter"])  # stage>=3 yields FeatureRecord
+            k: (getattr(item, "record", item) for item in v["iter"])
            for k, v in aligned.items()
         }
         aux_iters: dict[str, Iterator[Any]] = {
@@ -111,7 +110,7 @@ class _ComposedSource(SourceInterface):
         """Parse and resolve composed inputs into iterators.
 
         Grammar: "[alias=]stream_id" only. All inputs are built to stage 4
-        and are alignable (FeatureRecord -> domain record unwrapped).
+        and are alignable (domain records with stream transforms applied).
         """
         runtime = context.runtime
         known_streams = set(runtime.registries.stream_sources.keys())
@@ -123,8 +122,7 @@ class _ComposedSource(SourceInterface):
                 raise ValueError(
                     f"Unknown input stream '{ref}'. Known streams: {sorted(known_streams)}"
                 )
-            cfg = FeatureRecordConfig(record_stream=ref, id=alias)
-            it = build_feature_pipeline(context, cfg, stage=4)
+            it = build_record_pipeline(context, ref, stage=4)
             out[alias] = {"iter": it, "aligned": True}
 
         return out

datapipeline/services/paths.py
@@ -8,7 +8,16 @@ def pkg_root(start: Optional[Path] = None) -> tuple[Path, str, Path]:
     for d in [here, *here.parents]:
         pyproject = d / "pyproject.toml"
         if pyproject.exists():
-            return d, d.name, pyproject
+            pkg_name = d.name
+            src_dir = d / "src"
+            if src_dir.exists():
+                candidates = [
+                    p for p in src_dir.iterdir()
+                    if p.is_dir() and (p / "__init__.py").exists()
+                ]
+                if len(candidates) == 1:
+                    pkg_name = candidates[0].name
+            return d, pkg_name, pyproject
     print("[error] pyproject.toml not found (searched current and parent dirs)", file=sys.stderr)
     raise SystemExit(1)
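
`pkg_root` now prefers the single importable package under `src/` over the project directory name (e.g. when the directory name and the package name differ). A stand-alone illustration of that detection on a throwaway src-layout project; the directory names are made up.

import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    root = Path(tmp) / "my-plugin"               # project dir name with a dash
    (root / "src" / "my_plugin").mkdir(parents=True)
    (root / "pyproject.toml").touch()
    (root / "src" / "my_plugin" / "__init__.py").touch()

    pkg_name = root.name
    src_dir = root / "src"
    if src_dir.exists():
        candidates = [
            p for p in src_dir.iterdir()
            if p.is_dir() and (p / "__init__.py").exists()
        ]
        if len(candidates) == 1:
            pkg_name = candidates[0].name

    print(pkg_name)                              # my_plugin, not my-plugin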
@@ -1,5 +1,3 @@
1
- from __future__ import annotations
2
-
3
1
  from pathlib import Path
4
2
  from typing import Optional
5
3
 
@@ -1,5 +1,3 @@
1
- from __future__ import annotations
2
-
3
1
  from dataclasses import dataclass, asdict
4
2
  from datetime import datetime, timezone
5
3
  from pathlib import Path