jerry-thomas 0.3.0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datapipeline/analysis/vector/collector.py +120 -17
- datapipeline/analysis/vector/matrix.py +33 -8
- datapipeline/analysis/vector/report.py +162 -32
- datapipeline/build/tasks/__init__.py +11 -0
- datapipeline/build/tasks/config.py +74 -0
- datapipeline/build/tasks/metadata.py +170 -0
- datapipeline/build/tasks/scaler.py +73 -0
- datapipeline/build/tasks/schema.py +60 -0
- datapipeline/build/tasks/utils.py +169 -0
- datapipeline/cli/app.py +304 -127
- datapipeline/cli/commands/build.py +240 -16
- datapipeline/cli/commands/contract.py +367 -0
- datapipeline/cli/commands/domain.py +8 -3
- datapipeline/cli/commands/inspect.py +401 -149
- datapipeline/cli/commands/list_.py +30 -7
- datapipeline/cli/commands/plugin.py +1 -1
- datapipeline/cli/commands/run.py +227 -241
- datapipeline/cli/commands/run_config.py +101 -0
- datapipeline/cli/commands/serve_pipeline.py +156 -0
- datapipeline/cli/commands/source.py +44 -8
- datapipeline/cli/visuals/__init__.py +4 -2
- datapipeline/cli/visuals/common.py +239 -0
- datapipeline/cli/visuals/labels.py +15 -15
- datapipeline/cli/visuals/runner.py +66 -0
- datapipeline/cli/visuals/sections.py +20 -0
- datapipeline/cli/visuals/sources.py +132 -119
- datapipeline/cli/visuals/sources_basic.py +260 -0
- datapipeline/cli/visuals/sources_off.py +76 -0
- datapipeline/cli/visuals/sources_rich.py +414 -0
- datapipeline/config/catalog.py +37 -3
- datapipeline/config/context.py +214 -0
- datapipeline/config/dataset/loader.py +21 -4
- datapipeline/config/dataset/normalize.py +4 -4
- datapipeline/config/metadata.py +43 -0
- datapipeline/config/postprocess.py +2 -2
- datapipeline/config/project.py +3 -2
- datapipeline/config/resolution.py +129 -0
- datapipeline/config/tasks.py +309 -0
- datapipeline/config/workspace.py +155 -0
- datapipeline/domain/__init__.py +12 -0
- datapipeline/domain/record.py +11 -0
- datapipeline/domain/sample.py +54 -0
- datapipeline/integrations/ml/adapter.py +34 -20
- datapipeline/integrations/ml/pandas_support.py +0 -2
- datapipeline/integrations/ml/rows.py +1 -6
- datapipeline/integrations/ml/torch_support.py +1 -3
- datapipeline/io/factory.py +112 -0
- datapipeline/io/output.py +132 -0
- datapipeline/io/protocols.py +21 -0
- datapipeline/io/serializers.py +219 -0
- datapipeline/io/sinks/__init__.py +23 -0
- datapipeline/io/sinks/base.py +2 -0
- datapipeline/io/sinks/files.py +79 -0
- datapipeline/io/sinks/rich.py +57 -0
- datapipeline/io/sinks/stdout.py +18 -0
- datapipeline/io/writers/__init__.py +14 -0
- datapipeline/io/writers/base.py +28 -0
- datapipeline/io/writers/csv_writer.py +25 -0
- datapipeline/io/writers/jsonl.py +52 -0
- datapipeline/io/writers/pickle_writer.py +30 -0
- datapipeline/pipeline/artifacts.py +58 -0
- datapipeline/pipeline/context.py +66 -7
- datapipeline/pipeline/observability.py +65 -0
- datapipeline/pipeline/pipelines.py +65 -13
- datapipeline/pipeline/split.py +11 -10
- datapipeline/pipeline/stages.py +127 -16
- datapipeline/pipeline/utils/keygen.py +20 -7
- datapipeline/pipeline/utils/memory_sort.py +22 -10
- datapipeline/pipeline/utils/transform_utils.py +22 -0
- datapipeline/runtime.py +5 -2
- datapipeline/services/artifacts.py +12 -6
- datapipeline/services/bootstrap/config.py +25 -0
- datapipeline/services/bootstrap/core.py +52 -37
- datapipeline/services/constants.py +6 -5
- datapipeline/services/factories.py +123 -1
- datapipeline/services/project_paths.py +43 -16
- datapipeline/services/runs.py +208 -0
- datapipeline/services/scaffold/domain.py +3 -2
- datapipeline/services/scaffold/filter.py +3 -2
- datapipeline/services/scaffold/mappers.py +9 -6
- datapipeline/services/scaffold/plugin.py +3 -3
- datapipeline/services/scaffold/source.py +93 -56
- datapipeline/sources/{composed_loader.py → data_loader.py} +9 -9
- datapipeline/sources/decoders.py +83 -18
- datapipeline/sources/factory.py +26 -16
- datapipeline/sources/models/__init__.py +2 -2
- datapipeline/sources/models/generator.py +0 -7
- datapipeline/sources/models/loader.py +3 -3
- datapipeline/sources/models/parsing_error.py +24 -0
- datapipeline/sources/models/source.py +6 -6
- datapipeline/sources/synthetic/time/loader.py +14 -2
- datapipeline/sources/transports.py +74 -37
- datapipeline/templates/plugin_skeleton/README.md +74 -30
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +31 -0
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +30 -0
- datapipeline/templates/plugin_skeleton/example/dataset.yaml +18 -0
- datapipeline/templates/plugin_skeleton/example/postprocess.yaml +29 -0
- datapipeline/templates/plugin_skeleton/{config/datasets/default → example}/project.yaml +11 -8
- datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +12 -0
- datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +3 -0
- datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +9 -0
- datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +2 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +4 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +28 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +4 -0
- datapipeline/templates/plugin_skeleton/jerry.yaml +28 -0
- datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +31 -0
- datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +30 -0
- datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +18 -0
- datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +29 -0
- datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +22 -0
- datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +12 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +3 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +9 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +2 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +4 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +28 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +4 -0
- datapipeline/templates/stubs/dto.py.j2 +2 -0
- datapipeline/templates/stubs/mapper.py.j2 +5 -4
- datapipeline/templates/stubs/parser.py.j2 +2 -0
- datapipeline/templates/stubs/record.py.j2 +2 -0
- datapipeline/templates/stubs/source.yaml.j2 +2 -3
- datapipeline/transforms/debug/lint.py +26 -41
- datapipeline/transforms/feature/scaler.py +89 -13
- datapipeline/transforms/record/floor_time.py +4 -4
- datapipeline/transforms/sequence.py +2 -35
- datapipeline/transforms/stream/dedupe.py +24 -0
- datapipeline/transforms/stream/ensure_ticks.py +7 -6
- datapipeline/transforms/vector/__init__.py +5 -0
- datapipeline/transforms/vector/common.py +98 -0
- datapipeline/transforms/vector/drop/__init__.py +4 -0
- datapipeline/transforms/vector/drop/horizontal.py +79 -0
- datapipeline/transforms/vector/drop/orchestrator.py +59 -0
- datapipeline/transforms/vector/drop/vertical.py +182 -0
- datapipeline/transforms/vector/ensure_schema.py +184 -0
- datapipeline/transforms/vector/fill.py +87 -0
- datapipeline/transforms/vector/replace.py +62 -0
- datapipeline/utils/load.py +24 -3
- datapipeline/utils/rich_compat.py +38 -0
- datapipeline/utils/window.py +76 -0
- jerry_thomas-1.0.0.dist-info/METADATA +825 -0
- jerry_thomas-1.0.0.dist-info/RECORD +199 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/entry_points.txt +9 -8
- datapipeline/build/tasks.py +0 -186
- datapipeline/cli/commands/link.py +0 -128
- datapipeline/cli/commands/writers.py +0 -138
- datapipeline/config/build.py +0 -64
- datapipeline/config/run.py +0 -116
- datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.synthetic.yaml +0 -24
- datapipeline/templates/plugin_skeleton/config/contracts/time_linear.synthetic.yaml +0 -23
- datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +0 -9
- datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +0 -14
- datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +0 -13
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml +0 -11
- datapipeline/transforms/vector.py +0 -210
- jerry_thomas-0.3.0.dist-info/METADATA +0 -502
- jerry_thomas-0.3.0.dist-info/RECORD +0 -139
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/WHEEL +0 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/top_level.txt +0 -0
datapipeline/sources/decoders.py
CHANGED
@@ -1,18 +1,20 @@
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
-from typing import Iterable, Iterator,
+from typing import Iterable, Iterator, Any, Optional
+import codecs
 import csv
+import io
 import json
-
+import pickle
 
 
 class Decoder(ABC):
     @abstractmethod
-    def decode(self,
+    def decode(self, chunks: Iterable[bytes]) -> Iterator[Any]:
         pass
 
-    def count(self,
+    def count(self, chunks: Iterable[bytes]) -> Optional[int]:
         """Optional fast count of rows for the given stream.
 
         Default returns None. Subclasses may override for better visuals.
@@ -21,23 +23,55 @@ class Decoder(ABC):
         return None
 
 
+def _iter_text_lines(chunks: Iterable[bytes], encoding: str) -> Iterator[str]:
+    decoder = codecs.getincrementaldecoder(encoding)()
+    buffer = ""
+    for chunk in chunks:
+        buffer += decoder.decode(chunk)
+        while True:
+            idx = buffer.find("\n")
+            if idx == -1:
+                break
+            line, buffer = buffer[:idx], buffer[idx + 1 :]
+            if line.endswith("\r"):
+                line = line[:-1]
+            yield line
+    buffer += decoder.decode(b"", final=True)
+    if buffer:
+        if buffer.endswith("\r"):
+            buffer = buffer[:-1]
+        yield buffer
+
+
+def _read_all_text(chunks: Iterable[bytes], encoding: str) -> str:
+    decoder = codecs.getincrementaldecoder(encoding)()
+    parts: list[str] = []
+    for chunk in chunks:
+        parts.append(decoder.decode(chunk))
+    parts.append(decoder.decode(b"", final=True))
+    return "".join(parts)
+
+
 class CsvDecoder(Decoder):
-    def __init__(self, *, delimiter: str = ";"):
+    def __init__(self, *, delimiter: str = ";", encoding: str = "utf-8"):
         self.delimiter = delimiter
+        self.encoding = encoding
 
-    def decode(self,
-
-        reader = csv.DictReader(lines, delimiter=self.delimiter)
+    def decode(self, chunks: Iterable[bytes]) -> Iterator[dict]:
+        reader = csv.DictReader(_iter_text_lines(chunks, self.encoding), delimiter=self.delimiter)
         for row in reader:
             yield row
 
-    def count(self,
-        return sum(1 for _ in csv.DictReader(
+    def count(self, chunks: Iterable[bytes]) -> Optional[int]:
+        return sum(1 for _ in csv.DictReader(_iter_text_lines(chunks, self.encoding), delimiter=self.delimiter))
 
 
 class JsonDecoder(Decoder):
-    def
-
+    def __init__(self, *, encoding: str = "utf-8"):
+        self.encoding = encoding
+
+    def decode(self, chunks: Iterable[bytes]) -> Iterator[Any]:
+        text = _read_all_text(chunks, self.encoding)
         data = json.loads(text)
         if isinstance(data, list):
             for item in data:
@@ -46,19 +80,50 @@ class JsonDecoder(Decoder):
             # Yield a single object as one row
             yield data
 
-    def count(self,
-        text =
+    def count(self, chunks: Iterable[bytes]) -> Optional[int]:
+        text = _read_all_text(chunks, self.encoding)
         data = json.loads(text)
         return len(data) if isinstance(data, list) else 1
 
 
 class JsonLinesDecoder(Decoder):
-    def
-
+    def __init__(self, *, encoding: str = "utf-8"):
+        self.encoding = encoding
+
+    def decode(self, chunks: Iterable[bytes]) -> Iterator[dict]:
+        for line in _iter_text_lines(chunks, self.encoding):
             s = line.strip()
             if not s:
                 continue
             yield json.loads(s)
 
-    def count(self,
-        return sum(1 for s in
+    def count(self, chunks: Iterable[bytes]) -> Optional[int]:
+        return sum(1 for s in _iter_text_lines(chunks, self.encoding) if s.strip())
+
+
+class PickleDecoder(Decoder):
+    def decode(self, chunks: Iterable[bytes]) -> Iterator[Any]:
+        buffer = io.BytesIO()
+        for chunk in chunks:
+            buffer.write(chunk)
+        buffer.seek(0)
+        unpickler = pickle.Unpickler(buffer)
+        try:
+            while True:
+                yield unpickler.load()
+        except EOFError:
+            return
+
+    def count(self, chunks: Iterable[bytes]) -> Optional[int]:
+        buffer = io.BytesIO()
+        for chunk in chunks:
+            buffer.write(chunk)
+        buffer.seek(0)
+        unpickler = pickle.Unpickler(buffer)
+        total = 0
+        try:
+            while True:
+                unpickler.load()
+                total += 1
+        except EOFError:
+            return total
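A minimal sketch of driving the reworked byte-oriented decoders directly; the chunk boundaries and sample data below are illustrative, since in the package the chunks normally come from a transport:

from datapipeline.sources.decoders import CsvDecoder

# Arbitrary chunk boundaries; a Transport would normally produce these.
chunks = [b"id;value\n1;10\n", b"2;", b"20\n"]

decoder = CsvDecoder(delimiter=";", encoding="utf-8")
rows = list(decoder.decode(iter(chunks)))   # [{'id': '1', 'value': '10'}, {'id': '2', 'value': '20'}]

# count() makes its own pass over the stream, so hand it a fresh iterable.
total = decoder.count(iter(chunks))         # 2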
datapipeline/sources/factory.py
CHANGED
@@ -2,19 +2,24 @@ from __future__ import annotations
 
 from typing import Any, Dict
 
-from datapipeline.sources.
-from datapipeline.sources.transports import
-from datapipeline.sources.decoders import
+from datapipeline.sources.data_loader import DataLoader
+from datapipeline.sources.transports import FsFileTransport, FsGlobTransport, HttpTransport
+from datapipeline.sources.decoders import (
+    CsvDecoder,
+    JsonDecoder,
+    JsonLinesDecoder,
+    PickleDecoder,
+)
 
 
-def build_loader(*, transport: str, format: str | None = None, **kwargs: Any) ->
+def build_loader(*, transport: str, format: str | None = None, **kwargs: Any) -> DataLoader:
     """Factory entrypoint that composes a transport and a decoder.
 
     Args (by transport/format):
-        transport: "fs" | "
-        format: "csv" | "json" | "json-lines" (required for fs/
+        transport: "fs" | "http"
+        format: "csv" | "json" | "json-lines" | "pickle" (required for fs/http)
         fs: path (str), glob (bool, optional), encoding (str, default utf-8), delimiter (csv only)
-
+        http: url (str), headers (dict, optional), params (dict, optional), encoding (str, default utf-8)
     """
 
     t = (transport or "").lower()
@@ -27,27 +32,32 @@ def build_loader(*, transport: str, format: str | None = None, **kwargs: Any) ->
            raise ValueError("fs transport requires 'path'")
        encoding = kwargs.get("encoding", "utf-8")
        use_glob = bool(kwargs.get("glob", False))
-        source =
-    elif t == "
+        source = FsGlobTransport(path) if use_glob else FsFileTransport(path)
+    elif t == "http":
        url = kwargs.get("url")
        if not url:
-            raise ValueError("
+            raise ValueError("http transport requires 'url'")
        headers: Dict[str, str] = dict(kwargs.get("headers") or {})
+        params: Dict[str, Any] = dict(kwargs.get("params") or {})
        encoding = kwargs.get("encoding", "utf-8")
-        source =
+        source = HttpTransport(url, headers=headers, params=params)
    else:
        raise ValueError(f"unsupported transport: {transport}")
 
    # Build decoder
    if fmt == "csv":
        delimiter = kwargs.get("delimiter", ";")
-        decoder = CsvDecoder(delimiter=delimiter)
+        decoder = CsvDecoder(delimiter=delimiter, encoding=encoding)
    elif fmt == "json":
-        decoder = JsonDecoder()
+        decoder = JsonDecoder(encoding=encoding)
    elif fmt == "json-lines":
-        decoder = JsonLinesDecoder()
+        decoder = JsonLinesDecoder(encoding=encoding)
+    elif fmt == "pickle":
+        if t != "fs":
+            raise ValueError("pickle loader currently supported only for fs transport")
+        decoder = PickleDecoder()
    else:
-        raise ValueError(f"unsupported format for
+        raise ValueError(f"unsupported format for IO loader: {format}")
 
    allow_net = bool(kwargs.get("count_by_fetch", False))
-    return
+    return DataLoader(source, decoder, allow_network_count=allow_net)
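A sketch of calling the factory above; the path, URL, and header values are placeholders, and the trailing loop assumes the returned DataLoader exposes load() like the other loaders in this package:

from datapipeline.sources.factory import build_loader

# Local CSV file via the fs transport (placeholder path).
csv_loader = build_loader(
    transport="fs",
    format="csv",
    path="data/ticks.csv",
    delimiter=";",
    encoding="utf-8",
)

# Remote JSON Lines endpoint via the http transport (placeholder URL, headers, params).
jsonl_loader = build_loader(
    transport="http",
    format="json-lines",
    url="https://example.com/ticks",
    headers={"Accept": "application/json"},
    params={"limit": 100},
)

for row in csv_loader.load():   # assumption: DataLoader.load() yields decoded rows
    print(row)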
datapipeline/sources/models/__init__.py
CHANGED
@@ -1,5 +1,5 @@
 from .base import SourceInterface
-from .loader import
+from .loader import BaseDataLoader, SyntheticLoader
 from .parser import DataParser
 from .generator import DataGenerator
 from .source import Source
@@ -7,7 +7,7 @@ from .synthetic import GenerativeSourceInterface
 
 __all__ = [
     "SourceInterface",
-    "
+    "BaseDataLoader",
     "SyntheticLoader",
     "DataParser",
     "DataGenerator",
datapipeline/sources/models/loader.py
CHANGED
@@ -3,7 +3,7 @@ from typing import Iterator, Any, Optional
 from .generator import DataGenerator
 
 
-class
+class BaseDataLoader(ABC):
     @abstractmethod
     def load(self) -> Iterator[Any]:
         pass
@@ -15,8 +15,8 @@ class RawDataLoader(ABC):
         return self.load()
 
 
-class SyntheticLoader(
-    """Adapter that turns a `DataGenerator` into a `
+class SyntheticLoader(BaseDataLoader):
+    """Adapter that turns a `DataGenerator` into a `BaseDataLoader`.
 
     Keeps the `load()` contract used by the pipeline, while making the
     generative intent explicit and separate from I/O loaders.
datapipeline/sources/models/parsing_error.py
ADDED
@@ -0,0 +1,24 @@
+from typing import Optional, Any
+
+
+class ParsingError(Exception):
+    """Raised when a single row fails to parse."""
+
+    def __init__(
+        self,
+        row: Any,
+        index: Optional[int] = None,
+        original_exc: Optional[BaseException] = None,
+    ):
+        self.row = row
+        self.index = index
+        self.original_exc = original_exc
+
+        prefix = f"Failed to parse row at index {index}: " if index is not None else "Failed to parse row: "
+        message = prefix + repr(row)
+
+        # If there’s an underlying exception, append its type/message
+        if original_exc is not None:
+            message += f" (caused by {type(original_exc).__name__}: {original_exc})"
+
+        super().__init__(message)
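The new exception carries the offending row, its index, and the original error; a small self-contained sketch of the message it builds (the row dict is illustrative):

from datapipeline.sources.models.parsing_error import ParsingError

try:
    int("not-a-number")
except ValueError as exc:
    err = ParsingError(row={"value": "not-a-number"}, index=3, original_exc=exc)

print(err)
# Failed to parse row at index 3: {'value': 'not-a-number'} (caused by ValueError: invalid literal for int() with base 10: 'not-a-number')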
datapipeline/sources/models/source.py
CHANGED
@@ -1,28 +1,28 @@
 from typing import Iterator, Generic, TypeVar, Optional
 from .base import SourceInterface
-from .loader import
+from .loader import BaseDataLoader
 from .parser import DataParser
+from .parsing_error import ParsingError
 
 TRecord = TypeVar("TRecord")
 
 
 class Source(SourceInterface[TRecord], Generic[TRecord]):
-    def __init__(self, loader:
+    def __init__(self, loader: BaseDataLoader, parser: DataParser[TRecord]):
         self.loader = loader
         self.parser = parser
 
     def stream(self) -> Iterator[TRecord]:
-        for row in self.loader.load():
+        for i, row in enumerate(self.loader.load()):
             try:
                 parsed = self.parser.parse(row)
                 if parsed is not None:
                     yield parsed
-            except Exception:
-
+            except Exception as exc:
+                raise ParsingError(row=row, index=i, original_exc=exc) from exc
 
     def count(self) -> Optional[int]:
         try:
             return self.loader.count()
         except Exception:
             return None
-
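What callers of Source.stream() now see on a bad row, sketched with hypothetical stub classes standing in for a real BaseDataLoader and DataParser:

from datapipeline.sources.models import Source
from datapipeline.sources.models.parsing_error import ParsingError

class StubLoader:                    # hypothetical stand-in for a BaseDataLoader
    def load(self):
        yield {"value": "10"}
        yield {"value": "oops"}

class StubParser:                    # hypothetical stand-in for a DataParser
    def parse(self, row):
        return int(row["value"])

try:
    list(Source(StubLoader(), StubParser()).stream())
except ParsingError as err:
    print(err.index, err.row)        # 1 {'value': 'oops'}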
datapipeline/sources/synthetic/time/loader.py
CHANGED
@@ -1,6 +1,7 @@
 from typing import Iterator, Dict, Any, Optional
+import logging
 from datapipeline.sources.models.loader import SyntheticLoader
-from datapipeline.sources.models.generator import DataGenerator
+from datapipeline.sources.models.generator import DataGenerator
 from datapipeline.utils.placeholders import coalesce_missing
 from datapipeline.utils.time import parse_timecode, parse_datetime
 
@@ -24,15 +25,26 @@ class TimeTicksGenerator(DataGenerator):
         return int((self.end - self.start).total_seconds() // secs) + 1
 
 
+logger = logging.getLogger(__name__)
+
+
 def make_time_loader(start: str, end: str, frequency: str | None = "1h") -> SyntheticLoader:
     """Factory entrypoint for synthetic time ticks loader.
 
     Returns a SyntheticLoader that wraps the TimeTicksGenerator.
+
+    Behavior on unresolved dates:
+    - Synthetic sources require explicit start/end bounds. If either `start` or
+      `end` is missing or resolves to an explicit null (MissingInterpolation),
+      raise a ValueError with guidance instead of silently yielding no data.
     """
     start_val = coalesce_missing(start)
     end_val = coalesce_missing(end)
     freq_val = coalesce_missing(frequency, default="1h")
 
     if start_val is None or end_val is None:
-
+        raise ValueError(
+            "synthetic time loader requires non-null start and end; "
+            "set explicit project.globals.start_time/end_time or override source.loader.args."
+        )
     return SyntheticLoader(TimeTicksGenerator(start_val, end_val, freq_val))
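A sketch of the tightened contract, assuming parse_datetime accepts ISO-8601 strings and that coalesce_missing(None) resolves to None (both assumptions, not verified here):

from datapipeline.sources.synthetic.time.loader import make_time_loader

# Explicit bounds: returns a SyntheticLoader wrapping TimeTicksGenerator.
loader = make_time_loader("2024-01-01T00:00:00", "2024-01-02T00:00:00", frequency="1h")

# Missing bounds: now fails loudly instead of silently yielding no data.
try:
    make_time_loader(None, "2024-01-02T00:00:00")
except ValueError as exc:
    print(exc)  # points at project.globals.start_time/end_time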
datapipeline/sources/transports.py
CHANGED
@@ -1,66 +1,103 @@
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
-from typing import Iterable, Iterator, List, Dict, Optional
+from typing import Iterable, Iterator, List, Dict, Optional, Any
 from urllib.request import Request, urlopen
 from urllib.error import URLError, HTTPError
+from urllib.parse import urlparse, parse_qsl, urlencode, urlunparse
 
 
-class
-    """Abstract transport that yields
-
-    Each item from `streams()` is an iterable over text lines (no newlines).
-    """
+class Transport(ABC):
+    """Abstract transport that yields raw-byte streams per resource."""
 
     @abstractmethod
-    def streams(self) -> Iterator[Iterable[
+    def streams(self) -> Iterator[Iterable[bytes]]:
         pass
 
 
-class
-    def __init__(self, path: str, *,
+class FsFileTransport(Transport):
+    def __init__(self, path: str, *, chunk_size: int = 65536):
         self.path = path
-        self.
-
-    def streams(self) -> Iterator[Iterable[
-        def _iter() -> Iterator[
-            with open(self.path, "
-
-
+        self.chunk_size = chunk_size
+
+    def streams(self) -> Iterator[Iterable[bytes]]:
+        def _iter() -> Iterator[bytes]:
+            with open(self.path, "rb") as f:
+                while True:
+                    chunk = f.read(self.chunk_size)
+                    if not chunk:
+                        break
+                    yield chunk
         yield _iter()
 
 
-class
-    def __init__(self, pattern: str, *,
+class FsGlobTransport(Transport):
+    def __init__(self, pattern: str, *, chunk_size: int = 65536):
         import glob as _glob
 
         self.pattern = pattern
-        self.
+        self.chunk_size = chunk_size
         self._files: List[str] = sorted(_glob.glob(pattern))
-
-
-
-
-
-
-
-
+        self._current_path: Optional[str] = None
+
+    @property
+    def files(self) -> List[str]:
+        return list(self._files)
+
+    @property
+    def current_path(self) -> Optional[str]:
+        return self._current_path
+
+    def streams(self) -> Iterator[Iterable[bytes]]:
+        def _iter(path: str) -> Iterator[bytes]:
+            with open(path, "rb") as f:
+                while True:
+                    chunk = f.read(self.chunk_size)
+                    if not chunk:
+                        break
+                    yield chunk
+        try:
+            for p in self._files:
+                self._current_path = p
+                yield _iter(p)
+        finally:
+            self._current_path = None
 
 
-class
-    def __init__(self, url: str,
+class HttpTransport(Transport):
+    def __init__(self, url: str, headers: Optional[Dict[str, str]] = None, params: Optional[Dict[str, Any]] = None, chunk_size: int = 64 * 1024):
         self.url = url
         self.headers = dict(headers or {})
-        self.
+        self.params: Dict[str, Any] = dict(params or {})
+        self.chunk_size = chunk_size
 
-    def
-
+    def _build_url(self) -> str:
+        if not self.params:
+            return self.url
         try:
-
-
+            parsed = urlparse(self.url)
+            existing = parse_qsl(parsed.query, keep_blank_values=True)
+            merged = existing + list(self.params.items())
+            query = urlencode(merged, doseq=True)
+            return urlunparse(parsed._replace(query=query))
+        except Exception:
+            return self.url
+
+    def streams(self) -> Iterator[Iterable[bytes]]:
+        req_url = self._build_url()
+        req = Request(req_url, headers=self.headers)
+
+        try:
+            resp = urlopen(req)
         except (URLError, HTTPError) as e:
             raise RuntimeError(f"failed to fetch {self.url}: {e}") from e
 
-
-
-
+        def byte_stream() -> Iterator[bytes]:
+            with resp:
+                while True:
+                    chunk = resp.read(self.chunk_size)
+                    if not chunk:
+                        break
+                    yield chunk
+
+        yield byte_stream()