jerry-thomas 0.3.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (164)
  1. datapipeline/analysis/vector/collector.py +120 -17
  2. datapipeline/analysis/vector/matrix.py +33 -8
  3. datapipeline/analysis/vector/report.py +162 -32
  4. datapipeline/build/tasks/__init__.py +11 -0
  5. datapipeline/build/tasks/config.py +74 -0
  6. datapipeline/build/tasks/metadata.py +170 -0
  7. datapipeline/build/tasks/scaler.py +73 -0
  8. datapipeline/build/tasks/schema.py +60 -0
  9. datapipeline/build/tasks/utils.py +169 -0
  10. datapipeline/cli/app.py +304 -127
  11. datapipeline/cli/commands/build.py +240 -16
  12. datapipeline/cli/commands/contract.py +367 -0
  13. datapipeline/cli/commands/domain.py +8 -3
  14. datapipeline/cli/commands/inspect.py +401 -149
  15. datapipeline/cli/commands/list_.py +30 -7
  16. datapipeline/cli/commands/plugin.py +1 -1
  17. datapipeline/cli/commands/run.py +227 -241
  18. datapipeline/cli/commands/run_config.py +101 -0
  19. datapipeline/cli/commands/serve_pipeline.py +156 -0
  20. datapipeline/cli/commands/source.py +44 -8
  21. datapipeline/cli/visuals/__init__.py +4 -2
  22. datapipeline/cli/visuals/common.py +239 -0
  23. datapipeline/cli/visuals/labels.py +15 -15
  24. datapipeline/cli/visuals/runner.py +66 -0
  25. datapipeline/cli/visuals/sections.py +20 -0
  26. datapipeline/cli/visuals/sources.py +132 -119
  27. datapipeline/cli/visuals/sources_basic.py +260 -0
  28. datapipeline/cli/visuals/sources_off.py +76 -0
  29. datapipeline/cli/visuals/sources_rich.py +414 -0
  30. datapipeline/config/catalog.py +37 -3
  31. datapipeline/config/context.py +214 -0
  32. datapipeline/config/dataset/loader.py +21 -4
  33. datapipeline/config/dataset/normalize.py +4 -4
  34. datapipeline/config/metadata.py +43 -0
  35. datapipeline/config/postprocess.py +2 -2
  36. datapipeline/config/project.py +3 -2
  37. datapipeline/config/resolution.py +129 -0
  38. datapipeline/config/tasks.py +309 -0
  39. datapipeline/config/workspace.py +155 -0
  40. datapipeline/domain/__init__.py +12 -0
  41. datapipeline/domain/record.py +11 -0
  42. datapipeline/domain/sample.py +54 -0
  43. datapipeline/integrations/ml/adapter.py +34 -20
  44. datapipeline/integrations/ml/pandas_support.py +0 -2
  45. datapipeline/integrations/ml/rows.py +1 -6
  46. datapipeline/integrations/ml/torch_support.py +1 -3
  47. datapipeline/io/factory.py +112 -0
  48. datapipeline/io/output.py +132 -0
  49. datapipeline/io/protocols.py +21 -0
  50. datapipeline/io/serializers.py +219 -0
  51. datapipeline/io/sinks/__init__.py +23 -0
  52. datapipeline/io/sinks/base.py +2 -0
  53. datapipeline/io/sinks/files.py +79 -0
  54. datapipeline/io/sinks/rich.py +57 -0
  55. datapipeline/io/sinks/stdout.py +18 -0
  56. datapipeline/io/writers/__init__.py +14 -0
  57. datapipeline/io/writers/base.py +28 -0
  58. datapipeline/io/writers/csv_writer.py +25 -0
  59. datapipeline/io/writers/jsonl.py +52 -0
  60. datapipeline/io/writers/pickle_writer.py +30 -0
  61. datapipeline/pipeline/artifacts.py +58 -0
  62. datapipeline/pipeline/context.py +66 -7
  63. datapipeline/pipeline/observability.py +65 -0
  64. datapipeline/pipeline/pipelines.py +65 -13
  65. datapipeline/pipeline/split.py +11 -10
  66. datapipeline/pipeline/stages.py +127 -16
  67. datapipeline/pipeline/utils/keygen.py +20 -7
  68. datapipeline/pipeline/utils/memory_sort.py +22 -10
  69. datapipeline/pipeline/utils/transform_utils.py +22 -0
  70. datapipeline/runtime.py +5 -2
  71. datapipeline/services/artifacts.py +12 -6
  72. datapipeline/services/bootstrap/config.py +25 -0
  73. datapipeline/services/bootstrap/core.py +52 -37
  74. datapipeline/services/constants.py +6 -5
  75. datapipeline/services/factories.py +123 -1
  76. datapipeline/services/project_paths.py +43 -16
  77. datapipeline/services/runs.py +208 -0
  78. datapipeline/services/scaffold/domain.py +3 -2
  79. datapipeline/services/scaffold/filter.py +3 -2
  80. datapipeline/services/scaffold/mappers.py +9 -6
  81. datapipeline/services/scaffold/plugin.py +3 -3
  82. datapipeline/services/scaffold/source.py +93 -56
  83. datapipeline/sources/{composed_loader.py → data_loader.py} +9 -9
  84. datapipeline/sources/decoders.py +83 -18
  85. datapipeline/sources/factory.py +26 -16
  86. datapipeline/sources/models/__init__.py +2 -2
  87. datapipeline/sources/models/generator.py +0 -7
  88. datapipeline/sources/models/loader.py +3 -3
  89. datapipeline/sources/models/parsing_error.py +24 -0
  90. datapipeline/sources/models/source.py +6 -6
  91. datapipeline/sources/synthetic/time/loader.py +14 -2
  92. datapipeline/sources/transports.py +74 -37
  93. datapipeline/templates/plugin_skeleton/README.md +74 -30
  94. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +31 -0
  95. datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +30 -0
  96. datapipeline/templates/plugin_skeleton/example/dataset.yaml +18 -0
  97. datapipeline/templates/plugin_skeleton/example/postprocess.yaml +29 -0
  98. datapipeline/templates/plugin_skeleton/{config/datasets/default → example}/project.yaml +11 -8
  99. datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +12 -0
  100. datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +3 -0
  101. datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +9 -0
  102. datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +2 -0
  103. datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +4 -0
  104. datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +28 -0
  105. datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +4 -0
  106. datapipeline/templates/plugin_skeleton/jerry.yaml +28 -0
  107. datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +31 -0
  108. datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +30 -0
  109. datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +18 -0
  110. datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +29 -0
  111. datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +22 -0
  112. datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +12 -0
  113. datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +3 -0
  114. datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +9 -0
  115. datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +2 -0
  116. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +4 -0
  117. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +28 -0
  118. datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +4 -0
  119. datapipeline/templates/stubs/dto.py.j2 +2 -0
  120. datapipeline/templates/stubs/mapper.py.j2 +5 -4
  121. datapipeline/templates/stubs/parser.py.j2 +2 -0
  122. datapipeline/templates/stubs/record.py.j2 +2 -0
  123. datapipeline/templates/stubs/source.yaml.j2 +2 -3
  124. datapipeline/transforms/debug/lint.py +26 -41
  125. datapipeline/transforms/feature/scaler.py +89 -13
  126. datapipeline/transforms/record/floor_time.py +4 -4
  127. datapipeline/transforms/sequence.py +2 -35
  128. datapipeline/transforms/stream/dedupe.py +24 -0
  129. datapipeline/transforms/stream/ensure_ticks.py +7 -6
  130. datapipeline/transforms/vector/__init__.py +5 -0
  131. datapipeline/transforms/vector/common.py +98 -0
  132. datapipeline/transforms/vector/drop/__init__.py +4 -0
  133. datapipeline/transforms/vector/drop/horizontal.py +79 -0
  134. datapipeline/transforms/vector/drop/orchestrator.py +59 -0
  135. datapipeline/transforms/vector/drop/vertical.py +182 -0
  136. datapipeline/transforms/vector/ensure_schema.py +184 -0
  137. datapipeline/transforms/vector/fill.py +87 -0
  138. datapipeline/transforms/vector/replace.py +62 -0
  139. datapipeline/utils/load.py +24 -3
  140. datapipeline/utils/rich_compat.py +38 -0
  141. datapipeline/utils/window.py +76 -0
  142. jerry_thomas-1.0.0.dist-info/METADATA +825 -0
  143. jerry_thomas-1.0.0.dist-info/RECORD +199 -0
  144. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/entry_points.txt +9 -8
  145. datapipeline/build/tasks.py +0 -186
  146. datapipeline/cli/commands/link.py +0 -128
  147. datapipeline/cli/commands/writers.py +0 -138
  148. datapipeline/config/build.py +0 -64
  149. datapipeline/config/run.py +0 -116
  150. datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.synthetic.yaml +0 -24
  151. datapipeline/templates/plugin_skeleton/config/contracts/time_linear.synthetic.yaml +0 -23
  152. datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +0 -9
  153. datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +0 -14
  154. datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +0 -13
  155. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +0 -10
  156. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +0 -10
  157. datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +0 -10
  158. datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml +0 -11
  159. datapipeline/transforms/vector.py +0 -210
  160. jerry_thomas-0.3.0.dist-info/METADATA +0 -502
  161. jerry_thomas-0.3.0.dist-info/RECORD +0 -139
  162. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/WHEEL +0 -0
  163. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/licenses/LICENSE +0 -0
  164. {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/top_level.txt +0 -0
datapipeline/sources/decoders.py
@@ -1,18 +1,20 @@
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
-from typing import Iterable, Iterator, Dict, Any, Optional
+from typing import Iterable, Iterator, Any, Optional
+import codecs
 import csv
+import io
 import json
-from io import StringIO
+import pickle
 
 
 class Decoder(ABC):
     @abstractmethod
-    def decode(self, lines: Iterable[str]) -> Iterator[Dict[str, Any]]:
+    def decode(self, chunks: Iterable[bytes]) -> Iterator[Any]:
         pass
 
-    def count(self, lines: Iterable[str]) -> Optional[int]:
+    def count(self, chunks: Iterable[bytes]) -> Optional[int]:
         """Optional fast count of rows for the given stream.
 
         Default returns None. Subclasses may override for better visuals.
@@ -21,23 +23,55 @@ class Decoder(ABC):
         return None
 
 
+def _iter_text_lines(chunks: Iterable[bytes], encoding: str) -> Iterator[str]:
+    decoder = codecs.getincrementaldecoder(encoding)()
+    buffer = ""
+    for chunk in chunks:
+        buffer += decoder.decode(chunk)
+        while True:
+            idx = buffer.find("\n")
+            if idx == -1:
+                break
+            line, buffer = buffer[:idx], buffer[idx + 1 :]
+            if line.endswith("\r"):
+                line = line[:-1]
+            yield line
+    buffer += decoder.decode(b"", final=True)
+    if buffer:
+        if buffer.endswith("\r"):
+            buffer = buffer[:-1]
+        yield buffer
+
+
+def _read_all_text(chunks: Iterable[bytes], encoding: str) -> str:
+    decoder = codecs.getincrementaldecoder(encoding)()
+    parts: list[str] = []
+    for chunk in chunks:
+        parts.append(decoder.decode(chunk))
+    parts.append(decoder.decode(b"", final=True))
+    return "".join(parts)
+
+
 class CsvDecoder(Decoder):
-    def __init__(self, *, delimiter: str = ";"):
+    def __init__(self, *, delimiter: str = ";", encoding: str = "utf-8"):
         self.delimiter = delimiter
+        self.encoding = encoding
 
-    def decode(self, lines: Iterable[str]) -> Iterator[Dict[str, Any]]:
-        # Stream directly from the line iterator; no buffering
-        reader = csv.DictReader(lines, delimiter=self.delimiter)
+    def decode(self, chunks: Iterable[bytes]) -> Iterator[dict]:
+        reader = csv.DictReader(_iter_text_lines(chunks, self.encoding), delimiter=self.delimiter)
         for row in reader:
             yield row
 
-    def count(self, lines: Iterable[str]) -> Optional[int]:
-        return sum(1 for _ in csv.DictReader(lines, delimiter=self.delimiter))
+    def count(self, chunks: Iterable[bytes]) -> Optional[int]:
+        return sum(1 for _ in csv.DictReader(_iter_text_lines(chunks, self.encoding), delimiter=self.delimiter))
 
 
 class JsonDecoder(Decoder):
-    def decode(self, lines: Iterable[str]) -> Iterator[Dict[str, Any]]:
-        text = "\n".join(lines)
+    def __init__(self, *, encoding: str = "utf-8"):
+        self.encoding = encoding
+
+    def decode(self, chunks: Iterable[bytes]) -> Iterator[Any]:
+        text = _read_all_text(chunks, self.encoding)
         data = json.loads(text)
         if isinstance(data, list):
             for item in data:
@@ -46,19 +80,50 @@ class JsonDecoder(Decoder):
             # Yield a single object as one row
             yield data
 
-    def count(self, lines: Iterable[str]) -> Optional[int]:
-        text = "\n".join(lines)
+    def count(self, chunks: Iterable[bytes]) -> Optional[int]:
+        text = _read_all_text(chunks, self.encoding)
         data = json.loads(text)
         return len(data) if isinstance(data, list) else 1
 
 
 class JsonLinesDecoder(Decoder):
-    def decode(self, lines: Iterable[str]) -> Iterator[Dict[str, Any]]:
-        for line in lines:
+    def __init__(self, *, encoding: str = "utf-8"):
+        self.encoding = encoding
+
+    def decode(self, chunks: Iterable[bytes]) -> Iterator[dict]:
+        for line in _iter_text_lines(chunks, self.encoding):
             s = line.strip()
             if not s:
                 continue
             yield json.loads(s)
 
-    def count(self, lines: Iterable[str]) -> Optional[int]:
-        return sum(1 for s in lines if s.strip())
+    def count(self, chunks: Iterable[bytes]) -> Optional[int]:
+        return sum(1 for s in _iter_text_lines(chunks, self.encoding) if s.strip())
+
+
+class PickleDecoder(Decoder):
+    def decode(self, chunks: Iterable[bytes]) -> Iterator[Any]:
+        buffer = io.BytesIO()
+        for chunk in chunks:
+            buffer.write(chunk)
+        buffer.seek(0)
+        unpickler = pickle.Unpickler(buffer)
+        try:
+            while True:
+                yield unpickler.load()
+        except EOFError:
+            return
+
+    def count(self, chunks: Iterable[bytes]) -> Optional[int]:
+        buffer = io.BytesIO()
+        for chunk in chunks:
+            buffer.write(chunk)
+        buffer.seek(0)
+        unpickler = pickle.Unpickler(buffer)
+        total = 0
+        try:
+            while True:
+                unpickler.load()
+                total += 1
+        except EOFError:
+            return total
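A rough usage sketch of the new chunk-based decoder API shown above (the sample bytes and the chunk split are illustrative, not taken from the package):

    from datapipeline.sources.decoders import CsvDecoder

    # Two arbitrary byte chunks that together form one small CSV payload.
    chunks = [b"a;b\n1;2\n", b"3;4\n"]

    decoder = CsvDecoder(delimiter=";", encoding="utf-8")
    rows = list(decoder.decode(chunks))  # [{'a': '1', 'b': '2'}, {'a': '3', 'b': '4'}]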
datapipeline/sources/factory.py
@@ -2,19 +2,24 @@ from __future__ import annotations
 
 from typing import Any, Dict
 
-from datapipeline.sources.composed_loader import ComposedRawLoader
-from datapipeline.sources.transports import FsFileSource, FsGlobSource, UrlSource
-from datapipeline.sources.decoders import CsvDecoder, JsonDecoder, JsonLinesDecoder
+from datapipeline.sources.data_loader import DataLoader
+from datapipeline.sources.transports import FsFileTransport, FsGlobTransport, HttpTransport
+from datapipeline.sources.decoders import (
+    CsvDecoder,
+    JsonDecoder,
+    JsonLinesDecoder,
+    PickleDecoder,
+)
 
 
-def build_loader(*, transport: str, format: str | None = None, **kwargs: Any) -> ComposedRawLoader:
+def build_loader(*, transport: str, format: str | None = None, **kwargs: Any) -> DataLoader:
     """Factory entrypoint that composes a transport and a decoder.
 
     Args (by transport/format):
-    transport: "fs" | "url"
-    format: "csv" | "json" | "json-lines" (required for fs/url)
+    transport: "fs" | "http"
+    format: "csv" | "json" | "json-lines" | "pickle" (required for fs/http)
     fs: path (str), glob (bool, optional), encoding (str, default utf-8), delimiter (csv only)
-    url: url (str), headers (dict, optional), encoding (str, default utf-8)
+    http: url (str), headers (dict, optional), params (dict, optional), encoding (str, default utf-8)
     """
 
     t = (transport or "").lower()
@@ -27,27 +32,32 @@ def build_loader(*, transport: str, format: str | None = None, **kwargs: Any) ->
             raise ValueError("fs transport requires 'path'")
         encoding = kwargs.get("encoding", "utf-8")
         use_glob = bool(kwargs.get("glob", False))
-        source = FsGlobSource(path, encoding=encoding) if use_glob else FsFileSource(path, encoding=encoding)
-    elif t == "url":
+        source = FsGlobTransport(path) if use_glob else FsFileTransport(path)
+    elif t == "http":
         url = kwargs.get("url")
         if not url:
-            raise ValueError("url transport requires 'url'")
+            raise ValueError("http transport requires 'url'")
         headers: Dict[str, str] = dict(kwargs.get("headers") or {})
+        params: Dict[str, Any] = dict(kwargs.get("params") or {})
         encoding = kwargs.get("encoding", "utf-8")
-        source = UrlSource(url, headers=headers, encoding=encoding)
+        source = HttpTransport(url, headers=headers, params=params)
     else:
         raise ValueError(f"unsupported transport: {transport}")
 
     # Build decoder
     if fmt == "csv":
         delimiter = kwargs.get("delimiter", ";")
-        decoder = CsvDecoder(delimiter=delimiter)
+        decoder = CsvDecoder(delimiter=delimiter, encoding=encoding)
     elif fmt == "json":
-        decoder = JsonDecoder()
+        decoder = JsonDecoder(encoding=encoding)
    elif fmt == "json-lines":
-        decoder = JsonLinesDecoder()
+        decoder = JsonLinesDecoder(encoding=encoding)
+    elif fmt == "pickle":
+        if t != "fs":
+            raise ValueError("pickle loader currently supported only for fs transport")
+        decoder = PickleDecoder()
     else:
-        raise ValueError(f"unsupported format for composed loader: {format}")
+        raise ValueError(f"unsupported format for IO loader: {format}")
 
     allow_net = bool(kwargs.get("count_by_fetch", False))
-    return ComposedRawLoader(source, decoder, allow_network_count=allow_net)
+    return DataLoader(source, decoder, allow_network_count=allow_net)
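For orientation, a hedged sketch of calling the updated factory (the paths, URL, and values are hypothetical; the keywords follow the docstring above, and DataLoader is assumed to keep the BaseDataLoader.load() contract):

    from datapipeline.sources.factory import build_loader

    # fs transport + csv format; 'path' and 'delimiter' follow the documented kwargs
    loader = build_loader(transport="fs", format="csv", path="data/ticks.csv", delimiter=";")
    for row in loader.load():
        ...

    # http transport + json-lines format; extra query params are merged into the URL
    loader = build_loader(
        transport="http",
        format="json-lines",
        url="https://example.com/api/rows",
        params={"limit": 100},
    )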
datapipeline/sources/models/__init__.py
@@ -1,5 +1,5 @@
 from .base import SourceInterface
-from .loader import RawDataLoader, SyntheticLoader
+from .loader import BaseDataLoader, SyntheticLoader
 from .parser import DataParser
 from .generator import DataGenerator
 from .source import Source
@@ -7,7 +7,7 @@ from .synthetic import GenerativeSourceInterface
 
 __all__ = [
     "SourceInterface",
-    "RawDataLoader",
+    "BaseDataLoader",
     "SyntheticLoader",
     "DataParser",
     "DataGenerator",
datapipeline/sources/models/generator.py
@@ -18,10 +18,3 @@ class DataGenerator(ABC):
 
     def __iter__(self) -> Iterator[Any]:
         return self.generate()
-
-
-class NoOpGenerator(DataGenerator):
-    """A data generator that yields no items."""
-
-    def generate(self) -> Iterator[Any]:
-        return iter(())
datapipeline/sources/models/loader.py
@@ -3,7 +3,7 @@ from typing import Iterator, Any, Optional
 from .generator import DataGenerator
 
 
-class RawDataLoader(ABC):
+class BaseDataLoader(ABC):
     @abstractmethod
     def load(self) -> Iterator[Any]:
         pass
@@ -15,8 +15,8 @@ class RawDataLoader(ABC):
         return self.load()
 
 
-class SyntheticLoader(RawDataLoader):
-    """Adapter that turns a `DataGenerator` into a `RawDataLoader`.
+class SyntheticLoader(BaseDataLoader):
+    """Adapter that turns a `DataGenerator` into a `BaseDataLoader`.
 
     Keeps the `load()` contract used by the pipeline, while making the
     generative intent explicit and separate from I/O loaders.
datapipeline/sources/models/parsing_error.py
@@ -0,0 +1,24 @@
+from typing import Optional, Any
+
+
+class ParsingError(Exception):
+    """Raised when a single row fails to parse."""
+
+    def __init__(
+        self,
+        row: Any,
+        index: Optional[int] = None,
+        original_exc: Optional[BaseException] = None,
+    ):
+        self.row = row
+        self.index = index
+        self.original_exc = original_exc
+
+        prefix = f"Failed to parse row at index {index}: " if index is not None else "Failed to parse row: "
+        message = prefix + repr(row)
+
+        # If there’s an underlying exception, append its type/message
+        if original_exc is not None:
+            message += f" (caused by {type(original_exc).__name__}: {original_exc})"
+
+        super().__init__(message)
datapipeline/sources/models/source.py
@@ -1,28 +1,28 @@
 from typing import Iterator, Generic, TypeVar, Optional
 from .base import SourceInterface
-from .loader import RawDataLoader
+from .loader import BaseDataLoader
 from .parser import DataParser
+from .parsing_error import ParsingError
 
 TRecord = TypeVar("TRecord")
 
 
 class Source(SourceInterface[TRecord], Generic[TRecord]):
-    def __init__(self, loader: RawDataLoader, parser: DataParser[TRecord]):
+    def __init__(self, loader: BaseDataLoader, parser: DataParser[TRecord]):
         self.loader = loader
         self.parser = parser
 
     def stream(self) -> Iterator[TRecord]:
-        for row in self.loader.load():
+        for i, row in enumerate(self.loader.load()):
             try:
                 parsed = self.parser.parse(row)
                 if parsed is not None:
                     yield parsed
-            except Exception:
-                continue
+            except Exception as exc:
+                raise ParsingError(row=row, index=i, original_exc=exc) from exc
 
     def count(self) -> Optional[int]:
         try:
             return self.loader.count()
         except Exception:
             return None
-
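Because stream() now raises instead of silently skipping bad rows, callers that tolerated malformed input would need to catch the new exception; a minimal sketch (the source object is assumed to be an already-constructed Source):

    from datapipeline.sources.models.parsing_error import ParsingError

    try:
        records = list(source.stream())
    except ParsingError as err:
        # err.row, err.index and err.original_exc identify the offending input row
        print(f"row {err.index} failed: {err.original_exc!r}")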
datapipeline/sources/synthetic/time/loader.py
@@ -1,6 +1,7 @@
 from typing import Iterator, Dict, Any, Optional
+import logging
 from datapipeline.sources.models.loader import SyntheticLoader
-from datapipeline.sources.models.generator import DataGenerator, NoOpGenerator
+from datapipeline.sources.models.generator import DataGenerator
 from datapipeline.utils.placeholders import coalesce_missing
 from datapipeline.utils.time import parse_timecode, parse_datetime
 
@@ -24,15 +25,26 @@ class TimeTicksGenerator(DataGenerator):
         return int((self.end - self.start).total_seconds() // secs) + 1
 
 
+logger = logging.getLogger(__name__)
+
+
 def make_time_loader(start: str, end: str, frequency: str | None = "1h") -> SyntheticLoader:
     """Factory entrypoint for synthetic time ticks loader.
 
     Returns a SyntheticLoader that wraps the TimeTicksGenerator.
+
+    Behavior on unresolved dates:
+    - Synthetic sources require explicit start/end bounds. If either `start` or
+      `end` is missing or resolves to an explicit null (MissingInterpolation),
+      raise a ValueError with guidance instead of silently yielding no data.
     """
     start_val = coalesce_missing(start)
     end_val = coalesce_missing(end)
     freq_val = coalesce_missing(frequency, default="1h")
 
     if start_val is None or end_val is None:
-        return SyntheticLoader(NoOpGenerator())
+        raise ValueError(
+            "synthetic time loader requires non-null start and end; "
+            "set explicit project.globals.start_time/end_time or override source.loader.args."
+        )
     return SyntheticLoader(TimeTicksGenerator(start_val, end_val, freq_val))
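An illustrative call under the new contract (the dates are placeholders, and ISO-8601 strings are assumed to be accepted by parse_datetime):

    from datapipeline.sources.synthetic.time.loader import make_time_loader

    loader = make_time_loader("2024-01-01T00:00:00", "2024-01-02T00:00:00", "1h")
    ticks = list(loader.load())  # hourly ticks between the two bounds

    # With an unresolved bound, the factory now raises ValueError
    # instead of returning a loader that yields nothing.
    make_time_loader(None, "2024-01-02T00:00:00")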
datapipeline/sources/transports.py
@@ -1,66 +1,103 @@
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
-from typing import Iterable, Iterator, List, Dict, Optional
+from typing import Iterable, Iterator, List, Dict, Optional, Any
 from urllib.request import Request, urlopen
 from urllib.error import URLError, HTTPError
+from urllib.parse import urlparse, parse_qsl, urlencode, urlunparse
 
 
-class TextSource(ABC):
-    """Abstract transport that yields text line streams per resource.
-
-    Each item from `streams()` is an iterable over text lines (no newlines).
-    """
+class Transport(ABC):
+    """Abstract transport that yields raw-byte streams per resource."""
 
     @abstractmethod
-    def streams(self) -> Iterator[Iterable[str]]:
+    def streams(self) -> Iterator[Iterable[bytes]]:
         pass
 
 
-class FsFileSource(TextSource):
-    def __init__(self, path: str, *, encoding: str = "utf-8"):
+class FsFileTransport(Transport):
+    def __init__(self, path: str, *, chunk_size: int = 65536):
         self.path = path
-        self.encoding = encoding
-
-    def streams(self) -> Iterator[Iterable[str]]:
-        def _iter() -> Iterator[str]:
-            with open(self.path, "r", encoding=self.encoding) as f:
-                for line in f:
-                    yield line
+        self.chunk_size = chunk_size
+
+    def streams(self) -> Iterator[Iterable[bytes]]:
+        def _iter() -> Iterator[bytes]:
+            with open(self.path, "rb") as f:
+                while True:
+                    chunk = f.read(self.chunk_size)
+                    if not chunk:
+                        break
+                    yield chunk
         yield _iter()
 
 
-class FsGlobSource(TextSource):
-    def __init__(self, pattern: str, *, encoding: str = "utf-8"):
+class FsGlobTransport(Transport):
+    def __init__(self, pattern: str, *, chunk_size: int = 65536):
         import glob as _glob
 
         self.pattern = pattern
-        self.encoding = encoding
+        self.chunk_size = chunk_size
         self._files: List[str] = sorted(_glob.glob(pattern))
-
-    def streams(self) -> Iterator[Iterable[str]]:
-        def _iter(path: str) -> Iterator[str]:
-            with open(path, "r", encoding=self.encoding) as f:
-                for line in f:
-                    yield line
-        for p in self._files:
-            yield _iter(p)
+        self._current_path: Optional[str] = None
+
+    @property
+    def files(self) -> List[str]:
+        return list(self._files)
+
+    @property
+    def current_path(self) -> Optional[str]:
+        return self._current_path
+
+    def streams(self) -> Iterator[Iterable[bytes]]:
+        def _iter(path: str) -> Iterator[bytes]:
+            with open(path, "rb") as f:
+                while True:
+                    chunk = f.read(self.chunk_size)
+                    if not chunk:
+                        break
+                    yield chunk
+        try:
+            for p in self._files:
+                self._current_path = p
+                yield _iter(p)
+        finally:
+            self._current_path = None
 
 
-class UrlSource(TextSource):
-    def __init__(self, url: str, *, headers: Optional[Dict[str, str]] = None, encoding: str = "utf-8"):
+class HttpTransport(Transport):
+    def __init__(self, url: str, headers: Optional[Dict[str, str]] = None, params: Optional[Dict[str, Any]] = None, chunk_size: int = 64 * 1024):
         self.url = url
         self.headers = dict(headers or {})
-        self.encoding = encoding
+        self.params: Dict[str, Any] = dict(params or {})
+        self.chunk_size = chunk_size
 
-    def streams(self) -> Iterator[Iterable[str]]:
-        req = Request(self.url, headers=self.headers)
+    def _build_url(self) -> str:
+        if not self.params:
+            return self.url
         try:
-            with urlopen(req) as resp:
-                data = resp.read()
+            parsed = urlparse(self.url)
+            existing = parse_qsl(parsed.query, keep_blank_values=True)
+            merged = existing + list(self.params.items())
+            query = urlencode(merged, doseq=True)
+            return urlunparse(parsed._replace(query=query))
+        except Exception:
+            return self.url
+
+    def streams(self) -> Iterator[Iterable[bytes]]:
+        req_url = self._build_url()
+        req = Request(req_url, headers=self.headers)
+
+        try:
+            resp = urlopen(req)
         except (URLError, HTTPError) as e:
             raise RuntimeError(f"failed to fetch {self.url}: {e}") from e
 
-        text = data.decode(self.encoding, errors="strict")
-        lines = text.splitlines()
-        yield iter(lines)
+        def byte_stream() -> Iterator[bytes]:
+            with resp:
+                while True:
+                    chunk = resp.read(self.chunk_size)
+                    if not chunk:
+                        break
+                    yield chunk
+
+        yield byte_stream()
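Finally, a short sketch of the byte-stream contract the new transports share (the URL, headers, and params are placeholders):

    from datapipeline.sources.transports import HttpTransport

    transport = HttpTransport(
        "https://example.com/export",
        headers={"Accept": "text/csv"},
        params={"page": 1},  # merged into the query string by _build_url()
    )
    for stream in transport.streams():  # one byte stream per resource
        for chunk in stream:  # raw bytes; a Decoder turns them into rows
            ...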