jerry-thomas 0.3.0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datapipeline/analysis/vector/collector.py +120 -17
- datapipeline/analysis/vector/matrix.py +33 -8
- datapipeline/analysis/vector/report.py +162 -32
- datapipeline/build/tasks/__init__.py +11 -0
- datapipeline/build/tasks/config.py +74 -0
- datapipeline/build/tasks/metadata.py +170 -0
- datapipeline/build/tasks/scaler.py +73 -0
- datapipeline/build/tasks/schema.py +60 -0
- datapipeline/build/tasks/utils.py +169 -0
- datapipeline/cli/app.py +304 -127
- datapipeline/cli/commands/build.py +240 -16
- datapipeline/cli/commands/contract.py +367 -0
- datapipeline/cli/commands/domain.py +8 -3
- datapipeline/cli/commands/inspect.py +401 -149
- datapipeline/cli/commands/list_.py +30 -7
- datapipeline/cli/commands/plugin.py +1 -1
- datapipeline/cli/commands/run.py +227 -241
- datapipeline/cli/commands/run_config.py +101 -0
- datapipeline/cli/commands/serve_pipeline.py +156 -0
- datapipeline/cli/commands/source.py +44 -8
- datapipeline/cli/visuals/__init__.py +4 -2
- datapipeline/cli/visuals/common.py +239 -0
- datapipeline/cli/visuals/labels.py +15 -15
- datapipeline/cli/visuals/runner.py +66 -0
- datapipeline/cli/visuals/sections.py +20 -0
- datapipeline/cli/visuals/sources.py +132 -119
- datapipeline/cli/visuals/sources_basic.py +260 -0
- datapipeline/cli/visuals/sources_off.py +76 -0
- datapipeline/cli/visuals/sources_rich.py +414 -0
- datapipeline/config/catalog.py +37 -3
- datapipeline/config/context.py +214 -0
- datapipeline/config/dataset/loader.py +21 -4
- datapipeline/config/dataset/normalize.py +4 -4
- datapipeline/config/metadata.py +43 -0
- datapipeline/config/postprocess.py +2 -2
- datapipeline/config/project.py +3 -2
- datapipeline/config/resolution.py +129 -0
- datapipeline/config/tasks.py +309 -0
- datapipeline/config/workspace.py +155 -0
- datapipeline/domain/__init__.py +12 -0
- datapipeline/domain/record.py +11 -0
- datapipeline/domain/sample.py +54 -0
- datapipeline/integrations/ml/adapter.py +34 -20
- datapipeline/integrations/ml/pandas_support.py +0 -2
- datapipeline/integrations/ml/rows.py +1 -6
- datapipeline/integrations/ml/torch_support.py +1 -3
- datapipeline/io/factory.py +112 -0
- datapipeline/io/output.py +132 -0
- datapipeline/io/protocols.py +21 -0
- datapipeline/io/serializers.py +219 -0
- datapipeline/io/sinks/__init__.py +23 -0
- datapipeline/io/sinks/base.py +2 -0
- datapipeline/io/sinks/files.py +79 -0
- datapipeline/io/sinks/rich.py +57 -0
- datapipeline/io/sinks/stdout.py +18 -0
- datapipeline/io/writers/__init__.py +14 -0
- datapipeline/io/writers/base.py +28 -0
- datapipeline/io/writers/csv_writer.py +25 -0
- datapipeline/io/writers/jsonl.py +52 -0
- datapipeline/io/writers/pickle_writer.py +30 -0
- datapipeline/pipeline/artifacts.py +58 -0
- datapipeline/pipeline/context.py +66 -7
- datapipeline/pipeline/observability.py +65 -0
- datapipeline/pipeline/pipelines.py +65 -13
- datapipeline/pipeline/split.py +11 -10
- datapipeline/pipeline/stages.py +127 -16
- datapipeline/pipeline/utils/keygen.py +20 -7
- datapipeline/pipeline/utils/memory_sort.py +22 -10
- datapipeline/pipeline/utils/transform_utils.py +22 -0
- datapipeline/runtime.py +5 -2
- datapipeline/services/artifacts.py +12 -6
- datapipeline/services/bootstrap/config.py +25 -0
- datapipeline/services/bootstrap/core.py +52 -37
- datapipeline/services/constants.py +6 -5
- datapipeline/services/factories.py +123 -1
- datapipeline/services/project_paths.py +43 -16
- datapipeline/services/runs.py +208 -0
- datapipeline/services/scaffold/domain.py +3 -2
- datapipeline/services/scaffold/filter.py +3 -2
- datapipeline/services/scaffold/mappers.py +9 -6
- datapipeline/services/scaffold/plugin.py +3 -3
- datapipeline/services/scaffold/source.py +93 -56
- datapipeline/sources/{composed_loader.py → data_loader.py} +9 -9
- datapipeline/sources/decoders.py +83 -18
- datapipeline/sources/factory.py +26 -16
- datapipeline/sources/models/__init__.py +2 -2
- datapipeline/sources/models/generator.py +0 -7
- datapipeline/sources/models/loader.py +3 -3
- datapipeline/sources/models/parsing_error.py +24 -0
- datapipeline/sources/models/source.py +6 -6
- datapipeline/sources/synthetic/time/loader.py +14 -2
- datapipeline/sources/transports.py +74 -37
- datapipeline/templates/plugin_skeleton/README.md +74 -30
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.hour_sin.yaml +31 -0
- datapipeline/templates/plugin_skeleton/example/contracts/time.ticks.linear.yaml +30 -0
- datapipeline/templates/plugin_skeleton/example/dataset.yaml +18 -0
- datapipeline/templates/plugin_skeleton/example/postprocess.yaml +29 -0
- datapipeline/templates/plugin_skeleton/{config/datasets/default → example}/project.yaml +11 -8
- datapipeline/templates/plugin_skeleton/example/sources/synthetic.ticks.yaml +12 -0
- datapipeline/templates/plugin_skeleton/example/tasks/metadata.yaml +3 -0
- datapipeline/templates/plugin_skeleton/example/tasks/scaler.yaml +9 -0
- datapipeline/templates/plugin_skeleton/example/tasks/schema.yaml +2 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.test.yaml +4 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.train.yaml +28 -0
- datapipeline/templates/plugin_skeleton/example/tasks/serve.val.yaml +4 -0
- datapipeline/templates/plugin_skeleton/jerry.yaml +28 -0
- datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.hour_sin.yaml +31 -0
- datapipeline/templates/plugin_skeleton/your-dataset/contracts/time.ticks.linear.yaml +30 -0
- datapipeline/templates/plugin_skeleton/your-dataset/dataset.yaml +18 -0
- datapipeline/templates/plugin_skeleton/your-dataset/postprocess.yaml +29 -0
- datapipeline/templates/plugin_skeleton/your-dataset/project.yaml +22 -0
- datapipeline/templates/plugin_skeleton/your-dataset/sources/synthetic.ticks.yaml +12 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/metadata.yaml +3 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/scaler.yaml +9 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/schema.yaml +2 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.test.yaml +4 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.train.yaml +28 -0
- datapipeline/templates/plugin_skeleton/your-dataset/tasks/serve.val.yaml +4 -0
- datapipeline/templates/stubs/dto.py.j2 +2 -0
- datapipeline/templates/stubs/mapper.py.j2 +5 -4
- datapipeline/templates/stubs/parser.py.j2 +2 -0
- datapipeline/templates/stubs/record.py.j2 +2 -0
- datapipeline/templates/stubs/source.yaml.j2 +2 -3
- datapipeline/transforms/debug/lint.py +26 -41
- datapipeline/transforms/feature/scaler.py +89 -13
- datapipeline/transforms/record/floor_time.py +4 -4
- datapipeline/transforms/sequence.py +2 -35
- datapipeline/transforms/stream/dedupe.py +24 -0
- datapipeline/transforms/stream/ensure_ticks.py +7 -6
- datapipeline/transforms/vector/__init__.py +5 -0
- datapipeline/transforms/vector/common.py +98 -0
- datapipeline/transforms/vector/drop/__init__.py +4 -0
- datapipeline/transforms/vector/drop/horizontal.py +79 -0
- datapipeline/transforms/vector/drop/orchestrator.py +59 -0
- datapipeline/transforms/vector/drop/vertical.py +182 -0
- datapipeline/transforms/vector/ensure_schema.py +184 -0
- datapipeline/transforms/vector/fill.py +87 -0
- datapipeline/transforms/vector/replace.py +62 -0
- datapipeline/utils/load.py +24 -3
- datapipeline/utils/rich_compat.py +38 -0
- datapipeline/utils/window.py +76 -0
- jerry_thomas-1.0.0.dist-info/METADATA +825 -0
- jerry_thomas-1.0.0.dist-info/RECORD +199 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/entry_points.txt +9 -8
- datapipeline/build/tasks.py +0 -186
- datapipeline/cli/commands/link.py +0 -128
- datapipeline/cli/commands/writers.py +0 -138
- datapipeline/config/build.py +0 -64
- datapipeline/config/run.py +0 -116
- datapipeline/templates/plugin_skeleton/config/contracts/time_hour_sin.synthetic.yaml +0 -24
- datapipeline/templates/plugin_skeleton/config/contracts/time_linear.synthetic.yaml +0 -23
- datapipeline/templates/plugin_skeleton/config/datasets/default/build.yaml +0 -9
- datapipeline/templates/plugin_skeleton/config/datasets/default/dataset.yaml +0 -14
- datapipeline/templates/plugin_skeleton/config/datasets/default/postprocess.yaml +0 -13
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_test.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_train.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/datasets/default/runs/run_val.yaml +0 -10
- datapipeline/templates/plugin_skeleton/config/sources/time_ticks.yaml +0 -11
- datapipeline/transforms/vector.py +0 -210
- jerry_thomas-0.3.0.dist-info/METADATA +0 -502
- jerry_thomas-0.3.0.dist-info/RECORD +0 -139
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/WHEEL +0 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {jerry_thomas-0.3.0.dist-info → jerry_thomas-1.0.0.dist-info}/top_level.txt +0 -0
datapipeline/sources/decoders.py
CHANGED
@@ -1,18 +1,20 @@
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
-from typing import Iterable, Iterator,
+from typing import Iterable, Iterator, Any, Optional
+import codecs
 import csv
+import io
 import json
-
+import pickle
 
 
 class Decoder(ABC):
     @abstractmethod
-    def decode(self,
+    def decode(self, chunks: Iterable[bytes]) -> Iterator[Any]:
         pass
 
-    def count(self,
+    def count(self, chunks: Iterable[bytes]) -> Optional[int]:
         """Optional fast count of rows for the given stream.
 
         Default returns None. Subclasses may override for better visuals.
@@ -21,23 +23,55 @@ class Decoder(ABC):
         return None
 
 
+def _iter_text_lines(chunks: Iterable[bytes], encoding: str) -> Iterator[str]:
+    decoder = codecs.getincrementaldecoder(encoding)()
+    buffer = ""
+    for chunk in chunks:
+        buffer += decoder.decode(chunk)
+        while True:
+            idx = buffer.find("\n")
+            if idx == -1:
+                break
+            line, buffer = buffer[:idx], buffer[idx + 1 :]
+            if line.endswith("\r"):
+                line = line[:-1]
+            yield line
+    buffer += decoder.decode(b"", final=True)
+    if buffer:
+        if buffer.endswith("\r"):
+            buffer = buffer[:-1]
+        yield buffer
+
+
+def _read_all_text(chunks: Iterable[bytes], encoding: str) -> str:
+    decoder = codecs.getincrementaldecoder(encoding)()
+    parts: list[str] = []
+    for chunk in chunks:
+        parts.append(decoder.decode(chunk))
+    parts.append(decoder.decode(b"", final=True))
+    return "".join(parts)
+
+
 class CsvDecoder(Decoder):
-    def __init__(self, *, delimiter: str = ";"):
+    def __init__(self, *, delimiter: str = ";", encoding: str = "utf-8"):
         self.delimiter = delimiter
+        self.encoding = encoding
 
-    def decode(self,
-
-        reader = csv.DictReader(lines, delimiter=self.delimiter)
+    def decode(self, chunks: Iterable[bytes]) -> Iterator[dict]:
+        reader = csv.DictReader(_iter_text_lines(chunks, self.encoding), delimiter=self.delimiter)
         for row in reader:
             yield row
 
-    def count(self,
-        return sum(1 for _ in csv.DictReader(
+    def count(self, chunks: Iterable[bytes]) -> Optional[int]:
+        return sum(1 for _ in csv.DictReader(_iter_text_lines(chunks, self.encoding), delimiter=self.delimiter))
 
 
 class JsonDecoder(Decoder):
-    def
-
+    def __init__(self, *, encoding: str = "utf-8"):
+        self.encoding = encoding
+
+    def decode(self, chunks: Iterable[bytes]) -> Iterator[Any]:
+        text = _read_all_text(chunks, self.encoding)
         data = json.loads(text)
         if isinstance(data, list):
             for item in data:
@@ -46,19 +80,50 @@ class JsonDecoder(Decoder):
             # Yield a single object as one row
             yield data
 
-    def count(self,
-        text =
+    def count(self, chunks: Iterable[bytes]) -> Optional[int]:
+        text = _read_all_text(chunks, self.encoding)
         data = json.loads(text)
         return len(data) if isinstance(data, list) else 1
 
 
 class JsonLinesDecoder(Decoder):
-    def
-
+    def __init__(self, *, encoding: str = "utf-8"):
+        self.encoding = encoding
+
+    def decode(self, chunks: Iterable[bytes]) -> Iterator[dict]:
+        for line in _iter_text_lines(chunks, self.encoding):
             s = line.strip()
             if not s:
                 continue
             yield json.loads(s)
 
-    def count(self,
-        return sum(1 for s in
+    def count(self, chunks: Iterable[bytes]) -> Optional[int]:
+        return sum(1 for s in _iter_text_lines(chunks, self.encoding) if s.strip())
+
+
+class PickleDecoder(Decoder):
+    def decode(self, chunks: Iterable[bytes]) -> Iterator[Any]:
+        buffer = io.BytesIO()
+        for chunk in chunks:
+            buffer.write(chunk)
+        buffer.seek(0)
+        unpickler = pickle.Unpickler(buffer)
+        try:
+            while True:
+                yield unpickler.load()
+        except EOFError:
+            return
+
+    def count(self, chunks: Iterable[bytes]) -> Optional[int]:
+        buffer = io.BytesIO()
+        for chunk in chunks:
+            buffer.write(chunk)
+        buffer.seek(0)
+        unpickler = pickle.Unpickler(buffer)
+        total = 0
+        try:
+            while True:
+                unpickler.load()
+                total += 1
+        except EOFError:
+            return total
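A minimal sketch of driving the reworked byte-oriented decoders directly; the chunk boundaries and sample data below are illustrative, since in the package the chunks normally come from a transport:

from datapipeline.sources.decoders import CsvDecoder

# Arbitrary chunk boundaries; a Transport would normally produce these.
chunks = [b"id;value\n1;10\n", b"2;", b"20\n"]

decoder = CsvDecoder(delimiter=";", encoding="utf-8")
rows = list(decoder.decode(iter(chunks)))   # [{'id': '1', 'value': '10'}, {'id': '2', 'value': '20'}]

# count() makes its own pass over the stream, so hand it a fresh iterable.
total = decoder.count(iter(chunks))         # 2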
datapipeline/sources/factory.py
CHANGED
@@ -2,19 +2,24 @@ from __future__ import annotations
 
 from typing import Any, Dict
 
-from datapipeline.sources.
-from datapipeline.sources.transports import
-from datapipeline.sources.decoders import
+from datapipeline.sources.data_loader import DataLoader
+from datapipeline.sources.transports import FsFileTransport, FsGlobTransport, HttpTransport
+from datapipeline.sources.decoders import (
+    CsvDecoder,
+    JsonDecoder,
+    JsonLinesDecoder,
+    PickleDecoder,
+)
 
 
-def build_loader(*, transport: str, format: str | None = None, **kwargs: Any) ->
+def build_loader(*, transport: str, format: str | None = None, **kwargs: Any) -> DataLoader:
     """Factory entrypoint that composes a transport and a decoder.
 
     Args (by transport/format):
-        transport: "fs" | "
-        format: "csv" | "json" | "json-lines" (required for fs/
+        transport: "fs" | "http"
+        format: "csv" | "json" | "json-lines" | "pickle" (required for fs/http)
         fs: path (str), glob (bool, optional), encoding (str, default utf-8), delimiter (csv only)
-
+        http: url (str), headers (dict, optional), params (dict, optional), encoding (str, default utf-8)
     """
 
     t = (transport or "").lower()
@@ -27,27 +32,32 @@ def build_loader(*, transport: str, format: str | None = None, **kwargs: Any) ->
            raise ValueError("fs transport requires 'path'")
        encoding = kwargs.get("encoding", "utf-8")
        use_glob = bool(kwargs.get("glob", False))
-        source =
-    elif t == "
+        source = FsGlobTransport(path) if use_glob else FsFileTransport(path)
+    elif t == "http":
        url = kwargs.get("url")
        if not url:
-            raise ValueError("
+            raise ValueError("http transport requires 'url'")
        headers: Dict[str, str] = dict(kwargs.get("headers") or {})
+        params: Dict[str, Any] = dict(kwargs.get("params") or {})
        encoding = kwargs.get("encoding", "utf-8")
-        source =
+        source = HttpTransport(url, headers=headers, params=params)
    else:
        raise ValueError(f"unsupported transport: {transport}")
 
    # Build decoder
    if fmt == "csv":
        delimiter = kwargs.get("delimiter", ";")
-        decoder = CsvDecoder(delimiter=delimiter)
+        decoder = CsvDecoder(delimiter=delimiter, encoding=encoding)
    elif fmt == "json":
-        decoder = JsonDecoder()
+        decoder = JsonDecoder(encoding=encoding)
    elif fmt == "json-lines":
-        decoder = JsonLinesDecoder()
+        decoder = JsonLinesDecoder(encoding=encoding)
+    elif fmt == "pickle":
+        if t != "fs":
+            raise ValueError("pickle loader currently supported only for fs transport")
+        decoder = PickleDecoder()
    else:
-        raise ValueError(f"unsupported format for
+        raise ValueError(f"unsupported format for IO loader: {format}")
 
    allow_net = bool(kwargs.get("count_by_fetch", False))
-    return
+    return DataLoader(source, decoder, allow_network_count=allow_net)
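A sketch of calling the factory above; the path, URL, and header values are placeholders, and the trailing loop assumes the returned DataLoader exposes load() like the other loaders in this package:

from datapipeline.sources.factory import build_loader

# Local CSV file via the fs transport (placeholder path).
csv_loader = build_loader(
    transport="fs",
    format="csv",
    path="data/ticks.csv",
    delimiter=";",
    encoding="utf-8",
)

# Remote JSON Lines endpoint via the http transport (placeholder URL, headers, params).
jsonl_loader = build_loader(
    transport="http",
    format="json-lines",
    url="https://example.com/ticks",
    headers={"Accept": "application/json"},
    params={"limit": 100},
)

for row in csv_loader.load():   # assumption: DataLoader.load() yields decoded rows
    print(row)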
datapipeline/sources/models/__init__.py
CHANGED
@@ -1,5 +1,5 @@
 from .base import SourceInterface
-from .loader import
+from .loader import BaseDataLoader, SyntheticLoader
 from .parser import DataParser
 from .generator import DataGenerator
 from .source import Source
@@ -7,7 +7,7 @@ from .synthetic import GenerativeSourceInterface
 
 __all__ = [
     "SourceInterface",
-    "
+    "BaseDataLoader",
     "SyntheticLoader",
     "DataParser",
     "DataGenerator",
datapipeline/sources/models/loader.py
CHANGED
@@ -3,7 +3,7 @@ from typing import Iterator, Any, Optional
 from .generator import DataGenerator
 
 
-class
+class BaseDataLoader(ABC):
     @abstractmethod
     def load(self) -> Iterator[Any]:
         pass
@@ -15,8 +15,8 @@ class RawDataLoader(ABC):
         return self.load()
 
 
-class SyntheticLoader(
-    """Adapter that turns a `DataGenerator` into a `
+class SyntheticLoader(BaseDataLoader):
+    """Adapter that turns a `DataGenerator` into a `BaseDataLoader`.
 
     Keeps the `load()` contract used by the pipeline, while making the
     generative intent explicit and separate from I/O loaders.
datapipeline/sources/models/parsing_error.py
ADDED
@@ -0,0 +1,24 @@
+from typing import Optional, Any
+
+
+class ParsingError(Exception):
+    """Raised when a single row fails to parse."""
+
+    def __init__(
+        self,
+        row: Any,
+        index: Optional[int] = None,
+        original_exc: Optional[BaseException] = None,
+    ):
+        self.row = row
+        self.index = index
+        self.original_exc = original_exc
+
+        prefix = f"Failed to parse row at index {index}: " if index is not None else "Failed to parse row: "
+        message = prefix + repr(row)
+
+        # If there’s an underlying exception, append its type/message
+        if original_exc is not None:
+            message += f" (caused by {type(original_exc).__name__}: {original_exc})"
+
+        super().__init__(message)
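The new exception carries the offending row, its index, and the original error; a small self-contained sketch of the message it builds (the row dict is illustrative):

from datapipeline.sources.models.parsing_error import ParsingError

try:
    int("not-a-number")
except ValueError as exc:
    err = ParsingError(row={"value": "not-a-number"}, index=3, original_exc=exc)

print(err)
# Failed to parse row at index 3: {'value': 'not-a-number'} (caused by ValueError: invalid literal for int() with base 10: 'not-a-number')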
datapipeline/sources/models/source.py
CHANGED
@@ -1,28 +1,28 @@
 from typing import Iterator, Generic, TypeVar, Optional
 from .base import SourceInterface
-from .loader import
+from .loader import BaseDataLoader
 from .parser import DataParser
+from .parsing_error import ParsingError
 
 TRecord = TypeVar("TRecord")
 
 
 class Source(SourceInterface[TRecord], Generic[TRecord]):
-    def __init__(self, loader:
+    def __init__(self, loader: BaseDataLoader, parser: DataParser[TRecord]):
         self.loader = loader
         self.parser = parser
 
     def stream(self) -> Iterator[TRecord]:
-        for row in self.loader.load():
+        for i, row in enumerate(self.loader.load()):
             try:
                 parsed = self.parser.parse(row)
                 if parsed is not None:
                     yield parsed
-            except Exception:
-
+            except Exception as exc:
+                raise ParsingError(row=row, index=i, original_exc=exc) from exc
 
     def count(self) -> Optional[int]:
         try:
             return self.loader.count()
         except Exception:
             return None
-
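What callers of Source.stream() now see on a bad row, sketched with hypothetical stub classes standing in for a real BaseDataLoader and DataParser:

from datapipeline.sources.models import Source
from datapipeline.sources.models.parsing_error import ParsingError

class StubLoader:                    # hypothetical stand-in for a BaseDataLoader
    def load(self):
        yield {"value": "10"}
        yield {"value": "oops"}

class StubParser:                    # hypothetical stand-in for a DataParser
    def parse(self, row):
        return int(row["value"])

try:
    list(Source(StubLoader(), StubParser()).stream())
except ParsingError as err:
    print(err.index, err.row)        # 1 {'value': 'oops'}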
datapipeline/sources/synthetic/time/loader.py
CHANGED
@@ -1,6 +1,7 @@
 from typing import Iterator, Dict, Any, Optional
+import logging
 from datapipeline.sources.models.loader import SyntheticLoader
-from datapipeline.sources.models.generator import DataGenerator
+from datapipeline.sources.models.generator import DataGenerator
 from datapipeline.utils.placeholders import coalesce_missing
 from datapipeline.utils.time import parse_timecode, parse_datetime
 
@@ -24,15 +25,26 @@ class TimeTicksGenerator(DataGenerator):
         return int((self.end - self.start).total_seconds() // secs) + 1
 
 
+logger = logging.getLogger(__name__)
+
+
 def make_time_loader(start: str, end: str, frequency: str | None = "1h") -> SyntheticLoader:
     """Factory entrypoint for synthetic time ticks loader.
 
     Returns a SyntheticLoader that wraps the TimeTicksGenerator.
+
+    Behavior on unresolved dates:
+    - Synthetic sources require explicit start/end bounds. If either `start` or
+      `end` is missing or resolves to an explicit null (MissingInterpolation),
+      raise a ValueError with guidance instead of silently yielding no data.
     """
     start_val = coalesce_missing(start)
     end_val = coalesce_missing(end)
     freq_val = coalesce_missing(frequency, default="1h")
 
     if start_val is None or end_val is None:
-
+        raise ValueError(
+            "synthetic time loader requires non-null start and end; "
+            "set explicit project.globals.start_time/end_time or override source.loader.args."
+        )
     return SyntheticLoader(TimeTicksGenerator(start_val, end_val, freq_val))
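A sketch of the tightened contract, assuming parse_datetime accepts ISO-8601 strings and that coalesce_missing(None) resolves to None (both assumptions, not verified here):

from datapipeline.sources.synthetic.time.loader import make_time_loader

# Explicit bounds: returns a SyntheticLoader wrapping TimeTicksGenerator.
loader = make_time_loader("2024-01-01T00:00:00", "2024-01-02T00:00:00", frequency="1h")

# Missing bounds: now fails loudly instead of silently yielding no data.
try:
    make_time_loader(None, "2024-01-02T00:00:00")
except ValueError as exc:
    print(exc)  # points at project.globals.start_time/end_time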
datapipeline/sources/transports.py
CHANGED
@@ -1,66 +1,103 @@
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
-from typing import Iterable, Iterator, List, Dict, Optional
+from typing import Iterable, Iterator, List, Dict, Optional, Any
 from urllib.request import Request, urlopen
 from urllib.error import URLError, HTTPError
+from urllib.parse import urlparse, parse_qsl, urlencode, urlunparse
 
 
-class
-    """Abstract transport that yields
-
-    Each item from `streams()` is an iterable over text lines (no newlines).
-    """
+class Transport(ABC):
+    """Abstract transport that yields raw-byte streams per resource."""
 
     @abstractmethod
-    def streams(self) -> Iterator[Iterable[
+    def streams(self) -> Iterator[Iterable[bytes]]:
         pass
 
 
-class
-    def __init__(self, path: str, *,
+class FsFileTransport(Transport):
+    def __init__(self, path: str, *, chunk_size: int = 65536):
         self.path = path
-        self.
-
-    def streams(self) -> Iterator[Iterable[
-        def _iter() -> Iterator[
-            with open(self.path, "
-
-
+        self.chunk_size = chunk_size
+
+    def streams(self) -> Iterator[Iterable[bytes]]:
+        def _iter() -> Iterator[bytes]:
+            with open(self.path, "rb") as f:
+                while True:
+                    chunk = f.read(self.chunk_size)
+                    if not chunk:
+                        break
+                    yield chunk
         yield _iter()
 
 
-class
-    def __init__(self, pattern: str, *,
+class FsGlobTransport(Transport):
+    def __init__(self, pattern: str, *, chunk_size: int = 65536):
         import glob as _glob
 
         self.pattern = pattern
-        self.
+        self.chunk_size = chunk_size
         self._files: List[str] = sorted(_glob.glob(pattern))
-
-
-
-
-
-
-
-
+        self._current_path: Optional[str] = None
+
+    @property
+    def files(self) -> List[str]:
+        return list(self._files)
+
+    @property
+    def current_path(self) -> Optional[str]:
+        return self._current_path
+
+    def streams(self) -> Iterator[Iterable[bytes]]:
+        def _iter(path: str) -> Iterator[bytes]:
+            with open(path, "rb") as f:
+                while True:
+                    chunk = f.read(self.chunk_size)
+                    if not chunk:
+                        break
+                    yield chunk
+        try:
+            for p in self._files:
+                self._current_path = p
+                yield _iter(p)
+        finally:
+            self._current_path = None
 
 
-class
-    def __init__(self, url: str,
+class HttpTransport(Transport):
+    def __init__(self, url: str, headers: Optional[Dict[str, str]] = None, params: Optional[Dict[str, Any]] = None, chunk_size: int = 64 * 1024):
         self.url = url
         self.headers = dict(headers or {})
-        self.
+        self.params: Dict[str, Any] = dict(params or {})
+        self.chunk_size = chunk_size
 
-    def
-
+    def _build_url(self) -> str:
+        if not self.params:
+            return self.url
         try:
-
-
+            parsed = urlparse(self.url)
+            existing = parse_qsl(parsed.query, keep_blank_values=True)
+            merged = existing + list(self.params.items())
+            query = urlencode(merged, doseq=True)
+            return urlunparse(parsed._replace(query=query))
+        except Exception:
+            return self.url
+
+    def streams(self) -> Iterator[Iterable[bytes]]:
+        req_url = self._build_url()
+        req = Request(req_url, headers=self.headers)
+
+        try:
+            resp = urlopen(req)
         except (URLError, HTTPError) as e:
             raise RuntimeError(f"failed to fetch {self.url}: {e}") from e
 
-
-
-
+        def byte_stream() -> Iterator[bytes]:
+            with resp:
+                while True:
+                    chunk = resp.read(self.chunk_size)
+                    if not chunk:
+                        break
+                    yield chunk
+
+        yield byte_stream()