etlplus 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- etlplus/__init__.py +43 -0
- etlplus/__main__.py +22 -0
- etlplus/__version__.py +14 -0
- etlplus/api/README.md +237 -0
- etlplus/api/__init__.py +136 -0
- etlplus/api/auth.py +432 -0
- etlplus/api/config.py +633 -0
- etlplus/api/endpoint_client.py +885 -0
- etlplus/api/errors.py +170 -0
- etlplus/api/pagination/__init__.py +47 -0
- etlplus/api/pagination/client.py +188 -0
- etlplus/api/pagination/config.py +440 -0
- etlplus/api/pagination/paginator.py +775 -0
- etlplus/api/rate_limiting/__init__.py +38 -0
- etlplus/api/rate_limiting/config.py +343 -0
- etlplus/api/rate_limiting/rate_limiter.py +266 -0
- etlplus/api/request_manager.py +589 -0
- etlplus/api/retry_manager.py +430 -0
- etlplus/api/transport.py +325 -0
- etlplus/api/types.py +172 -0
- etlplus/cli/__init__.py +15 -0
- etlplus/cli/app.py +1367 -0
- etlplus/cli/handlers.py +775 -0
- etlplus/cli/main.py +616 -0
- etlplus/config/__init__.py +56 -0
- etlplus/config/connector.py +372 -0
- etlplus/config/jobs.py +311 -0
- etlplus/config/pipeline.py +339 -0
- etlplus/config/profile.py +78 -0
- etlplus/config/types.py +204 -0
- etlplus/config/utils.py +120 -0
- etlplus/ddl.py +197 -0
- etlplus/enums.py +414 -0
- etlplus/extract.py +218 -0
- etlplus/file.py +657 -0
- etlplus/load.py +336 -0
- etlplus/mixins.py +62 -0
- etlplus/py.typed +0 -0
- etlplus/run.py +368 -0
- etlplus/run_helpers.py +843 -0
- etlplus/templates/__init__.py +5 -0
- etlplus/templates/ddl.sql.j2 +128 -0
- etlplus/templates/view.sql.j2 +69 -0
- etlplus/transform.py +1049 -0
- etlplus/types.py +227 -0
- etlplus/utils.py +638 -0
- etlplus/validate.py +493 -0
- etlplus/validation/__init__.py +44 -0
- etlplus/validation/utils.py +389 -0
- etlplus-0.5.4.dist-info/METADATA +616 -0
- etlplus-0.5.4.dist-info/RECORD +55 -0
- etlplus-0.5.4.dist-info/WHEEL +5 -0
- etlplus-0.5.4.dist-info/entry_points.txt +2 -0
- etlplus-0.5.4.dist-info/licenses/LICENSE +21 -0
- etlplus-0.5.4.dist-info/top_level.txt +1 -0
etlplus/run.py
ADDED
@@ -0,0 +1,368 @@
+"""
+:mod:`etlplus.run` module.
+
+A module for running ETL jobs defined in YAML configurations.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Mapping
+from typing import Any
+from typing import Final
+from typing import TypedDict
+from typing import cast
+from urllib.parse import urlsplit
+from urllib.parse import urlunsplit
+
+import requests  # type: ignore[import]
+
+from .api import EndpointClient  # noqa: F401 (re-exported for tests)
+from .api import PaginationConfigMap
+from .api import RequestOptions
+from .api import RetryPolicy
+from .api import Url
+from .config import load_pipeline_config
+from .enums import DataConnectorType
+from .enums import coerce_data_connector_type
+from .extract import extract
+from .load import load
+from .run_helpers import compose_api_request_env
+from .run_helpers import compose_api_target_env
+from .run_helpers import paginate_with_client
+from .transform import transform
+from .types import JSONDict
+from .types import Timeout
+from .utils import print_json
+from .validate import validate
+from .validation.utils import maybe_validate
+
+# SECTION: EXPORTS ========================================================== #
+
+
+__all__ = ['run']
+
+
+# SECTION: TYPED DICTS ====================================================== #
+
+
+class BaseApiHttpEnv(TypedDict, total=False):
+    """
+    Common HTTP request environment for API interactions.
+
+    Fields shared by both source-side and target-side API operations.
+    """
+
+    # Request details
+    url: Url | None
+    headers: dict[str, str]
+    timeout: Timeout
+
+    # Session
+    session: requests.Session | None
+
+
+class ApiRequestEnv(BaseApiHttpEnv, total=False):
+    """
+    Composed request environment for API sources.
+
+    Returned by ``compose_api_request_env`` (run_helpers) and consumed by the
+    API extract branch. Values are fully merged with endpoint/API defaults and
+    job-level overrides, preserving the original precedence and behavior.
+    """
+
+    # Client
+    use_endpoints: bool
+    base_url: str | None
+    base_path: str | None
+    endpoints_map: dict[str, str] | None
+    endpoint_key: str | None
+
+    # Request
+    params: dict[str, Any]
+    pagination: PaginationConfigMap | None
+    sleep_seconds: float
+
+    # Reliability
+    retry: RetryPolicy | None
+    retry_network_errors: bool
+
+
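To make the shape above concrete, a composed source environment of the kind ``compose_api_request_env`` returns might look like the following sketch; every value is illustrative rather than taken from the package:

    # Hypothetical composed environment (illustration only).
    env: ApiRequestEnv = {
        'use_endpoints': True,
        'base_url': 'https://api.example.com',
        'base_path': '/v2',
        'endpoints_map': {'users': '/users'},
        'endpoint_key': 'users',
        'params': {'active': 'true'},
        'headers': {'Accept': 'application/json'},
        'timeout': 30.0,
        'sleep_seconds': 0.0,
        'retry_network_errors': True,
    }

With ``use_endpoints`` set and the endpoint fields present, the extract branch below builds an ``EndpointClient`` and paginates through the named endpoint; otherwise it falls back to the plain ``url`` field.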
+class ApiTargetEnv(BaseApiHttpEnv, total=False):
+    """
+    Composed request environment for API targets.
+
+    Returned by ``compose_api_target_env`` (run_helpers) and consumed by the
+    API load branch. Values are merged from the target object, optional
+    API/endpoint reference, and job-level overrides, preserving original
+    precedence and behavior.
+
+    Notes
+    -----
+    - Precedence for inherited values matches original logic:
+      overrides -> target -> API profile defaults.
+    - Target composition does not include pagination/rate-limit/retry since
+      loads are single-request operations; only headers/timeout/session
+      apply.
+    """
+
+    # Request
+    method: str | None
+
+
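The precedence rule in the notes above amounts to a "last writer wins" merge. A minimal sketch with hypothetical values, unpacking the lowest-precedence mapping first (the actual helper may merge nested mappings such as headers more deeply):

    # API profile defaults < target settings < job-level overrides.
    profile_defaults = {
        'timeout': 30.0,
        'headers': {'Accept': 'application/json'},
    }
    target_settings = {'url': 'https://api.example.com/items', 'timeout': 60.0}
    job_overrides = {'method': 'put'}
    merged = {**profile_defaults, **target_settings, **job_overrides}
    # merged['timeout'] == 60.0; merged['method'] == 'put'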
+class SessionConfig(TypedDict, total=False):
+    """
+    Minimal session configuration schema accepted by this runner.
+
+    Keys mirror common requests.Session options; all are optional.
+    """
+
+    headers: Mapping[str, Any]
+    params: Mapping[str, Any]
+    auth: Any  # (user, pass) tuple or requests-compatible auth object
+    verify: bool | str
+    cert: Any  # str or (cert, key)
+    proxies: Mapping[str, Any]
+    cookies: Mapping[str, Any]
+    trust_env: bool
+
+
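Because each key mirrors a ``requests.Session`` attribute of the same name, applying a ``SessionConfig`` is mostly attribute assignment. A minimal sketch; the ``build_session`` helper is hypothetical, not part of the package:

    # Illustrative only: turn a SessionConfig mapping into a live Session.
    def build_session(cfg: SessionConfig) -> requests.Session:
        session = requests.Session()
        session.headers.update(cfg.get('headers', {}))
        session.params = dict(cfg.get('params', {}))
        session.cookies.update(cfg.get('cookies', {}))
        # The remaining keys map one-to-one onto Session attributes.
        for key in ('auth', 'verify', 'cert', 'proxies', 'trust_env'):
            if key in cfg:
                setattr(session, key, cfg[key])
        return session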
+# SECTION: CONSTANTS ======================================================== #
+
+
+DEFAULT_CONFIG_PATH: Final[str] = 'in/pipeline.yml'
+
+
+# SECTION: FUNCTIONS ======================================================== #
+
+
+def run(
+    job: str,
+    config_path: str | None = None,
+) -> JSONDict:
+    """
+    Run a pipeline job defined in a YAML configuration.
+
+    This mirrors the run-mode logic from ``etlplus.cli.cmd_pipeline``
+    (without the list/summary modes). By default it reads the configuration
+    from ``in/pipeline.yml``, but callers can provide an explicit
+    ``config_path`` to override this.
+
+    Parameters
+    ----------
+    job : str
+        Job name to execute.
+    config_path : str | None, optional
+        Path to the pipeline YAML configuration. Defaults to
+        ``in/pipeline.yml``.
+
+    Returns
+    -------
+    JSONDict
+        Result dictionary.
+
+    Raises
+    ------
+    ValueError
+        If the job is not found or if there are configuration issues.
+    """
+    cfg_path = config_path or DEFAULT_CONFIG_PATH
+    cfg = load_pipeline_config(cfg_path, substitute=True)
+
+    # Look up the job by name.
+    if not (job_obj := next((j for j in cfg.jobs if j.name == job), None)):
+        raise ValueError(f'Job not found: {job}')
+
+    # Index sources/targets by name.
+    sources_by_name = {getattr(s, 'name', None): s for s in cfg.sources}
+    targets_by_name = {getattr(t, 'name', None): t for t in cfg.targets}
+
+    # Extract.
+    if not job_obj.extract:
+        raise ValueError('Job missing "extract" section')
+    source_name = job_obj.extract.source
+    if source_name not in sources_by_name:
+        raise ValueError(f'Unknown source: {source_name}')
+    source_obj = sources_by_name[source_name]
+    ex_opts: dict[str, Any] = job_obj.extract.options or {}
+
+    data: Any
+    stype_raw = getattr(source_obj, 'type', None)
+    stype = coerce_data_connector_type(stype_raw or '')
+    match stype:
+        case DataConnectorType.FILE:
+            path = getattr(source_obj, 'path', None)
+            fmt = ex_opts.get('format') or getattr(
+                source_obj,
+                'format',
+                'json',
+            )
+            if not path:
+                raise ValueError('File source missing "path"')
+            data = extract('file', path, file_format=fmt)
+        case DataConnectorType.DATABASE:
+            conn = getattr(source_obj, 'connection_string', '')
+            data = extract('database', conn)
+        case DataConnectorType.API:
+            env = compose_api_request_env(cfg, source_obj, ex_opts)
+            if (
+                env.get('use_endpoints')
+                and env.get('base_url')
+                and env.get('endpoints_map')
+                and env.get('endpoint_key')
+            ):
+                # Construct client using module-level EndpointClient so tests
+                # can monkeypatch this class on etlplus.run.
+                ClientClass = EndpointClient  # noqa: N806
+                client = ClientClass(
+                    base_url=cast(str, env['base_url']),
+                    base_path=cast(str | None, env.get('base_path')),
+                    endpoints=cast(dict[str, str], env['endpoints_map']),
+                    retry=env.get('retry'),
+                    retry_network_errors=bool(
+                        env.get('retry_network_errors', False),
+                    ),
+                    session=env.get('session'),
+                )
+                data = paginate_with_client(
+                    client,
+                    cast(str, env['endpoint_key']),
+                    env.get('params'),
+                    env.get('headers'),
+                    env.get('timeout'),
+                    env.get('pagination'),
+                    cast(float | None, env.get('sleep_seconds')),
+                )
+            else:
+                url = env.get('url')
+                if not url:
+                    raise ValueError('API source missing URL')
+                parts = urlsplit(cast(str, url))
+                base = urlunsplit((parts.scheme, parts.netloc, '', '', ''))
+                ClientClass = EndpointClient  # noqa: N806
+                client = ClientClass(
+                    base_url=base,
+                    base_path=None,
+                    endpoints={},
+                    retry=env.get('retry'),
+                    retry_network_errors=bool(
+                        env.get('retry_network_errors', False),
+                    ),
+                    session=env.get('session'),
+                )
+
+                request_options = RequestOptions(
+                    params=cast(Mapping[str, Any] | None, env.get('params')),
+                    headers=cast(Mapping[str, str] | None, env.get('headers')),
+                    timeout=cast(Timeout | None, env.get('timeout')),
+                )
+
+                data = client.paginate_url(
+                    cast(str, url),
+                    cast(PaginationConfigMap | None, env.get('pagination')),
+                    request=request_options,
+                    sleep_seconds=cast(float, env.get('sleep_seconds', 0.0)),
+                )
+        case _:
+            # ``coerce_data_connector_type`` already raises for invalid
+            # connector types; this branch is defensive only.
+            raise ValueError(f'Unsupported source type: {stype_raw}')
+
+    # DRY: unified validation helper (pre/post transform).
+    val_ref = job_obj.validate
+    enabled_validation = val_ref is not None
+    if enabled_validation:
+        # Type narrowing for static checkers.
+        assert val_ref is not None
+        rules = cfg.validations.get(val_ref.ruleset, {})
+        severity = (val_ref.severity or 'error').lower()
+        phase = (val_ref.phase or 'before_transform').lower()
+    else:
+        rules = {}
+        severity = 'error'
+        phase = 'before_transform'
+
+    # Pre-transform validation (if configured).
+    data = maybe_validate(
+        data,
+        'before_transform',
+        enabled=enabled_validation,
+        rules=rules,
+        phase=phase,
+        severity=severity,
+        validate_fn=validate,  # type: ignore[arg-type]
+        print_json_fn=print_json,
+    )
+
+    # Transform (optional).
+    if job_obj.transform:
+        ops: Any = cfg.transforms.get(job_obj.transform.pipeline, {})
+        data = transform(data, ops)
+
+    # Post-transform validation (if configured).
+    data = maybe_validate(
+        data,
+        'after_transform',
+        enabled=enabled_validation,
+        rules=rules,
+        phase=phase,
+        severity=severity,
+        validate_fn=validate,  # type: ignore[arg-type]
+        print_json_fn=print_json,
+    )
+
+    # Load.
+    if not job_obj.load:
+        raise ValueError('Job missing "load" section')
+    target_name = job_obj.load.target
+    if target_name not in targets_by_name:
+        raise ValueError(f'Unknown target: {target_name}')
+    target_obj = targets_by_name[target_name]
+    overrides = job_obj.load.overrides or {}
+
+    ttype_raw = getattr(target_obj, 'type', None)
+    ttype = coerce_data_connector_type(ttype_raw or '')
+    match ttype:
+        case DataConnectorType.FILE:
+            path = overrides.get('path') or getattr(target_obj, 'path', None)
+            fmt = overrides.get('format') or getattr(
+                target_obj,
+                'format',
+                'json',
+            )
+            if not path:
+                raise ValueError('File target missing "path"')
+            result = load(data, 'file', path, file_format=fmt)
+        case DataConnectorType.API:
+            env_t = compose_api_target_env(cfg, target_obj, overrides)
+            url_t = env_t.get('url')
+            if not url_t:
+                raise ValueError('API target missing "url"')
+            kwargs_t: dict[str, Any] = {}
+            if env_t.get('headers'):
+                kwargs_t['headers'] = cast(dict[str, str], env_t['headers'])
+            if env_t.get('timeout') is not None:
+                kwargs_t['timeout'] = env_t['timeout']
+            if env_t.get('session') is not None:
+                kwargs_t['session'] = env_t['session']
+            result = load(
+                data,
+                'api',
+                cast(str, url_t),
+                method=cast(str | Any, env_t.get('method') or 'post'),
+                **kwargs_t,
+            )
+        case DataConnectorType.DATABASE:
+            conn = overrides.get('connection_string') or getattr(
+                target_obj,
+                'connection_string',
+                '',
+            )
+            result = load(data, 'database', str(conn))
+        case _:
+            # ``coerce_data_connector_type`` already raises for invalid
+            # connector types; this branch is defensive only.
+            raise ValueError(f'Unsupported target type: {ttype_raw}')
+
+    # Return the terminal load result directly; callers (e.g., CLI) can wrap
+    # it in their own envelope when needed.
+    return cast(JSONDict, result)
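A minimal usage sketch, assuming a pipeline config at the default ``in/pipeline.yml`` path that defines a job named ``daily_users`` (the job name and the alternate path are hypothetical):

    # Hypothetical invocation; job name and paths are illustrative.
    from etlplus.run import run

    result = run('daily_users')                       # uses in/pipeline.yml
    result = run('daily_users', 'conf/pipeline.yml')  # explicit config path

Note that when an API source supplies only a plain ``url``, ``run`` derives the client base from the URL's scheme and netloc via ``urlsplit``/``urlunsplit``, so ``https://api.example.com/v1/users?page=2`` yields the base ``https://api.example.com``.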