etlplus 0.12.13__py3-none-any.whl → 0.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- etlplus/README.md +1 -1
- etlplus/__init__.py +1 -26
- etlplus/api/__init__.py +8 -0
- etlplus/api/endpoint_client.py +3 -3
- etlplus/{run_helpers.py → api/utils.py} +121 -79
- etlplus/cli/handlers.py +17 -7
- etlplus/config/jobs.py +14 -4
- etlplus/dag.py +103 -0
- etlplus/{validation → ops}/README.md +2 -2
- etlplus/ops/__init__.py +57 -0
- etlplus/{extract.py → ops/extract.py} +78 -94
- etlplus/{load.py → ops/load.py} +73 -93
- etlplus/{run.py → ops/run.py} +14 -103
- etlplus/{transform.py → ops/transform.py} +75 -68
- etlplus/{validation → ops}/utils.py +62 -15
- etlplus/{validate.py → ops/validate.py} +19 -9
- etlplus/types.py +2 -2
- {etlplus-0.12.13.dist-info → etlplus-0.14.0.dist-info}/METADATA +4 -4
- {etlplus-0.12.13.dist-info → etlplus-0.14.0.dist-info}/RECORD +23 -22
- etlplus/validation/__init__.py +0 -44
- {etlplus-0.12.13.dist-info → etlplus-0.14.0.dist-info}/WHEEL +0 -0
- {etlplus-0.12.13.dist-info → etlplus-0.14.0.dist-info}/entry_points.txt +0 -0
- {etlplus-0.12.13.dist-info → etlplus-0.14.0.dist-info}/licenses/LICENSE +0 -0
- {etlplus-0.12.13.dist-info → etlplus-0.14.0.dist-info}/top_level.txt +0 -0
etlplus/README.md
CHANGED
|
@@ -23,7 +23,7 @@ Back to project overview: see the top-level [README](../README.md).
|
|
|
23
23
|
## Quickstart
|
|
24
24
|
|
|
25
25
|
```python
|
|
26
|
-
from etlplus import extract, validate, transform, load
|
|
26
|
+
from etlplus.ops import extract, validate, transform, load
|
|
27
27
|
|
|
28
28
|
data = extract("file", "input.csv")
|
|
29
29
|
filtered = transform(data, {"filter": {"field": "age", "op": "gt", "value": 25}})
|
etlplus/__init__.py
CHANGED
|
@@ -2,42 +2,17 @@
|
|
|
2
2
|
:mod:`etlplus` package.
|
|
3
3
|
|
|
4
4
|
Top-level facade for the ETLPlus toolkit.
|
|
5
|
-
|
|
6
|
-
Importing :mod:`etlplus` exposes the handful of coarse-grained helpers most
|
|
7
|
-
users care about: ``extract``, ``transform``, ``load``, ``validate``, and
|
|
8
|
-
``run``. Each helper delegates to the richer modules under ``etlplus.*`` while
|
|
9
|
-
presenting a compact public API surface.
|
|
10
|
-
|
|
11
|
-
Examples
|
|
12
|
-
--------
|
|
13
|
-
>>> from etlplus import extract, transform
|
|
14
|
-
>>> raw = extract('file', 'input.json')
|
|
15
|
-
>>> curated = transform(raw, {'select': ['id', 'name']})
|
|
16
|
-
|
|
17
|
-
See Also
|
|
18
|
-
--------
|
|
19
|
-
- :mod:`etlplus.cli` for the command-line interface
|
|
20
|
-
- :mod:`etlplus.run` for orchestrating pipeline jobs
|
|
21
5
|
"""
|
|
22
6
|
|
|
23
7
|
from .__version__ import __version__
|
|
24
8
|
|
|
25
9
|
__author__ = 'ETLPlus Team'
|
|
26
10
|
|
|
27
|
-
from .extract import extract
|
|
28
|
-
from .load import load
|
|
29
|
-
from .run import run
|
|
30
|
-
from .transform import transform
|
|
31
|
-
from .validate import validate
|
|
32
11
|
|
|
33
12
|
# SECTION: EXPORTS ========================================================== #
|
|
34
13
|
|
|
35
14
|
|
|
36
15
|
__all__ = [
|
|
16
|
+
'__author__',
|
|
37
17
|
'__version__',
|
|
38
|
-
'extract',
|
|
39
|
-
'load',
|
|
40
|
-
'run',
|
|
41
|
-
'transform',
|
|
42
|
-
'validate',
|
|
43
18
|
]
|
etlplus/api/__init__.py
CHANGED
|
@@ -98,6 +98,10 @@ from .types import Headers
|
|
|
98
98
|
from .types import Params
|
|
99
99
|
from .types import RequestOptions
|
|
100
100
|
from .types import Url
|
|
101
|
+
from .utils import compose_api_request_env
|
|
102
|
+
from .utils import compose_api_target_env
|
|
103
|
+
from .utils import paginate_with_client
|
|
104
|
+
from .utils import resolve_request
|
|
101
105
|
|
|
102
106
|
# SECTION: EXPORTS ========================================================== #
|
|
103
107
|
|
|
@@ -122,6 +126,10 @@ __all__ = [
|
|
|
122
126
|
'PaginationType',
|
|
123
127
|
# Functions
|
|
124
128
|
'build_http_adapter',
|
|
129
|
+
'compose_api_request_env',
|
|
130
|
+
'compose_api_target_env',
|
|
131
|
+
'paginate_with_client',
|
|
132
|
+
'resolve_request',
|
|
125
133
|
# Type Aliases
|
|
126
134
|
'CursorPaginationConfigMap',
|
|
127
135
|
'Headers',
|
etlplus/api/endpoint_client.py
CHANGED
|
@@ -455,7 +455,7 @@ class EndpointClient:
|
|
|
455
455
|
-------
|
|
456
456
|
JSONData
|
|
457
457
|
Parsed JSON payload or fallback structure matching
|
|
458
|
-
:func:`etlplus.extract.extract_from_api` semantics.
|
|
458
|
+
:func:`etlplus.ops.extract.extract_from_api` semantics.
|
|
459
459
|
"""
|
|
460
460
|
return self._request_manager.get(url, **kwargs)
|
|
461
461
|
|
|
@@ -479,7 +479,7 @@ class EndpointClient:
|
|
|
479
479
|
-------
|
|
480
480
|
JSONData
|
|
481
481
|
Parsed JSON payload or fallback structure matching
|
|
482
|
-
:func:`etlplus.extract.extract_from_api` semantics.
|
|
482
|
+
:func:`etlplus.ops.extract.extract_from_api` semantics.
|
|
483
483
|
"""
|
|
484
484
|
return self._request_manager.post(url, **kwargs)
|
|
485
485
|
|
|
@@ -506,7 +506,7 @@ class EndpointClient:
|
|
|
506
506
|
-------
|
|
507
507
|
JSONData
|
|
508
508
|
Parsed JSON payload or fallback structure matching
|
|
509
|
-
:func:`etlplus.extract.extract_from_api` semantics.
|
|
509
|
+
:func:`etlplus.ops.extract.extract_from_api` semantics.
|
|
510
510
|
"""
|
|
511
511
|
return self._request_manager.request(method, url, **kwargs)
|
|
512
512
|
|
|
@@ -1,30 +1,13 @@
|
|
|
1
1
|
"""
|
|
2
|
-
:mod:`etlplus.
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
request/load environments, pagination configs, session objects, and endpoint
|
|
6
|
-
clients. Extracted to keep ``run.py`` focused on orchestration while enabling
|
|
7
|
-
reuse and testability.
|
|
8
|
-
|
|
9
|
-
Public (re-export safe) helpers:
|
|
10
|
-
- build_pagination_cfg(pagination, overrides)
|
|
11
|
-
- build_session(cfg)
|
|
12
|
-
- compose_api_request_env(cfg, source_obj, extract_opts)
|
|
13
|
-
- compose_api_target_env(cfg, target_obj, overrides)
|
|
14
|
-
- build_endpoint_client(base_url, base_path, endpoints, env)
|
|
15
|
-
- compute_rl_sleep_seconds(rate_limit, overrides)
|
|
16
|
-
- paginate_with_client(client, endpoint_key, params, headers,
|
|
17
|
-
timeout, pagination, sleep_seconds)
|
|
18
|
-
|
|
19
|
-
Notes
|
|
20
|
-
-----
|
|
21
|
-
These helpers intentionally accept permissive ``Any``/``Mapping`` inputs to
|
|
22
|
-
avoid tight coupling with config dataclasses while keeping runtime flexible.
|
|
2
|
+
:mod:`etlplus.api.utils` module.
|
|
3
|
+
|
|
4
|
+
Shared HTTP helpers for API clients that communicate with REST endpoints.
|
|
23
5
|
"""
|
|
24
6
|
|
|
25
7
|
from __future__ import annotations
|
|
26
8
|
|
|
27
9
|
import inspect
|
|
10
|
+
from collections.abc import Callable
|
|
28
11
|
from collections.abc import Mapping
|
|
29
12
|
from typing import Any
|
|
30
13
|
from typing import TypedDict
|
|
@@ -32,24 +15,33 @@ from typing import cast
|
|
|
32
15
|
|
|
33
16
|
import requests # type: ignore[import]
|
|
34
17
|
|
|
35
|
-
from
|
|
36
|
-
from
|
|
37
|
-
from .
|
|
38
|
-
from .
|
|
39
|
-
from .
|
|
40
|
-
from .
|
|
41
|
-
from .
|
|
42
|
-
from .
|
|
43
|
-
from .
|
|
44
|
-
from .
|
|
45
|
-
from .
|
|
46
|
-
from .
|
|
47
|
-
from .types import
|
|
18
|
+
from ..enums import HttpMethod
|
|
19
|
+
from ..types import Timeout
|
|
20
|
+
from .config import ApiConfig
|
|
21
|
+
from .config import EndpointConfig
|
|
22
|
+
from .endpoint_client import EndpointClient
|
|
23
|
+
from .pagination import PaginationConfig
|
|
24
|
+
from .pagination import PaginationConfigMap
|
|
25
|
+
from .rate_limiting import RateLimitConfig
|
|
26
|
+
from .rate_limiting import RateLimitConfigMap
|
|
27
|
+
from .rate_limiting import RateLimiter
|
|
28
|
+
from .retry_manager import RetryPolicy
|
|
29
|
+
from .types import Headers
|
|
30
|
+
from .types import Params
|
|
31
|
+
from .types import Url
|
|
32
|
+
|
|
33
|
+
# SECTION: CONSTANTS ======================================================== #
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
DEFAULT_TIMEOUT: float = 10.0
|
|
37
|
+
|
|
48
38
|
|
|
49
39
|
# SECTION: EXPORTS ========================================================== #
|
|
50
40
|
|
|
51
41
|
|
|
52
42
|
__all__ = [
|
|
43
|
+
# Constants
|
|
44
|
+
'DEFAULT_TIMEOUT',
|
|
53
45
|
# Functions
|
|
54
46
|
'build_endpoint_client',
|
|
55
47
|
'build_pagination_cfg',
|
|
@@ -58,6 +50,7 @@ __all__ = [
|
|
|
58
50
|
'compose_api_target_env',
|
|
59
51
|
'compute_rl_sleep_seconds',
|
|
60
52
|
'paginate_with_client',
|
|
53
|
+
'resolve_request',
|
|
61
54
|
# Typed Dicts
|
|
62
55
|
'ApiRequestEnv',
|
|
63
56
|
'ApiTargetEnv',
|
|
@@ -68,43 +61,83 @@ __all__ = [
|
|
|
68
61
|
# SECTION: TYPED DICTS ====================================================== #
|
|
69
62
|
|
|
70
63
|
|
|
71
|
-
class
|
|
72
|
-
"""
|
|
64
|
+
class BaseApiHttpEnv(TypedDict, total=False):
|
|
65
|
+
"""
|
|
66
|
+
Common HTTP request environment for API interactions.
|
|
67
|
+
|
|
68
|
+
Fields shared by both source-side and target-side API operations.
|
|
69
|
+
"""
|
|
73
70
|
|
|
71
|
+
# Request details
|
|
74
72
|
url: Url | None
|
|
75
73
|
headers: dict[str, str]
|
|
76
74
|
timeout: Timeout
|
|
75
|
+
|
|
76
|
+
# Session
|
|
77
77
|
session: requests.Session | None
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class ApiRequestEnv(BaseApiHttpEnv, total=False):
|
|
81
|
+
"""
|
|
82
|
+
Composed HTTP request environment configuration for REST API sources.
|
|
83
|
+
|
|
84
|
+
Returned by :func:`compose_api_request_env` and consumed by the API extract
|
|
85
|
+
branch. Values are fully merged with endpoint/API defaults and job-level
|
|
86
|
+
overrides, preserving the original precedence and behavior.
|
|
87
|
+
"""
|
|
88
|
+
|
|
89
|
+
# Client
|
|
78
90
|
use_endpoints: bool
|
|
79
91
|
base_url: str | None
|
|
80
92
|
base_path: str | None
|
|
81
93
|
endpoints_map: dict[str, str] | None
|
|
82
94
|
endpoint_key: str | None
|
|
95
|
+
|
|
96
|
+
# Request
|
|
83
97
|
params: dict[str, Any]
|
|
84
98
|
pagination: PaginationConfigMap | None
|
|
85
99
|
sleep_seconds: float
|
|
100
|
+
|
|
101
|
+
# Reliability
|
|
86
102
|
retry: RetryPolicy | None
|
|
87
103
|
retry_network_errors: bool
|
|
88
104
|
|
|
89
105
|
|
|
90
|
-
class ApiTargetEnv(
|
|
91
|
-
"""
|
|
106
|
+
class ApiTargetEnv(BaseApiHttpEnv, total=False):
|
|
107
|
+
"""
|
|
108
|
+
Composed HTTP request environment configuration for REST API targets.
|
|
109
|
+
|
|
110
|
+
Returned by :func:`compose_api_target_env` and consumed by the API load
|
|
111
|
+
branch. Values are merged from the target object, optional API/endpoint
|
|
112
|
+
reference, and job-level overrides, preserving original precedence and
|
|
113
|
+
behavior.
|
|
114
|
+
|
|
115
|
+
Notes
|
|
116
|
+
-----
|
|
117
|
+
- Precedence for inherited values matches original logic:
|
|
118
|
+
overrides -> target -> API profile defaults.
|
|
119
|
+
- Target composition does not include pagination/rate-limit/retry since
|
|
120
|
+
loads are single-request operations; only headers/timeout/session
|
|
121
|
+
apply.
|
|
122
|
+
"""
|
|
92
123
|
|
|
93
|
-
|
|
94
|
-
headers: dict[str, str]
|
|
95
|
-
timeout: Timeout
|
|
96
|
-
session: requests.Session | None
|
|
124
|
+
# Request
|
|
97
125
|
method: str | None
|
|
98
126
|
|
|
99
127
|
|
|
100
128
|
class SessionConfig(TypedDict, total=False):
|
|
101
|
-
"""
|
|
129
|
+
"""
|
|
130
|
+
Minimal session configuration schema accepted by the
|
|
131
|
+
:class:`requests.Session` runner.
|
|
132
|
+
|
|
133
|
+
Keys mirror common :class:`requests.Session` options; all are optional.
|
|
134
|
+
"""
|
|
102
135
|
|
|
103
136
|
headers: Mapping[str, Any]
|
|
104
137
|
params: Mapping[str, Any]
|
|
105
|
-
auth: Any
|
|
138
|
+
auth: Any # (user, pass) tuple or requests-compatible auth object
|
|
106
139
|
verify: bool | str
|
|
107
|
-
cert: Any
|
|
140
|
+
cert: Any # str or (cert, key)
|
|
108
141
|
proxies: Mapping[str, Any]
|
|
109
142
|
cookies: Mapping[str, Any]
|
|
110
143
|
trust_env: bool
|
|
@@ -113,9 +146,6 @@ class SessionConfig(TypedDict, total=False):
|
|
|
113
146
|
# SECTION: INTERNAL FUNCTIONS ============================================== #
|
|
114
147
|
|
|
115
148
|
|
|
116
|
-
# -- API Environment Composition -- #
|
|
117
|
-
|
|
118
|
-
|
|
119
149
|
def _get_api_cfg_and_endpoint(
|
|
120
150
|
cfg: Any,
|
|
121
151
|
api_name: str,
|
|
@@ -226,9 +256,6 @@ def _merge_session_cfg_three(
|
|
|
226
256
|
return cast(SessionConfig | None, (merged or None))
|
|
227
257
|
|
|
228
258
|
|
|
229
|
-
# -- Mapping Helpers -- #
|
|
230
|
-
|
|
231
|
-
|
|
232
259
|
def _copy_mapping(
|
|
233
260
|
mapping: Mapping[str, Any] | None,
|
|
234
261
|
) -> dict[str, Any]:
|
|
@@ -266,9 +293,6 @@ def _update_mapping(
|
|
|
266
293
|
target.update(extra)
|
|
267
294
|
|
|
268
295
|
|
|
269
|
-
# -- Session -- #
|
|
270
|
-
|
|
271
|
-
|
|
272
296
|
def _build_session_optional(
|
|
273
297
|
cfg: SessionConfig | None,
|
|
274
298
|
) -> requests.Session | None:
|
|
@@ -285,7 +309,6 @@ def _build_session_optional(
|
|
|
285
309
|
requests.Session | None
|
|
286
310
|
Configured session or ``None``.
|
|
287
311
|
"""
|
|
288
|
-
|
|
289
312
|
if isinstance(cfg, dict):
|
|
290
313
|
return build_session(cfg)
|
|
291
314
|
return None
|
|
@@ -294,9 +317,6 @@ def _build_session_optional(
|
|
|
294
317
|
# SECTION: FUNCTIONS ======================================================== #
|
|
295
318
|
|
|
296
319
|
|
|
297
|
-
# -- API Environment Composition -- #
|
|
298
|
-
|
|
299
|
-
|
|
300
320
|
def build_endpoint_client(
|
|
301
321
|
*,
|
|
302
322
|
base_url: str,
|
|
@@ -323,15 +343,7 @@ def build_endpoint_client(
|
|
|
323
343
|
EndpointClient
|
|
324
344
|
The constructed endpoint client.
|
|
325
345
|
"""
|
|
326
|
-
|
|
327
|
-
# propagate here by preferring the class on the run module if present.
|
|
328
|
-
try:
|
|
329
|
-
from . import run as run_mod # local import to avoid cycles
|
|
330
|
-
|
|
331
|
-
ClientClass = getattr(run_mod, 'EndpointClient', EndpointClient)
|
|
332
|
-
except (ImportError, AttributeError): # pragma: no cover - fallback path
|
|
333
|
-
ClientClass = EndpointClient
|
|
334
|
-
return ClientClass(
|
|
346
|
+
return EndpointClient(
|
|
335
347
|
base_url=base_url,
|
|
336
348
|
base_path=base_path,
|
|
337
349
|
endpoints=endpoints,
|
|
@@ -558,9 +570,6 @@ def compose_api_target_env(
|
|
|
558
570
|
}
|
|
559
571
|
|
|
560
572
|
|
|
561
|
-
# -- Pagination -- #
|
|
562
|
-
|
|
563
|
-
|
|
564
573
|
def build_pagination_cfg(
|
|
565
574
|
pagination: PaginationConfig | None,
|
|
566
575
|
overrides: Mapping[str, Any] | None,
|
|
@@ -667,9 +676,6 @@ def build_pagination_cfg(
|
|
|
667
676
|
return cast(PaginationConfigMap, cfg)
|
|
668
677
|
|
|
669
678
|
|
|
670
|
-
# -- Pagination Invocation -- #
|
|
671
|
-
|
|
672
|
-
|
|
673
679
|
def paginate_with_client(
|
|
674
680
|
client: Any,
|
|
675
681
|
endpoint_key: str,
|
|
@@ -727,9 +733,6 @@ def paginate_with_client(
|
|
|
727
733
|
return client.paginate(endpoint_key, **kw_pag)
|
|
728
734
|
|
|
729
735
|
|
|
730
|
-
# -- Rate Limit -- #
|
|
731
|
-
|
|
732
|
-
|
|
733
736
|
def compute_rl_sleep_seconds(
|
|
734
737
|
rate_limit: RateLimitConfig | Mapping[str, Any] | None,
|
|
735
738
|
overrides: Mapping[str, Any] | None,
|
|
@@ -782,9 +785,6 @@ def compute_rl_sleep_seconds(
|
|
|
782
785
|
)
|
|
783
786
|
|
|
784
787
|
|
|
785
|
-
# -- Session -- #
|
|
786
|
-
|
|
787
|
-
|
|
788
788
|
def build_session(
|
|
789
789
|
cfg: SessionConfig | None,
|
|
790
790
|
) -> requests.Session:
|
|
@@ -841,3 +841,45 @@ def build_session(
|
|
|
841
841
|
pass
|
|
842
842
|
|
|
843
843
|
return s
|
|
844
|
+
|
|
845
|
+
|
|
846
|
+
def resolve_request(
|
|
847
|
+
method: HttpMethod | str,
|
|
848
|
+
*,
|
|
849
|
+
session: Any | None = None,
|
|
850
|
+
timeout: Timeout = None,
|
|
851
|
+
) -> tuple[Callable[..., requests.Response], float, HttpMethod]:
|
|
852
|
+
"""
|
|
853
|
+
Resolve a request callable and effective timeout for an HTTP method.
|
|
854
|
+
|
|
855
|
+
Parameters
|
|
856
|
+
----------
|
|
857
|
+
method : HttpMethod | str
|
|
858
|
+
HTTP method to execute.
|
|
859
|
+
session : Any | None, optional
|
|
860
|
+
Requests-compatible session object. Defaults to module-level
|
|
861
|
+
``requests``.
|
|
862
|
+
timeout : Timeout, optional
|
|
863
|
+
Timeout in seconds for the request. Uses ``DEFAULT_TIMEOUT`` when
|
|
864
|
+
omitted.
|
|
865
|
+
|
|
866
|
+
Returns
|
|
867
|
+
-------
|
|
868
|
+
tuple[Callable[..., requests.Response], float, HttpMethod]
|
|
869
|
+
Tuple of (callable, timeout_seconds, resolved_method).
|
|
870
|
+
|
|
871
|
+
Raises
|
|
872
|
+
------
|
|
873
|
+
TypeError
|
|
874
|
+
If the session object does not expose the requested HTTP method.
|
|
875
|
+
"""
|
|
876
|
+
http_method = HttpMethod.coerce(method)
|
|
877
|
+
request_timeout = DEFAULT_TIMEOUT if timeout is None else timeout
|
|
878
|
+
requester = session or requests
|
|
879
|
+
request_callable = getattr(requester, http_method.value, None)
|
|
880
|
+
if not callable(request_callable):
|
|
881
|
+
raise TypeError(
|
|
882
|
+
'Session object must supply a callable '
|
|
883
|
+
f'"{http_method.value}" method',
|
|
884
|
+
)
|
|
885
|
+
return request_callable, request_timeout, http_method
|
etlplus/cli/handlers.py
CHANGED
|
@@ -18,15 +18,16 @@ from ..config import PipelineConfig
|
|
|
18
18
|
from ..config import load_pipeline_config
|
|
19
19
|
from ..database import load_table_spec
|
|
20
20
|
from ..database import render_tables
|
|
21
|
-
from ..extract import extract
|
|
22
21
|
from ..file import File
|
|
23
|
-
from ..
|
|
24
|
-
from ..
|
|
25
|
-
from ..
|
|
22
|
+
from ..file import FileFormat
|
|
23
|
+
from ..ops import extract
|
|
24
|
+
from ..ops import load
|
|
25
|
+
from ..ops import run
|
|
26
|
+
from ..ops import transform
|
|
27
|
+
from ..ops import validate
|
|
28
|
+
from ..ops.validate import FieldRules
|
|
26
29
|
from ..types import JSONData
|
|
27
30
|
from ..types import TemplateKey
|
|
28
|
-
from ..validate import FieldRules
|
|
29
|
-
from ..validate import validate
|
|
30
31
|
from . import io as cli_io
|
|
31
32
|
|
|
32
33
|
# SECTION: EXPORTS ========================================================== #
|
|
@@ -569,8 +570,17 @@ def transform_handler(
|
|
|
569
570
|
|
|
570
571
|
data = transform(payload, cast(TransformOperations, operations_payload))
|
|
571
572
|
|
|
573
|
+
# TODO: Generalize to handle non-file targets.
|
|
572
574
|
if target and target != '-':
|
|
573
|
-
|
|
575
|
+
# Convert target to Path and target_format to FileFormat if needed
|
|
576
|
+
file_path = Path(target)
|
|
577
|
+
file_format = None
|
|
578
|
+
if target_format is not None:
|
|
579
|
+
try:
|
|
580
|
+
file_format = FileFormat(target_format)
|
|
581
|
+
except ValueError:
|
|
582
|
+
file_format = None # or handle error as appropriate
|
|
583
|
+
File(file_path, file_format=file_format).write(data)
|
|
574
584
|
print(f'Data transformed and saved to {target}')
|
|
575
585
|
return 0
|
|
576
586
|
|
etlplus/config/jobs.py
CHANGED
|
@@ -34,10 +34,7 @@ __all__ = [
|
|
|
34
34
|
]
|
|
35
35
|
|
|
36
36
|
|
|
37
|
-
# SECTION:
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
# SECTION: CLASSES ========================================================== #
|
|
37
|
+
# SECTION: DATA CLASSES ===================================================== #
|
|
41
38
|
|
|
42
39
|
|
|
43
40
|
@dataclass(kw_only=True, slots=True)
|
|
@@ -100,6 +97,8 @@ class JobConfig:
|
|
|
100
97
|
Unique job name.
|
|
101
98
|
description : str | None
|
|
102
99
|
Optional human-friendly description.
|
|
100
|
+
depends_on : list[str]
|
|
101
|
+
Optional job dependency list. Dependencies must refer to other jobs.
|
|
103
102
|
extract : ExtractRef | None
|
|
104
103
|
Extraction reference.
|
|
105
104
|
validate : ValidationRef | None
|
|
@@ -114,6 +113,7 @@ class JobConfig:
|
|
|
114
113
|
|
|
115
114
|
name: str
|
|
116
115
|
description: str | None = None
|
|
116
|
+
depends_on: list[str] = field(default_factory=list)
|
|
117
117
|
extract: ExtractRef | None = None
|
|
118
118
|
validate: ValidationRef | None = None
|
|
119
119
|
transform: TransformRef | None = None
|
|
@@ -149,9 +149,19 @@ class JobConfig:
|
|
|
149
149
|
if description is not None and not isinstance(description, str):
|
|
150
150
|
description = str(description)
|
|
151
151
|
|
|
152
|
+
depends_raw = data.get('depends_on')
|
|
153
|
+
depends_on: list[str] = []
|
|
154
|
+
if isinstance(depends_raw, str):
|
|
155
|
+
depends_on = [depends_raw]
|
|
156
|
+
elif isinstance(depends_raw, list):
|
|
157
|
+
for entry in depends_raw:
|
|
158
|
+
if isinstance(entry, str):
|
|
159
|
+
depends_on.append(entry)
|
|
160
|
+
|
|
152
161
|
return cls(
|
|
153
162
|
name=name,
|
|
154
163
|
description=description,
|
|
164
|
+
depends_on=depends_on,
|
|
155
165
|
extract=ExtractRef.from_obj(data.get('extract')),
|
|
156
166
|
validate=ValidationRef.from_obj(data.get('validate')),
|
|
157
167
|
transform=TransformRef.from_obj(data.get('transform')),
|
etlplus/dag.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
"""
|
|
2
|
+
:mod:`etlplus.dag` module.
|
|
3
|
+
|
|
4
|
+
Lightweight directed acyclic graph (DAG) helpers for ordering jobs based on
|
|
5
|
+
``depends_on``.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from collections import deque
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
|
|
13
|
+
from .config.jobs import JobConfig
|
|
14
|
+
|
|
15
|
+
# SECTION: EXPORTS ========================================================== #
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
__all__ = [
|
|
19
|
+
'DagError',
|
|
20
|
+
'topological_sort_jobs',
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# SECTION: ERRORS =========================================================== #
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass(slots=True)
|
|
28
|
+
class DagError(ValueError):
|
|
29
|
+
"""
|
|
30
|
+
Raised when the job dependency graph is invalid.
|
|
31
|
+
|
|
32
|
+
Attributes
|
|
33
|
+
----------
|
|
34
|
+
message : str
|
|
35
|
+
Error message.
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
# -- Attributes -- #
|
|
39
|
+
|
|
40
|
+
message: str
|
|
41
|
+
|
|
42
|
+
# -- Magic Methods (Object Representation) -- #
|
|
43
|
+
|
|
44
|
+
def __str__(self) -> str:
|
|
45
|
+
return self.message
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# SECTION: FUNCTIONS ======================================================== #
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def topological_sort_jobs(
|
|
52
|
+
jobs: list[JobConfig],
|
|
53
|
+
) -> list[JobConfig]:
|
|
54
|
+
"""
|
|
55
|
+
Return jobs in topological order based on ``depends_on``.
|
|
56
|
+
|
|
57
|
+
Parameters
|
|
58
|
+
----------
|
|
59
|
+
jobs : list[JobConfig]
|
|
60
|
+
List of job configurations to sort.
|
|
61
|
+
|
|
62
|
+
Returns
|
|
63
|
+
-------
|
|
64
|
+
list[JobConfig]
|
|
65
|
+
Jobs sorted in topological order.
|
|
66
|
+
|
|
67
|
+
Raises
|
|
68
|
+
------
|
|
69
|
+
DagError
|
|
70
|
+
If a dependency is missing, self-referential, or when a cycle is
|
|
71
|
+
detected.
|
|
72
|
+
"""
|
|
73
|
+
index = {job.name: job for job in jobs}
|
|
74
|
+
edges: dict[str, set[str]] = {name: set() for name in index}
|
|
75
|
+
indegree: dict[str, int] = {name: 0 for name in index}
|
|
76
|
+
|
|
77
|
+
for job in jobs:
|
|
78
|
+
for dep in job.depends_on:
|
|
79
|
+
if dep not in index:
|
|
80
|
+
raise DagError(
|
|
81
|
+
f'Unknown dependency "{dep}" in job "{job.name}"',
|
|
82
|
+
)
|
|
83
|
+
if dep == job.name:
|
|
84
|
+
raise DagError(f'Job "{job.name}" depends on itself')
|
|
85
|
+
if job.name not in edges[dep]:
|
|
86
|
+
edges[dep].add(job.name)
|
|
87
|
+
indegree[job.name] += 1
|
|
88
|
+
|
|
89
|
+
queue = deque(sorted(name for name, deg in indegree.items() if deg == 0))
|
|
90
|
+
ordered: list[str] = []
|
|
91
|
+
|
|
92
|
+
while queue:
|
|
93
|
+
name = queue.popleft()
|
|
94
|
+
ordered.append(name)
|
|
95
|
+
for child in sorted(edges[name]):
|
|
96
|
+
indegree[child] -= 1
|
|
97
|
+
if indegree[child] == 0:
|
|
98
|
+
queue.append(child)
|
|
99
|
+
|
|
100
|
+
if len(ordered) != len(jobs):
|
|
101
|
+
raise DagError('Dependency cycle detected')
|
|
102
|
+
|
|
103
|
+
return [index[name] for name in ordered]
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# etlplus.
|
|
1
|
+
# etlplus.ops subpackage
|
|
2
2
|
|
|
3
3
|
Documentation for the `etlplus.validation` subpackage: data validation utilities and helpers.
|
|
4
4
|
|
|
@@ -8,7 +8,7 @@ Documentation for the `etlplus.validation` subpackage: data validation utilities
|
|
|
8
8
|
|
|
9
9
|
Back to project overview: see the top-level [README](../../README.md).
|
|
10
10
|
|
|
11
|
-
- [etlplus.
|
|
11
|
+
- [etlplus.ops subpackage](#etlplusops-subpackage)
|
|
12
12
|
- [Validation Features](#validation-features)
|
|
13
13
|
- [Defining Validation Rules](#defining-validation-rules)
|
|
14
14
|
- [Example: Validating Data](#example-validating-data)
|
etlplus/ops/__init__.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""
|
|
2
|
+
:mod:`etlplus.ops` package.
|
|
3
|
+
|
|
4
|
+
Data operations helpers.
|
|
5
|
+
|
|
6
|
+
Importing :mod:`etlplus.ops` exposes the coarse-grained helpers most users care
|
|
7
|
+
about: ``extract``, ``transform``, ``load``, ``validate``, and ``run``. Each
|
|
8
|
+
helper delegates to the richer modules under ``etlplus.ops.*`` while
|
|
9
|
+
presenting a compact public API surface. Conditional validation orchestration
|
|
10
|
+
is available via :func:`etlplus.ops.utils.maybe_validate`.
|
|
11
|
+
|
|
12
|
+
Examples
|
|
13
|
+
--------
|
|
14
|
+
>>> from etlplus.ops import extract, transform
|
|
15
|
+
>>> raw = extract('file', 'input.json')
|
|
16
|
+
>>> curated = transform(raw, {'select': ['id', 'name']})
|
|
17
|
+
|
|
18
|
+
>>> from etlplus.ops.utils import maybe_validate
|
|
19
|
+
>>> payload = {'name': 'Alice'}
|
|
20
|
+
>>> rules = {'required': ['name']}
|
|
21
|
+
>>> def validator(data, config):
|
|
22
|
+
... missing = [field for field in config['required'] if field not in data]
|
|
23
|
+
... return {'valid': not missing, 'errors': missing, 'data': data}
|
|
24
|
+
>>> maybe_validate(
|
|
25
|
+
... payload,
|
|
26
|
+
... when='both',
|
|
27
|
+
... enabled=True,
|
|
28
|
+
... rules=rules,
|
|
29
|
+
... phase='before_transform',
|
|
30
|
+
... severity='warn',
|
|
31
|
+
... validate_fn=validator,
|
|
32
|
+
... print_json_fn=lambda message: message,
|
|
33
|
+
... )
|
|
34
|
+
{'name': 'Alice'}
|
|
35
|
+
|
|
36
|
+
See Also
|
|
37
|
+
--------
|
|
38
|
+
:mod:`etlplus.ops.run`
|
|
39
|
+
:mod:`etlplus.ops.utils`
|
|
40
|
+
"""
|
|
41
|
+
|
|
42
|
+
from .extract import extract
|
|
43
|
+
from .load import load
|
|
44
|
+
from .run import run
|
|
45
|
+
from .transform import transform
|
|
46
|
+
from .validate import validate
|
|
47
|
+
|
|
48
|
+
# SECTION: EXPORTS ========================================================== #
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
__all__ = [
|
|
52
|
+
'extract',
|
|
53
|
+
'load',
|
|
54
|
+
'run',
|
|
55
|
+
'transform',
|
|
56
|
+
'validate',
|
|
57
|
+
]
|