etlplus 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. etlplus/__init__.py +43 -0
  2. etlplus/__main__.py +22 -0
  3. etlplus/__version__.py +14 -0
  4. etlplus/api/README.md +237 -0
  5. etlplus/api/__init__.py +136 -0
  6. etlplus/api/auth.py +432 -0
  7. etlplus/api/config.py +633 -0
  8. etlplus/api/endpoint_client.py +885 -0
  9. etlplus/api/errors.py +170 -0
  10. etlplus/api/pagination/__init__.py +47 -0
  11. etlplus/api/pagination/client.py +188 -0
  12. etlplus/api/pagination/config.py +440 -0
  13. etlplus/api/pagination/paginator.py +775 -0
  14. etlplus/api/rate_limiting/__init__.py +38 -0
  15. etlplus/api/rate_limiting/config.py +343 -0
  16. etlplus/api/rate_limiting/rate_limiter.py +266 -0
  17. etlplus/api/request_manager.py +589 -0
  18. etlplus/api/retry_manager.py +430 -0
  19. etlplus/api/transport.py +325 -0
  20. etlplus/api/types.py +172 -0
  21. etlplus/cli/__init__.py +15 -0
  22. etlplus/cli/app.py +1367 -0
  23. etlplus/cli/handlers.py +775 -0
  24. etlplus/cli/main.py +616 -0
  25. etlplus/config/__init__.py +56 -0
  26. etlplus/config/connector.py +372 -0
  27. etlplus/config/jobs.py +311 -0
  28. etlplus/config/pipeline.py +339 -0
  29. etlplus/config/profile.py +78 -0
  30. etlplus/config/types.py +204 -0
  31. etlplus/config/utils.py +120 -0
  32. etlplus/ddl.py +197 -0
  33. etlplus/enums.py +414 -0
  34. etlplus/extract.py +218 -0
  35. etlplus/file.py +657 -0
  36. etlplus/load.py +336 -0
  37. etlplus/mixins.py +62 -0
  38. etlplus/py.typed +0 -0
  39. etlplus/run.py +368 -0
  40. etlplus/run_helpers.py +843 -0
  41. etlplus/templates/__init__.py +5 -0
  42. etlplus/templates/ddl.sql.j2 +128 -0
  43. etlplus/templates/view.sql.j2 +69 -0
  44. etlplus/transform.py +1049 -0
  45. etlplus/types.py +227 -0
  46. etlplus/utils.py +638 -0
  47. etlplus/validate.py +493 -0
  48. etlplus/validation/__init__.py +44 -0
  49. etlplus/validation/utils.py +389 -0
  50. etlplus-0.5.4.dist-info/METADATA +616 -0
  51. etlplus-0.5.4.dist-info/RECORD +55 -0
  52. etlplus-0.5.4.dist-info/WHEEL +5 -0
  53. etlplus-0.5.4.dist-info/entry_points.txt +2 -0
  54. etlplus-0.5.4.dist-info/licenses/LICENSE +21 -0
  55. etlplus-0.5.4.dist-info/top_level.txt +1 -0
etlplus/run.py ADDED
@@ -0,0 +1,368 @@
1
+ """
2
+ :mod:`etlplus.run` module.
3
+
4
+ A module for running ETL jobs defined in YAML configurations.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from collections.abc import Mapping
10
+ from typing import Any
11
+ from typing import Final
12
+ from typing import TypedDict
13
+ from typing import cast
14
+ from urllib.parse import urlsplit
15
+ from urllib.parse import urlunsplit
16
+
17
+ import requests # type: ignore[import]
18
+
19
+ from .api import EndpointClient # noqa: F401 (re-exported for tests)
20
+ from .api import PaginationConfigMap
21
+ from .api import RequestOptions
22
+ from .api import RetryPolicy
23
+ from .api import Url
24
+ from .config import load_pipeline_config
25
+ from .enums import DataConnectorType
26
+ from .enums import coerce_data_connector_type
27
+ from .extract import extract
28
+ from .load import load
29
+ from .run_helpers import compose_api_request_env
30
+ from .run_helpers import compose_api_target_env
31
+ from .run_helpers import paginate_with_client
32
+ from .transform import transform
33
+ from .types import JSONDict
34
+ from .types import Timeout
35
+ from .utils import print_json
36
+ from .validate import validate
37
+ from .validation.utils import maybe_validate
38
+
39
# SECTION: EXPORTS ========================================================== #


# Public API of this module: ``run`` is the single supported entry point;
# everything else here is an internal composition detail.
__all__ = ['run']
43
+
44
+
45
+ # SECTION: TYPED DICTS ====================================================== #
46
+
47
+
48
class BaseApiHttpEnv(TypedDict, total=False):
    """
    Shared HTTP request environment for API-backed sources and targets.

    Holds the request-level knobs common to both the extract (source) and
    load (target) sides of an API job. All keys are optional
    (``total=False``); absent keys mean "use the caller's default".
    """

    # Where and how to issue the request.
    url: Url | None
    headers: dict[str, str]
    timeout: Timeout

    # Optional pre-configured requests session (auth, proxies, cookies).
    session: requests.Session | None
62
+
63
+
64
class ApiRequestEnv(BaseApiHttpEnv, total=False):
    """
    Fully composed request environment for API *sources*.

    Produced by ``compose_api_request_env`` (see ``run_helpers``) and
    consumed by the API branch of the extract step in :func:`run`. By the
    time this dict is built, endpoint/API defaults and job-level overrides
    have already been merged, so consumers can read values directly without
    re-applying precedence rules.
    """

    # Endpoint-client construction inputs.
    use_endpoints: bool
    base_url: str | None
    base_path: str | None
    endpoints_map: dict[str, str] | None
    endpoint_key: str | None

    # Per-request settings.
    params: dict[str, Any]
    pagination: PaginationConfigMap | None
    sleep_seconds: float

    # Retry/reliability settings.
    retry: RetryPolicy | None
    retry_network_errors: bool
88
+
89
+
90
class ApiTargetEnv(BaseApiHttpEnv, total=False):
    """
    Fully composed request environment for API *targets*.

    Produced by ``compose_api_target_env`` (see ``run_helpers``) and
    consumed by the API branch of the load step in :func:`run`. Values are
    merged from the target object, an optional API/endpoint reference, and
    job-level overrides.

    Notes
    -----
    - Inherited values follow the original precedence:
      overrides -> target -> API profile defaults.
    - No pagination, rate-limit, or retry keys here: a load is a single
      request, so only headers/timeout/session (from the base class) and
      the HTTP method apply.
    """

    # HTTP verb for the load request (defaults to POST downstream).
    method: str | None
110
+
111
+
112
class SessionConfig(TypedDict, total=False):
    """
    Minimal schema for configuring a ``requests.Session``.

    Each key mirrors the attribute of the same name on
    ``requests.Session``; every key is optional, and unrecognized session
    behavior falls back to the requests library defaults.
    """

    headers: Mapping[str, Any]
    params: Mapping[str, Any]
    auth: Any  # (user, pass) tuple or requests-compatible auth object
    verify: bool | str
    cert: Any  # str or (cert, key)
    proxies: Mapping[str, Any]
    cookies: Mapping[str, Any]
    trust_env: bool
127
+
128
+
129
# SECTION: CONSTANTS ======================================================== #


# Default pipeline YAML location used by ``run`` when the caller passes no
# explicit ``config_path``. Relative to the process working directory.
DEFAULT_CONFIG_PATH: Final[str] = 'in/pipeline.yml'
133
+
134
+
135
+ # SECTION: FUNCTIONS ======================================================== #
136
+
137
+
138
def run(
    job: str,
    config_path: str | None = None,
) -> JSONDict:
    """
    Run a pipeline job defined in a YAML configuration.

    This mirrors the run-mode logic from ``etlplus.cli.cmd_pipeline``
    (without the list/summary modes). By default it reads the configuration
    from ``in/pipeline.yml``, but callers can provide an explicit
    ``config_path`` to override this.

    The job executes as: extract -> (validate) -> transform -> (validate)
    -> load, where validation runs at most once (its configured phase).

    Parameters
    ----------
    job : str
        Job name to execute.
    config_path : str | None, optional
        Path to the pipeline YAML configuration. Defaults to
        ``in/pipeline.yml``.

    Returns
    -------
    JSONDict
        Result dictionary.

    Raises
    ------
    ValueError
        If the job is not found or if there are configuration issues.
    """
    cfg_path = config_path or DEFAULT_CONFIG_PATH
    # NOTE(review): substitute=True presumably expands placeholder variables
    # in the YAML — confirm against load_pipeline_config's docs.
    cfg = load_pipeline_config(cfg_path, substitute=True)

    # Lookup job by name
    if not (job_obj := next((j for j in cfg.jobs if j.name == job), None)):
        raise ValueError(f'Job not found: {job}')

    # Index sources/targets by name
    sources_by_name = {getattr(s, 'name', None): s for s in cfg.sources}
    targets_by_name = {getattr(t, 'name', None): t for t in cfg.targets}

    # Extract.
    if not job_obj.extract:
        raise ValueError('Job missing "extract" section')
    source_name = job_obj.extract.source
    if source_name not in sources_by_name:
        raise ValueError(f'Unknown source: {source_name}')
    source_obj = sources_by_name[source_name]
    ex_opts: dict[str, Any] = job_obj.extract.options or {}

    data: Any
    stype_raw = getattr(source_obj, 'type', None)
    stype = coerce_data_connector_type(stype_raw or '')
    match stype:
        case DataConnectorType.FILE:
            path = getattr(source_obj, 'path', None)
            # Precedence: job-level 'format' option wins over the source's
            # own 'format' attribute; falls back to 'json'.
            fmt = ex_opts.get('format') or getattr(
                source_obj,
                'format',
                'json',
            )
            if not path:
                raise ValueError('File source missing "path"')
            data = extract('file', path, file_format=fmt)
        case DataConnectorType.DATABASE:
            conn = getattr(source_obj, 'connection_string', '')
            data = extract('database', conn)
        case DataConnectorType.API:
            env = compose_api_request_env(cfg, source_obj, ex_opts)
            # Endpoint-based path requires all four client inputs; anything
            # less falls through to the raw-URL branch below.
            if (
                env.get('use_endpoints')
                and env.get('base_url')
                and env.get('endpoints_map')
                and env.get('endpoint_key')
            ):
                # Construct client using module-level EndpointClient so tests
                # can monkeypatch this class on etlplus.run.
                ClientClass = EndpointClient  # noqa: N806
                client = ClientClass(
                    base_url=cast(str, env['base_url']),
                    base_path=cast(str | None, env.get('base_path')),
                    endpoints=cast(dict[str, str], env['endpoints_map']),
                    retry=env.get('retry'),
                    retry_network_errors=bool(
                        env.get('retry_network_errors', False),
                    ),
                    session=env.get('session'),
                )
                data = paginate_with_client(
                    client,
                    cast(str, env['endpoint_key']),
                    env.get('params'),
                    env.get('headers'),
                    env.get('timeout'),
                    env.get('pagination'),
                    cast(float | None, env.get('sleep_seconds')),
                )
            else:
                url = env.get('url')
                if not url:
                    raise ValueError('API source missing URL')
                # Derive scheme://netloc from the full URL so the client can
                # be built around a base while paginate_url gets the full URL.
                parts = urlsplit(cast(str, url))
                base = urlunsplit((parts.scheme, parts.netloc, '', '', ''))
                ClientClass = EndpointClient  # noqa: N806
                client = ClientClass(
                    base_url=base,
                    base_path=None,
                    endpoints={},
                    retry=env.get('retry'),
                    retry_network_errors=bool(
                        env.get('retry_network_errors', False),
                    ),
                    session=env.get('session'),
                )

                request_options = RequestOptions(
                    params=cast(Mapping[str, Any] | None, env.get('params')),
                    headers=cast(Mapping[str, str] | None, env.get('headers')),
                    timeout=cast(Timeout | None, env.get('timeout')),
                )

                data = client.paginate_url(
                    cast(str, url),
                    cast(PaginationConfigMap | None, env.get('pagination')),
                    request=request_options,
                    sleep_seconds=cast(float, env.get('sleep_seconds', 0.0)),
                )
        case _:
            # ``coerce_data_connector_type`` already raises for invalid
            # connector types; this branch is defensive only.
            raise ValueError(f'Unsupported source type: {stype_raw}')

    # DRY: unified validation helper (pre/post transform)
    val_ref = job_obj.validate
    enabled_validation = val_ref is not None
    if enabled_validation:
        # Type narrowing for static checkers
        assert val_ref is not None
        rules = cfg.validations.get(val_ref.ruleset, {})
        severity = (val_ref.severity or 'error').lower()
        phase = (val_ref.phase or 'before_transform').lower()
    else:
        rules = {}
        severity = 'error'
        phase = 'before_transform'

    # Pre-transform validation (if configured).
    # maybe_validate only acts when its phase argument matches the configured
    # phase, so only one of the two calls below actually validates.
    data = maybe_validate(
        data,
        'before_transform',
        enabled=enabled_validation,
        rules=rules,
        phase=phase,
        severity=severity,
        validate_fn=validate,  # type: ignore[arg-type]
        print_json_fn=print_json,
    )

    # Transform (optional).
    if job_obj.transform:
        ops: Any = cfg.transforms.get(job_obj.transform.pipeline, {})
        data = transform(data, ops)

    # Post-transform validation (if configured)
    data = maybe_validate(
        data,
        'after_transform',
        enabled=enabled_validation,
        rules=rules,
        phase=phase,
        severity=severity,
        validate_fn=validate,  # type: ignore[arg-type]
        print_json_fn=print_json,
    )

    # Load.
    if not job_obj.load:
        raise ValueError('Job missing "load" section')
    target_name = job_obj.load.target
    if target_name not in targets_by_name:
        raise ValueError(f'Unknown target: {target_name}')
    target_obj = targets_by_name[target_name]
    overrides = job_obj.load.overrides or {}

    ttype_raw = getattr(target_obj, 'type', None)
    ttype = coerce_data_connector_type(ttype_raw or '')
    match ttype:
        case DataConnectorType.FILE:
            # Precedence: job-level overrides win over target attributes.
            path = overrides.get('path') or getattr(target_obj, 'path', None)
            fmt = overrides.get('format') or getattr(
                target_obj,
                'format',
                'json',
            )
            if not path:
                raise ValueError('File target missing "path"')
            result = load(data, 'file', path, file_format=fmt)
        case DataConnectorType.API:
            env_t = compose_api_target_env(cfg, target_obj, overrides)
            url_t = env_t.get('url')
            if not url_t:
                raise ValueError('API target missing "url"')
            # Only pass kwargs that were actually configured so load() keeps
            # its own defaults for the rest.
            kwargs_t: dict[str, Any] = {}
            if env_t.get('headers'):
                kwargs_t['headers'] = cast(dict[str, str], env_t['headers'])
            if env_t.get('timeout') is not None:
                kwargs_t['timeout'] = env_t['timeout']
            if env_t.get('session') is not None:
                kwargs_t['session'] = env_t['session']
            result = load(
                data,
                'api',
                cast(str, url_t),
                method=cast(str | Any, env_t.get('method') or 'post'),
                **kwargs_t,
            )
        case DataConnectorType.DATABASE:
            conn = overrides.get('connection_string') or getattr(
                target_obj,
                'connection_string',
                '',
            )
            result = load(data, 'database', str(conn))
        case _:
            # ``coerce_data_connector_type`` already raises for invalid
            # connector types; this branch is defensive only.
            raise ValueError(f'Unsupported target type: {ttype_raw}')

    # Return the terminal load result directly; callers (e.g., CLI) can wrap
    # it in their own envelope when needed.
    return cast(JSONDict, result)