etlplus 0.15.5__py3-none-any.whl → 0.16.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
etlplus/ops/load.py CHANGED
@@ -8,13 +8,15 @@ from __future__ import annotations
8
8
 
9
9
  import json
10
10
  import sys
11
+ from collections.abc import Mapping
11
12
  from pathlib import Path
12
13
  from typing import Any
13
14
  from typing import cast
14
15
 
15
16
  from ..api import HttpMethod
17
+ from ..api import compose_api_target_env
16
18
  from ..api.utils import resolve_request
17
- from ..enums import DataConnectorType
19
+ from ..connector import DataConnectorType
18
20
  from ..file import File
19
21
  from ..file import FileFormat
20
22
  from ..types import JSONData
@@ -39,6 +41,108 @@ __all__ = [
39
41
  # SECTION: INTERNAL FUNCTIONS ============================================== #
40
42
 
41
43
 
44
+ def _load_data_from_str(
45
+ source: str,
46
+ ) -> JSONData:
47
+ """
48
+ Load JSON data from a string or file path.
49
+
50
+ Parameters
51
+ ----------
52
+ source : str
53
+ Input string representing a file path or JSON payload.
54
+
55
+ Returns
56
+ -------
57
+ JSONData
58
+ Parsed JSON payload.
59
+ """
60
+ # Special case: '-' means read JSON from STDIN (Unix convention).
61
+ if source == '-':
62
+ raw = sys.stdin.read()
63
+ return _parse_json_string(raw)
64
+
65
+ candidate = Path(source)
66
+ if candidate.exists():
67
+ try:
68
+ return File(candidate, FileFormat.JSON).read()
69
+ except (OSError, json.JSONDecodeError, ValueError):
70
+ # Fall back to treating the string as raw JSON content.
71
+ pass
72
+ return _parse_json_string(source)
73
+
74
+
75
+ def _load_to_api_env(
76
+ data: JSONData,
77
+ env: Mapping[str, Any],
78
+ ) -> JSONDict:
79
+ """
80
+ Load data to an API target using a normalized environment.
81
+
82
+ Parameters
83
+ ----------
84
+ data : JSONData
85
+ Payload to load.
86
+ env : Mapping[str, Any]
87
+ Normalized request environment.
88
+
89
+ Returns
90
+ -------
91
+ JSONDict
92
+ Load result payload.
93
+
94
+ Raises
95
+ ------
96
+ ValueError
97
+ If required parameters are missing.
98
+ """
99
+ url = env.get('url')
100
+ if not url:
101
+ raise ValueError('API target missing "url"')
102
+ method = env.get('method') or 'post'
103
+ kwargs: dict[str, Any] = {}
104
+ headers = env.get('headers')
105
+ if headers:
106
+ kwargs['headers'] = cast(dict[str, str], headers)
107
+ if env.get('timeout') is not None:
108
+ kwargs['timeout'] = env.get('timeout')
109
+ session = env.get('session')
110
+ if session is not None:
111
+ kwargs['session'] = session
112
+ extra_kwargs = env.get('request_kwargs')
113
+ if isinstance(extra_kwargs, Mapping):
114
+ kwargs.update(extra_kwargs)
115
+ timeout = kwargs.pop('timeout', 10.0)
116
+ session = kwargs.pop('session', None)
117
+ request_callable, timeout, http_method = resolve_request(
118
+ method,
119
+ session=session,
120
+ timeout=timeout,
121
+ )
122
+ response = request_callable(
123
+ cast(str, url),
124
+ json=data,
125
+ timeout=timeout,
126
+ **kwargs,
127
+ )
128
+ response.raise_for_status()
129
+
130
+ # Try JSON first, fall back to text.
131
+ try:
132
+ payload: Any = response.json()
133
+ except ValueError:
134
+ payload = response.text
135
+
136
+ return {
137
+ 'status': 'success',
138
+ 'status_code': response.status_code,
139
+ 'message': f'Data loaded to {url}',
140
+ 'response': payload,
141
+ 'records': count_records(data),
142
+ 'method': http_method.value.upper(),
143
+ }
144
+
145
+
42
146
  def _parse_json_string(
43
147
  raw: str,
44
148
  ) -> JSONData:
@@ -113,18 +217,7 @@ def load_data(
113
217
  return File(source, FileFormat.JSON).read()
114
218
 
115
219
  if isinstance(source, str):
116
- # Special case: '-' means read JSON from STDIN (Unix convention).
117
- if source == '-':
118
- raw = sys.stdin.read()
119
- return _parse_json_string(raw)
120
- candidate = Path(source)
121
- if candidate.exists():
122
- try:
123
- return File(candidate, FileFormat.JSON).read()
124
- except (OSError, json.JSONDecodeError, ValueError):
125
- # Fall back to treating the string as raw JSON content.
126
- pass
127
- return _parse_json_string(source)
220
+ return _load_data_from_str(source)
128
221
 
129
222
  raise TypeError(
130
223
  'source must be a mapping, sequence of mappings, path, or JSON string',
@@ -158,30 +251,43 @@ def load_to_api(
158
251
  Result dictionary including response payload or text.
159
252
  """
160
253
  # Apply a conservative timeout to guard against hanging requests.
161
- timeout = kwargs.pop('timeout', 10.0)
162
- session = kwargs.pop('session', None)
163
- request_callable, timeout, http_method = resolve_request(
164
- method,
165
- session=session,
166
- timeout=timeout,
167
- )
168
- response = request_callable(url, json=data, timeout=timeout, **kwargs)
169
- response.raise_for_status()
254
+ env = {
255
+ 'url': url,
256
+ 'method': method,
257
+ 'timeout': kwargs.pop('timeout', 10.0),
258
+ 'session': kwargs.pop('session', None),
259
+ 'request_kwargs': kwargs,
260
+ }
261
+ return _load_to_api_env(data, env)
170
262
 
171
- # Try JSON first, fall back to text.
172
- try:
173
- payload: Any = response.json()
174
- except ValueError:
175
- payload = response.text
176
263
 
177
- return {
178
- 'status': 'success',
179
- 'status_code': response.status_code,
180
- 'message': f'Data loaded to {url}',
181
- 'response': payload,
182
- 'records': count_records(data),
183
- 'method': http_method.value.upper(),
184
- }
264
+ def load_to_api_target(
265
+ cfg: Any,
266
+ target_obj: Any,
267
+ overrides: dict[str, Any],
268
+ data: JSONData,
269
+ ) -> JSONDict:
270
+ """
271
+ Load data to an API target connector.
272
+
273
+ Parameters
274
+ ----------
275
+ cfg : Any
276
+ Pipeline configuration.
277
+ target_obj : Any
278
+ Connector configuration.
279
+ overrides : dict[str, Any]
280
+ Load-time overrides.
281
+ data : JSONData
282
+ Payload to load.
283
+
284
+ Returns
285
+ -------
286
+ JSONDict
287
+ Load result.
288
+ """
289
+ env = compose_api_target_env(cfg, target_obj, overrides)
290
+ return _load_to_api_env(data, env)
185
291
 
186
292
 
187
293
  def load_to_database(
etlplus/ops/run.py CHANGED
@@ -6,31 +6,23 @@ A module for running ETL jobs defined in YAML configurations.
6
6
 
7
7
  from __future__ import annotations
8
8
 
9
- from collections.abc import Mapping
10
9
  from typing import Any
11
10
  from typing import Final
12
11
  from typing import cast
13
- from urllib.parse import urlsplit
14
- from urllib.parse import urlunsplit
15
12
 
16
- from ..api import EndpointClient # noqa: F401 (re-exported for tests)
17
13
  from ..api import HttpMethod
18
- from ..api import PaginationConfigMap
19
- from ..api import RequestOptions
20
- from ..api import compose_api_request_env
21
- from ..api import compose_api_target_env
22
- from ..api import paginate_with_client
23
- from ..enums import DataConnectorType
14
+ from ..connector import DataConnectorType
24
15
  from ..file import FileFormat
25
16
  from ..types import JSONData
26
17
  from ..types import JSONDict
27
18
  from ..types import PipelineConfig
28
19
  from ..types import StrPath
29
- from ..types import Timeout
30
20
  from ..utils import print_json
31
21
  from ..workflow import load_pipeline_config
32
22
  from .extract import extract
23
+ from .extract import extract_from_api_source
33
24
  from .load import load
25
+ from .load import load_to_api_target
34
26
  from .transform import transform
35
27
  from .utils import maybe_validate
36
28
  from .validate import validate
@@ -54,6 +46,75 @@ DEFAULT_CONFIG_PATH: Final[str] = 'in/pipeline.yml'
54
46
  # SECTION: INTERNAL FUNCTIONS =============================================== #
55
47
 
56
48
 
49
+ def _index_connectors(
50
+ connectors: list[Any],
51
+ *,
52
+ label: str,
53
+ ) -> dict[str, Any]:
54
+ """
55
+ Index connectors by name with a helpful error on duplicates.
56
+
57
+ Parameters
58
+ ----------
59
+ connectors : list[Any]
60
+ Connector objects to index.
61
+ label : str
62
+ Label used in error messages (e.g., ``"source"``).
63
+
64
+ Returns
65
+ -------
66
+ dict[str, Any]
67
+ Mapping of connector names to connector objects.
68
+
69
+ Raises
70
+ ------
71
+ ValueError
72
+ If duplicate connector names are found.
73
+ """
74
+ indexed: dict[str, Any] = {}
75
+ for connector in connectors:
76
+ name = getattr(connector, 'name', None)
77
+ if not isinstance(name, str) or not name:
78
+ continue
79
+ if name in indexed:
80
+ raise ValueError(f'Duplicate {label} connector name: {name}')
81
+ indexed[name] = connector
82
+ return indexed
83
+
84
+
85
+ def _require_named_connector(
86
+ connectors: dict[str, Any],
87
+ name: str,
88
+ *,
89
+ label: str,
90
+ ) -> Any:
91
+ """
92
+ Return a connector by name or raise a helpful error.
93
+
94
+ Parameters
95
+ ----------
96
+ connectors : dict[str, Any]
97
+ Mapping of connector names to connector objects.
98
+ name : str
99
+ Connector name to retrieve.
100
+ label : str
101
+ Label used in error messages (e.g., ``"source"``).
102
+
103
+ Returns
104
+ -------
105
+ Any
106
+ Connector object.
107
+
108
+ Raises
109
+ ------
110
+ ValueError
111
+ If the connector name is not found.
112
+ """
113
+ if name not in connectors:
114
+ raise ValueError(f'Unknown {label}: {name}')
115
+ return connectors[name]
116
+
117
+
57
118
  def _resolve_validation_config(
58
119
  job_obj: Any,
59
120
  cfg: Any,
@@ -122,16 +183,18 @@ def run(
122
183
  raise ValueError(f'Job not found: {job}')
123
184
 
124
185
  # Index sources/targets by name
125
- sources_by_name = {getattr(s, 'name', None): s for s in cfg.sources}
126
- targets_by_name = {getattr(t, 'name', None): t for t in cfg.targets}
186
+ sources_by_name = _index_connectors(cfg.sources, label='source')
187
+ targets_by_name = _index_connectors(cfg.targets, label='target')
127
188
 
128
189
  # Extract.
129
190
  if not job_obj.extract:
130
191
  raise ValueError('Job missing "extract" section')
131
192
  source_name = job_obj.extract.source
132
- if source_name not in sources_by_name:
133
- raise ValueError(f'Unknown source: {source_name}')
134
- source_obj = sources_by_name[source_name]
193
+ source_obj = _require_named_connector(
194
+ sources_by_name,
195
+ source_name,
196
+ label='source',
197
+ )
135
198
  ex_opts: dict[str, Any] = job_obj.extract.options or {}
136
199
 
137
200
  data: Any
@@ -151,68 +214,7 @@ def run(
151
214
  conn = getattr(source_obj, 'connection_string', '')
152
215
  data = extract('database', conn)
153
216
  case DataConnectorType.API:
154
- env = compose_api_request_env(cfg, source_obj, ex_opts)
155
- if (
156
- env.get('use_endpoints')
157
- and env.get('base_url')
158
- and env.get('endpoints_map')
159
- and env.get('endpoint_key')
160
- ):
161
- # Construct client using module-level EndpointClient so tests
162
- # can monkeypatch this class on etlplus.ops.run.
163
- ClientClass = EndpointClient # noqa: N806
164
- client = ClientClass(
165
- base_url=cast(str, env.get('base_url')),
166
- base_path=cast(str | None, env.get('base_path')),
167
- endpoints=cast(
168
- dict[str, str],
169
- env.get('endpoints_map', {}),
170
- ),
171
- retry=env.get('retry'),
172
- retry_network_errors=bool(
173
- env.get('retry_network_errors', False),
174
- ),
175
- session=env.get('session'),
176
- )
177
- data = paginate_with_client(
178
- client,
179
- cast(str, env.get('endpoint_key')),
180
- env.get('params'),
181
- env.get('headers'),
182
- env.get('timeout'),
183
- env.get('pagination'),
184
- cast(float | None, env.get('sleep_seconds')),
185
- )
186
- else:
187
- url = env.get('url')
188
- if not url:
189
- raise ValueError('API source missing URL')
190
- parts = urlsplit(cast(str, url))
191
- base = urlunsplit((parts.scheme, parts.netloc, '', '', ''))
192
- ClientClass = EndpointClient # noqa: N806
193
- client = ClientClass(
194
- base_url=base,
195
- base_path=None,
196
- endpoints={},
197
- retry=env.get('retry'),
198
- retry_network_errors=bool(
199
- env.get('retry_network_errors', False),
200
- ),
201
- session=env.get('session'),
202
- )
203
-
204
- request_options = RequestOptions(
205
- params=cast(Mapping[str, Any] | None, env.get('params')),
206
- headers=cast(Mapping[str, str] | None, env.get('headers')),
207
- timeout=cast(Timeout | None, env.get('timeout')),
208
- )
209
-
210
- data = client.paginate_url(
211
- cast(str, url),
212
- cast(PaginationConfigMap | None, env.get('pagination')),
213
- request=request_options,
214
- sleep_seconds=cast(float, env.get('sleep_seconds', 0.0)),
215
- )
217
+ data = extract_from_api_source(cfg, source_obj, ex_opts)
216
218
  case _:
217
219
  # :meth:`coerce` already raises for invalid connector types, but
218
220
  # keep explicit guard for defensive programming.
@@ -256,9 +258,11 @@ def run(
256
258
  if not job_obj.load:
257
259
  raise ValueError('Job missing "load" section')
258
260
  target_name = job_obj.load.target
259
- if target_name not in targets_by_name:
260
- raise ValueError(f'Unknown target: {target_name}')
261
- target_obj = targets_by_name[target_name]
261
+ target_obj = _require_named_connector(
262
+ targets_by_name,
263
+ target_name,
264
+ label='target',
265
+ )
262
266
  overrides = job_obj.load.overrides or {}
263
267
 
264
268
  ttype_raw = getattr(target_obj, 'type', None)
@@ -274,26 +278,7 @@ def run(
274
278
  raise ValueError('File target missing "path"')
275
279
  result = load(data, 'file', path, file_format=fmt)
276
280
  case DataConnectorType.API:
277
- env_t = compose_api_target_env(cfg, target_obj, overrides)
278
- url_t = env_t.get('url')
279
- if not url_t:
280
- raise ValueError('API target missing "url"')
281
- kwargs_t: dict[str, Any] = {}
282
- headers = env_t.get('headers')
283
- if headers:
284
- kwargs_t['headers'] = cast(dict[str, str], headers)
285
- if env_t.get('timeout') is not None:
286
- kwargs_t['timeout'] = env_t.get('timeout')
287
- session = env_t.get('session')
288
- if session is not None:
289
- kwargs_t['session'] = session
290
- result = load(
291
- data,
292
- 'api',
293
- cast(str, url_t),
294
- method=cast(str | Any, env_t.get('method') or 'post'),
295
- **kwargs_t,
296
- )
281
+ result = load_to_api_target(cfg, target_obj, overrides, data)
297
282
  case DataConnectorType.DATABASE:
298
283
  conn = overrides.get('connection_string') or getattr(
299
284
  target_obj,
etlplus/ops/transform.py CHANGED
@@ -206,15 +206,12 @@ def _normalize_specs(
206
206
  """
207
207
  if config is None:
208
208
  return []
209
- if isinstance(config, Sequence) and not isinstance(
210
- config,
211
- (str, bytes, bytearray),
212
- ):
209
+ if _is_sequence_not_text(config):
213
210
  # Already a sequence of step specs; normalize to a list.
214
- return list(config) # type: ignore[list-item]
211
+ return list(cast(Sequence[StepSpec], config))
215
212
 
216
213
  # Single spec
217
- return [config]
214
+ return [cast(StepSpec, config)]
218
215
 
219
216
 
220
217
  def _normalize_operation_keys(ops: Mapping[Any, Any]) -> dict[str, Any]:
@@ -702,7 +699,31 @@ def _apply_sort_step(
702
699
  # -- Helpers -- #
703
700
 
704
701
 
705
- def _is_plain_fields_list(obj: Any) -> bool:
702
+ def _is_sequence_not_text(
703
+ obj: Any,
704
+ ) -> bool:
705
+ """
706
+ Return ``True`` for non-text sequences.
707
+
708
+ Parameters
709
+ ----------
710
+ obj : Any
711
+ The object to check.
712
+
713
+ Returns
714
+ -------
715
+ bool
716
+ ``True`` when *obj* is a non-text sequence.
717
+ """
718
+ return isinstance(obj, Sequence) and not isinstance(
719
+ obj,
720
+ (str, bytes, bytearray),
721
+ )
722
+
723
+
724
+ def _is_plain_fields_list(
725
+ obj: Any,
726
+ ) -> bool:
706
727
  """
707
728
  Return True if obj is a non-text sequence of non-mapping items.
708
729
 
@@ -719,10 +740,8 @@ def _is_plain_fields_list(obj: Any) -> bool:
719
740
  True if obj is a non-text sequence of non-mapping items, False
720
741
  otherwise.
721
742
  """
722
- return (
723
- isinstance(obj, Sequence)
724
- and not isinstance(obj, (str, bytes, bytearray))
725
- and not any(isinstance(x, Mapping) for x in obj)
743
+ return _is_sequence_not_text(obj) and not any(
744
+ isinstance(x, Mapping) for x in obj
726
745
  )
727
746
 
728
747
 
etlplus/types.py CHANGED
@@ -12,8 +12,7 @@ Notes
12
12
  See Also
13
13
  --------
14
14
  - :mod:`etlplus.api.types` for HTTP-specific aliases and data classes
15
- - :mod:`etlplus.workflow.types` for workflow-specific aliases and TypedDict
16
- surfaces
15
+ - :mod:`etlplus.connector.types` for connector-specific aliases
17
16
 
18
17
  Examples
19
18
  --------
@@ -81,6 +80,8 @@ __all__ = [
81
80
  # Type Aliases (Networking / Runtime)
82
81
  'Sleeper',
83
82
  'Timeout',
83
+ # Type Aliases (Templates)
84
+ 'TemplateKey',
84
85
  ]
85
86
 
86
87
 
@@ -6,11 +6,6 @@ Job workflow helpers.
6
6
 
7
7
  from __future__ import annotations
8
8
 
9
- from .connector import Connector
10
- from .connector import ConnectorApi
11
- from .connector import ConnectorDb
12
- from .connector import ConnectorFile
13
- from .connector import parse_connector
14
9
  from .dag import topological_sort_jobs
15
10
  from .jobs import ExtractRef
16
11
  from .jobs import JobConfig
@@ -19,25 +14,21 @@ from .jobs import TransformRef
19
14
  from .jobs import ValidationRef
20
15
  from .pipeline import PipelineConfig
21
16
  from .pipeline import load_pipeline_config
17
+ from .profile import ProfileConfig
22
18
 
23
19
  # SECTION: EXPORTS ========================================================== #
24
20
 
25
21
 
26
22
  __all__ = [
27
23
  # Data Classes
28
- 'ConnectorApi',
29
- 'ConnectorDb',
30
- 'ConnectorFile',
31
24
  'ExtractRef',
32
25
  'JobConfig',
33
26
  'LoadRef',
34
27
  'PipelineConfig',
28
+ 'ProfileConfig',
35
29
  'TransformRef',
36
30
  'ValidationRef',
37
31
  # Functions
38
32
  'load_pipeline_config',
39
- 'parse_connector',
40
33
  'topological_sort_jobs',
41
- # Type Aliases
42
- 'Connector',
43
34
  ]
etlplus/workflow/dag.py CHANGED
@@ -47,6 +47,28 @@ class DagError(ValueError):
47
47
  return self.message
48
48
 
49
49
 
50
+ # SECTION: INTERNAL FUNCTIONS =============================================== #
51
+
52
+
53
+ def _ready(
54
+ indegree: dict[str, int],
55
+ ) -> list[str]:
56
+ """
57
+ Return a sorted list of nodes with zero indegree.
58
+
59
+ Parameters
60
+ ----------
61
+ indegree : dict[str, int]
62
+ Mapping of node name to indegree.
63
+
64
+ Returns
65
+ -------
66
+ list[str]
67
+ Sorted list of node names ready to process.
68
+ """
69
+ return sorted(name for name, deg in indegree.items() if deg == 0)
70
+
71
+
50
72
  # SECTION: FUNCTIONS ======================================================== #
51
73
 
52
74
 
@@ -88,7 +110,7 @@ def topological_sort_jobs(
88
110
  edges[dep].add(job.name)
89
111
  indegree[job.name] += 1
90
112
 
91
- queue = deque(sorted(name for name, deg in indegree.items() if deg == 0))
113
+ queue = deque(_ready(indegree))
92
114
  ordered: list[str] = []
93
115
 
94
116
  while queue: