etlplus 0.16.0__py3-none-any.whl → 0.16.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
etlplus/ops/extract.py CHANGED
@@ -6,11 +6,19 @@ Helpers to extract data from files, databases, and REST APIs.
6
6
 
7
7
  from __future__ import annotations
8
8
 
9
+ from collections.abc import Mapping
9
10
  from pathlib import Path
10
11
  from typing import Any
11
12
  from typing import cast
13
+ from urllib.parse import urlsplit
14
+ from urllib.parse import urlunsplit
12
15
 
16
+ from ..api import EndpointClient
13
17
  from ..api import HttpMethod
18
+ from ..api import PaginationConfigMap
19
+ from ..api import RequestOptions
20
+ from ..api import compose_api_request_env
21
+ from ..api import paginate_with_client
14
22
  from ..api.utils import resolve_request
15
23
  from ..connector import DataConnectorType
16
24
  from ..file import File
@@ -19,6 +27,7 @@ from ..types import JSONData
19
27
  from ..types import JSONDict
20
28
  from ..types import JSONList
21
29
  from ..types import StrPath
30
+ from ..types import Timeout
22
31
 
23
32
  # SECTION: EXPORTS ========================================================== #
24
33
 
@@ -32,50 +41,164 @@ __all__ = [
32
41
  ]
33
42
 
34
43
 
35
- # SECTION: FUNCTIONS ======================================================== #
44
+ # SECTION: INTERNAL FUNCTIONS =============================================== #
36
45
 
37
46
 
38
- def extract_from_api(
39
- url: str,
40
- method: HttpMethod | str = HttpMethod.GET,
41
- **kwargs: Any,
47
def _build_client(
    *,
    base_url: str,
    base_path: str | None,
    endpoints: dict[str, str],
    retry: Any,
    retry_network_errors: bool,
    session: Any,
) -> EndpointClient:
    """
    Create an :class:`EndpointClient` from keyword-only settings.

    Every argument is forwarded to the client constructor under the same
    keyword name; no value is transformed on the way.

    Parameters
    ----------
    base_url : str
        API base URL.
    base_path : str | None
        Base path to prepend for endpoints, or ``None``.
    endpoints : dict[str, str]
        Endpoint name to path mappings.
    retry : Any
        Retry policy configuration.
    retry_network_errors : bool
        Whether to retry on network errors.
    session : Any
        Optional session object handed to the client.

    Returns
    -------
    EndpointClient
        Client instance built from the given settings.
    """
    client_settings: dict[str, Any] = {
        'base_url': base_url,
        'base_path': base_path,
        'endpoints': endpoints,
        'retry': retry,
        'retry_network_errors': retry_network_errors,
        'session': session,
    }
    return EndpointClient(**client_settings)
88
+
89
+
90
def _extract_from_api_env(
    env: Mapping[str, Any],
    *,
    use_client: bool,
) -> JSONData:
    """
    Extract API data from a normalized request environment.

    One of three routes is taken, checked in this order:

    1. ``use_client`` is true and ``env`` carries truthy ``use_endpoints``,
       ``base_url``, ``endpoints_map``, and ``endpoint_key`` values: an
       endpoint client is built and ``paginate_with_client`` is called.
    2. ``use_client`` is true and ``env`` carries a ``url``: a client is
       built for the URL's scheme and host, and ``paginate_url`` is called
       with the full URL.
    3. Otherwise a single request is issued through the callable returned
       by ``resolve_request`` and the response is parsed.

    Parameters
    ----------
    env : Mapping[str, Any]
        Normalized environment describing API request parameters. Keys
        read here: ``use_endpoints``, ``base_url``, ``base_path``,
        ``endpoints_map``, ``endpoint_key``, ``url``, ``method``,
        ``params``, ``headers``, ``timeout``, ``pagination``,
        ``sleep_seconds``, ``retry``, ``retry_network_errors``,
        ``session``, and ``request_kwargs``.
    use_client : bool
        Whether to use the endpoint client/pagination machinery.

    Returns
    -------
    JSONData
        Extracted payload.

    Raises
    ------
    ValueError
        If the endpoint route does not apply and ``env`` has no ``url``.
    """
    retry_network_errors = bool(env.get('retry_network_errors', False))

    endpoint_keys = (
        'use_endpoints',
        'base_url',
        'endpoints_map',
        'endpoint_key',
    )
    if use_client and all(env.get(key) for key in endpoint_keys):
        client = _build_client(
            base_url=cast(str, env.get('base_url')),
            base_path=cast(str | None, env.get('base_path')),
            endpoints=cast(dict[str, str], env.get('endpoints_map')),
            retry=env.get('retry'),
            retry_network_errors=retry_network_errors,
            session=env.get('session'),
        )
        return paginate_with_client(
            client,
            cast(str, env.get('endpoint_key')),
            env.get('params'),
            env.get('headers'),
            env.get('timeout'),
            env.get('pagination'),
            cast(float | None, env.get('sleep_seconds')),
        )

    url = env.get('url')
    if not url:
        raise ValueError('API source missing URL')

    if use_client:
        # Only scheme and host form the client base; the full URL is
        # handed to ``paginate_url`` below.
        parts = urlsplit(cast(str, url))
        base = urlunsplit((parts.scheme, parts.netloc, '', '', ''))
        client = _build_client(
            base_url=base,
            base_path=None,
            endpoints={},
            retry=env.get('retry'),
            retry_network_errors=retry_network_errors,
            session=env.get('session'),
        )
        request_options = RequestOptions(
            params=cast(Mapping[str, Any] | None, env.get('params')),
            headers=cast(Mapping[str, str] | None, env.get('headers')),
            timeout=cast(Timeout | None, env.get('timeout')),
        )
        # ``env.get(key, default)`` only applies the default when the key
        # is absent; ``or`` also covers a key that is present but ``None``.
        sleep_seconds = cast(float, env.get('sleep_seconds') or 0.0)
        return client.paginate_url(
            cast(str, url),
            cast(PaginationConfigMap | None, env.get('pagination')),
            request=request_options,
            sleep_seconds=sleep_seconds,
        )

    # Same None-safe default as above: an explicit ``method=None`` falls
    # back to GET instead of reaching ``resolve_request`` as ``None``.
    method = env.get('method') or HttpMethod.GET
    request_kwargs = dict(env.get('request_kwargs') or {})
    request_callable, timeout, _ = resolve_request(
        method,
        session=env.get('session'),
        timeout=env.get('timeout'),
    )
    response = request_callable(
        cast(str, url),
        timeout=timeout,
        **request_kwargs,
    )
    response.raise_for_status()
    return _parse_api_response(response)
184
+
78
185
 
186
+ def _parse_api_response(
187
+ response: Any,
188
+ ) -> JSONData:
189
+ """
190
+ Parse API responses into a consistent JSON payload.
191
+
192
+ Parameters
193
+ ----------
194
+ response : Any
195
+ HTTP response object exposing ``headers``, ``json()``, and ``text``.
196
+
197
+ Returns
198
+ -------
199
+ JSONData
200
+ Parsed JSON payload, or a fallback object with raw text.
201
+ """
79
202
  content_type = response.headers.get('content-type', '').lower()
80
203
  if 'application/json' in content_type:
81
204
  try:
@@ -99,6 +222,70 @@ def extract_from_api(
99
222
  return {'content': response.text, 'content_type': content_type}
100
223
 
101
224
 
225
+ # SECTION: FUNCTIONS ======================================================== #
226
+
227
+
228
def extract_from_api(
    url: str,
    method: HttpMethod | str = HttpMethod.GET,
    **kwargs: Any,
) -> JSONData:
    """
    Fetch data from a REST API endpoint with a single request.

    The arguments are packed into a request environment and handed to
    ``_extract_from_api_env`` with the endpoint client disabled.

    Parameters
    ----------
    url : str
        API endpoint URL.
    method : HttpMethod | str, optional
        HTTP method to use. Defaults to ``GET``.
    **kwargs : Any
        Extra arguments forwarded to the underlying request call.
        ``timeout`` and ``session`` are removed from ``kwargs`` and passed
        separately; everything left over travels as ``request_kwargs``.
        When ``timeout`` is omitted it is passed as ``None`` and the
        downstream default applies (documented as 10 seconds).

    Returns
    -------
    JSONData
        Parsed JSON payload, or a fallback object with raw text.
    """
    # Pop the two special-cased options first so they are not sent twice.
    timeout = kwargs.pop('timeout', None)
    session = kwargs.pop('session', None)
    request_env: dict[str, Any] = {
        'url': url,
        'method': method,
        'timeout': timeout,
        'session': session,
        'request_kwargs': kwargs,
    }
    return _extract_from_api_env(request_env, use_client=False)
261
+
262
+
263
def extract_from_api_source(
    cfg: Any,
    source_obj: Any,
    overrides: dict[str, Any],
) -> JSONData:
    """
    Run an extraction for a REST API source connector.

    The request environment is composed by ``compose_api_request_env`` and
    then executed with the endpoint client enabled.

    Parameters
    ----------
    cfg : Any
        Pipeline configuration.
    source_obj : Any
        Connector configuration.
    overrides : dict[str, Any]
        Extract-time overrides.

    Returns
    -------
    JSONData
        Extracted payload.
    """
    return _extract_from_api_env(
        compose_api_request_env(cfg, source_obj, overrides),
        use_client=True,
    )
287
+
288
+
102
289
  def extract_from_database(
103
290
  connection_string: str,
104
291
  ) -> JSONList:
etlplus/ops/load.py CHANGED
@@ -8,11 +8,13 @@ from __future__ import annotations
8
8
 
9
9
  import json
10
10
  import sys
11
+ from collections.abc import Mapping
11
12
  from pathlib import Path
12
13
  from typing import Any
13
14
  from typing import cast
14
15
 
15
16
  from ..api import HttpMethod
17
+ from ..api import compose_api_target_env
16
18
  from ..api.utils import resolve_request
17
19
  from ..connector import DataConnectorType
18
20
  from ..file import File
@@ -39,6 +41,108 @@ __all__ = [
39
41
  # SECTION: INTERNAL FUNCTIONS ============================================== #
40
42
 
41
43
 
44
def _load_data_from_str(
    source: str,
) -> JSONData:
    """
    Load JSON data from a string or file path.

    Resolution order:

    1. ``'-'`` reads the JSON text from standard input.
    2. A string naming an existing path is read as a JSON file.
    3. Anything else, or a path that cannot be probed or read, is parsed
       as raw JSON content.

    Parameters
    ----------
    source : str
        Input string representing a file path or JSON payload.

    Returns
    -------
    JSONData
        Parsed JSON payload.
    """
    # Special case: '-' means read JSON from STDIN (Unix convention).
    if source == '-':
        return _parse_json_string(sys.stdin.read())

    candidate = Path(source)
    try:
        # The existence probe is guarded too: a long raw JSON string is
        # not a valid file name, and on some Python versions ``exists()``
        # raises ``OSError`` (file name too long) instead of returning
        # ``False``.
        if candidate.exists():
            return File(candidate, FileFormat.JSON).read()
    except (OSError, json.JSONDecodeError, ValueError):
        # Fall back to treating the string as raw JSON content.
        pass
    return _parse_json_string(source)
73
+
74
+
75
def _load_to_api_env(
    data: JSONData,
    env: Mapping[str, Any],
) -> JSONDict:
    """
    Send ``data`` to an API target described by a normalized environment.

    Parameters
    ----------
    data : JSONData
        Payload to load; sent as the ``json`` argument of the request.
    env : Mapping[str, Any]
        Normalized request environment. Keys read here: ``url``,
        ``method``, ``headers``, ``timeout``, ``session``, and
        ``request_kwargs``. Entries in ``request_kwargs`` take precedence
        over the same-named top-level keys.

    Returns
    -------
    JSONDict
        Result with ``status``, ``status_code``, ``message``,
        ``response``, ``records``, and ``method`` entries.

    Raises
    ------
    ValueError
        If ``env`` has no ``url``.
    """
    url = env.get('url')
    if not url:
        raise ValueError('API target missing "url"')

    # Env-level headers go in first; explicit request kwargs are layered
    # on top so they override same-named entries.
    call_kwargs: dict[str, Any] = {}
    env_headers = env.get('headers')
    if env_headers:
        call_kwargs['headers'] = cast(dict[str, str], env_headers)
    extras = env.get('request_kwargs')
    if isinstance(extras, Mapping):
        call_kwargs.update(extras)

    # ``timeout`` and ``session`` are passed to ``resolve_request`` rather
    # than forwarded as plain keyword arguments.
    if 'timeout' in call_kwargs:
        requested_timeout = call_kwargs.pop('timeout')
    else:
        env_timeout = env.get('timeout')
        requested_timeout = 10.0 if env_timeout is None else env_timeout
    if 'session' in call_kwargs:
        requested_session = call_kwargs.pop('session')
    else:
        requested_session = env.get('session')

    request_callable, timeout, http_method = resolve_request(
        env.get('method') or 'post',
        session=requested_session,
        timeout=requested_timeout,
    )
    response = request_callable(
        cast(str, url),
        json=data,
        timeout=timeout,
        **call_kwargs,
    )
    response.raise_for_status()

    # Try JSON first, fall back to text.
    try:
        payload: Any = response.json()
    except ValueError:
        payload = response.text

    result: JSONDict = {
        'status': 'success',
        'status_code': response.status_code,
        'message': f'Data loaded to {url}',
        'response': payload,
        'records': count_records(data),
        'method': http_method.value.upper(),
    }
    return result
144
+
145
+
42
146
  def _parse_json_string(
43
147
  raw: str,
44
148
  ) -> JSONData:
@@ -113,18 +217,7 @@ def load_data(
113
217
  return File(source, FileFormat.JSON).read()
114
218
 
115
219
  if isinstance(source, str):
116
- # Special case: '-' means read JSON from STDIN (Unix convention).
117
- if source == '-':
118
- raw = sys.stdin.read()
119
- return _parse_json_string(raw)
120
- candidate = Path(source)
121
- if candidate.exists():
122
- try:
123
- return File(candidate, FileFormat.JSON).read()
124
- except (OSError, json.JSONDecodeError, ValueError):
125
- # Fall back to treating the string as raw JSON content.
126
- pass
127
- return _parse_json_string(source)
220
+ return _load_data_from_str(source)
128
221
 
129
222
  raise TypeError(
130
223
  'source must be a mapping, sequence of mappings, path, or JSON string',
@@ -158,30 +251,43 @@ def load_to_api(
158
251
  Result dictionary including response payload or text.
159
252
  """
160
253
  # Apply a conservative timeout to guard against hanging requests.
161
- timeout = kwargs.pop('timeout', 10.0)
162
- session = kwargs.pop('session', None)
163
- request_callable, timeout, http_method = resolve_request(
164
- method,
165
- session=session,
166
- timeout=timeout,
167
- )
168
- response = request_callable(url, json=data, timeout=timeout, **kwargs)
169
- response.raise_for_status()
254
+ env = {
255
+ 'url': url,
256
+ 'method': method,
257
+ 'timeout': kwargs.pop('timeout', 10.0),
258
+ 'session': kwargs.pop('session', None),
259
+ 'request_kwargs': kwargs,
260
+ }
261
+ return _load_to_api_env(data, env)
170
262
 
171
- # Try JSON first, fall back to text.
172
- try:
173
- payload: Any = response.json()
174
- except ValueError:
175
- payload = response.text
176
263
 
177
- return {
178
- 'status': 'success',
179
- 'status_code': response.status_code,
180
- 'message': f'Data loaded to {url}',
181
- 'response': payload,
182
- 'records': count_records(data),
183
- 'method': http_method.value.upper(),
184
- }
264
def load_to_api_target(
    cfg: Any,
    target_obj: Any,
    overrides: dict[str, Any],
    data: JSONData,
) -> JSONDict:
    """
    Send ``data`` to an API target connector.

    The request environment is composed by ``compose_api_target_env`` and
    then executed by ``_load_to_api_env``.

    Parameters
    ----------
    cfg : Any
        Pipeline configuration.
    target_obj : Any
        Connector configuration.
    overrides : dict[str, Any]
        Load-time overrides.
    data : JSONData
        Payload to load.

    Returns
    -------
    JSONDict
        Load result.
    """
    target_env = compose_api_target_env(cfg, target_obj, overrides)
    result = _load_to_api_env(data, target_env)
    return result
185
291
 
186
292
 
187
293
  def load_to_database(