etlplus 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- etlplus/__init__.py +43 -0
- etlplus/__main__.py +22 -0
- etlplus/__version__.py +14 -0
- etlplus/api/README.md +237 -0
- etlplus/api/__init__.py +136 -0
- etlplus/api/auth.py +432 -0
- etlplus/api/config.py +633 -0
- etlplus/api/endpoint_client.py +885 -0
- etlplus/api/errors.py +170 -0
- etlplus/api/pagination/__init__.py +47 -0
- etlplus/api/pagination/client.py +188 -0
- etlplus/api/pagination/config.py +440 -0
- etlplus/api/pagination/paginator.py +775 -0
- etlplus/api/rate_limiting/__init__.py +38 -0
- etlplus/api/rate_limiting/config.py +343 -0
- etlplus/api/rate_limiting/rate_limiter.py +266 -0
- etlplus/api/request_manager.py +589 -0
- etlplus/api/retry_manager.py +430 -0
- etlplus/api/transport.py +325 -0
- etlplus/api/types.py +172 -0
- etlplus/cli/__init__.py +15 -0
- etlplus/cli/app.py +1367 -0
- etlplus/cli/handlers.py +775 -0
- etlplus/cli/main.py +616 -0
- etlplus/config/__init__.py +56 -0
- etlplus/config/connector.py +372 -0
- etlplus/config/jobs.py +311 -0
- etlplus/config/pipeline.py +339 -0
- etlplus/config/profile.py +78 -0
- etlplus/config/types.py +204 -0
- etlplus/config/utils.py +120 -0
- etlplus/ddl.py +197 -0
- etlplus/enums.py +414 -0
- etlplus/extract.py +218 -0
- etlplus/file.py +657 -0
- etlplus/load.py +336 -0
- etlplus/mixins.py +62 -0
- etlplus/py.typed +0 -0
- etlplus/run.py +368 -0
- etlplus/run_helpers.py +843 -0
- etlplus/templates/__init__.py +5 -0
- etlplus/templates/ddl.sql.j2 +128 -0
- etlplus/templates/view.sql.j2 +69 -0
- etlplus/transform.py +1049 -0
- etlplus/types.py +227 -0
- etlplus/utils.py +638 -0
- etlplus/validate.py +493 -0
- etlplus/validation/__init__.py +44 -0
- etlplus/validation/utils.py +389 -0
- etlplus-0.5.4.dist-info/METADATA +616 -0
- etlplus-0.5.4.dist-info/RECORD +55 -0
- etlplus-0.5.4.dist-info/WHEEL +5 -0
- etlplus-0.5.4.dist-info/entry_points.txt +2 -0
- etlplus-0.5.4.dist-info/licenses/LICENSE +21 -0
- etlplus-0.5.4.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,885 @@
|
|
|
1
|
+
"""
|
|
2
|
+
:mod:`etlplus.api.endpoint_client` module.
|
|
3
|
+
|
|
4
|
+
Endpoint client for composing URLs, requests, and pagination.
|
|
5
|
+
|
|
6
|
+
This module provides :class:`EndpointClient`, a small frozen dataclass that
|
|
7
|
+
registers endpoint paths under a base URL, applies retry and rate-limiting
|
|
8
|
+
policies, and wires pagination helpers to fetch JSON records from REST APIs.
|
|
9
|
+
|
|
10
|
+
Notes
|
|
11
|
+
-----
|
|
12
|
+
- Retry-related types live in :mod:`etlplus.api.retry_manager`.
|
|
13
|
+
- Pagination requires a ``PaginationConfig``; see
|
|
14
|
+
:class:`PagePaginationConfigMap` and :class:`CursorPaginationConfigMap` for
|
|
15
|
+
the accepted shapes.
|
|
16
|
+
|
|
17
|
+
Examples
|
|
18
|
+
--------
|
|
19
|
+
>>> # Page-based pagination
|
|
20
|
+
>>> client = EndpointClient(
|
|
21
|
+
... base_url="https://api.example.com/v1",
|
|
22
|
+
... endpoints={"list": "/items"},
|
|
23
|
+
... )
|
|
24
|
+
>>> pg = {"type": "page", "page_size": 100}
|
|
25
|
+
>>> rows = client.paginate("list", pagination=pg)
|
|
26
|
+
|
|
27
|
+
>>> # Cursor-based pagination
|
|
28
|
+
>>> pg = {
|
|
29
|
+
... "type": "cursor",
|
|
30
|
+
... "records_path": "data.items",
|
|
31
|
+
... "cursor_param": "cursor",
|
|
32
|
+
... "cursor_path": "data.nextCursor",
|
|
33
|
+
... "page_size": 100,
|
|
34
|
+
... }
|
|
35
|
+
>>> rows = client.paginate("list", pagination=pg)
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
from __future__ import annotations
|
|
39
|
+
|
|
40
|
+
import time
|
|
41
|
+
from collections.abc import Callable
|
|
42
|
+
from collections.abc import Iterator
|
|
43
|
+
from collections.abc import Mapping
|
|
44
|
+
from collections.abc import Sequence
|
|
45
|
+
from dataclasses import dataclass
|
|
46
|
+
from dataclasses import field
|
|
47
|
+
from types import MappingProxyType
|
|
48
|
+
from types import TracebackType
|
|
49
|
+
from typing import Any
|
|
50
|
+
from typing import ClassVar
|
|
51
|
+
from typing import Self
|
|
52
|
+
from typing import cast
|
|
53
|
+
from urllib.parse import parse_qsl
|
|
54
|
+
from urllib.parse import quote
|
|
55
|
+
from urllib.parse import urlencode
|
|
56
|
+
from urllib.parse import urlsplit
|
|
57
|
+
from urllib.parse import urlunsplit
|
|
58
|
+
|
|
59
|
+
import requests # type: ignore[import]
|
|
60
|
+
|
|
61
|
+
from ..types import JSONData
|
|
62
|
+
from ..types import JSONDict
|
|
63
|
+
from .errors import ApiRequestError
|
|
64
|
+
from .errors import PaginationError
|
|
65
|
+
from .pagination import PaginationClient
|
|
66
|
+
from .pagination import PaginationInput
|
|
67
|
+
from .pagination import Paginator
|
|
68
|
+
from .rate_limiting import RateLimitConfigMap
|
|
69
|
+
from .rate_limiting import RateLimiter
|
|
70
|
+
from .rate_limiting import RateLimitOverrides
|
|
71
|
+
from .request_manager import RequestManager
|
|
72
|
+
from .retry_manager import RetryManager
|
|
73
|
+
from .retry_manager import RetryPolicy
|
|
74
|
+
from .retry_manager import RetryStrategy
|
|
75
|
+
from .transport import HTTPAdapterMountConfig
|
|
76
|
+
from .types import RequestOptions
|
|
77
|
+
from .types import Url
|
|
78
|
+
|
|
79
|
+
# SECTION: CLASSES ========================================================== #
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
@dataclass(frozen=True, slots=True)
|
|
83
|
+
class EndpointClient:
|
|
84
|
+
"""
|
|
85
|
+
Immutable registry of endpoint path templates rooted at a base URL.
|
|
86
|
+
|
|
87
|
+
Summary
|
|
88
|
+
-------
|
|
89
|
+
Provides helpers for composing absolute URLs, paginating responses,
|
|
90
|
+
applying client-wide rate limits, and performing jittered exponential
|
|
91
|
+
backoff retries. The dataclass is frozen and uses ``slots`` for memory
|
|
92
|
+
efficiency; mutating attribute values is disallowed.
|
|
93
|
+
|
|
94
|
+
Parameters
|
|
95
|
+
----------
|
|
96
|
+
base_url : Url
|
|
97
|
+
Absolute base URL, e.g., ``"https://api.example.com/v1"``.
|
|
98
|
+
endpoints : Mapping[str, str]
|
|
99
|
+
Mapping of endpoint keys to relative paths, e.g.,
|
|
100
|
+
``{"list_users": "/users", "user": "/users/{id}"}``.
|
|
101
|
+
base_path : str | None, optional
|
|
102
|
+
Optional base path prefix (``/v2``) prepended to all endpoint
|
|
103
|
+
paths when building URLs.
|
|
104
|
+
retry : RetryPolicy | None, optional
|
|
105
|
+
Optional retry policy. When provided, failed requests matching
|
|
106
|
+
``retry_on`` statuses are retried with full jitter.
|
|
107
|
+
retry_network_errors : bool, optional
|
|
108
|
+
When ``True``, also retry on network errors (timeouts, connection
|
|
109
|
+
resets). Defaults to ``False``.
|
|
110
|
+
rate_limit : RateLimitConfigMap | None, optional
|
|
111
|
+
Optional client-wide rate limit used to derive an inter-request
|
|
112
|
+
delay when an explicit ``sleep_seconds`` isn't supplied.
|
|
113
|
+
session : requests.Session | None, optional
|
|
114
|
+
Explicit HTTP session for all requests.
|
|
115
|
+
session_factory : Callable[[], requests.Session] | None, optional
|
|
116
|
+
Factory used to lazily create a session. Ignored if ``session`` is
|
|
117
|
+
provided.
|
|
118
|
+
session_adapters : Sequence[HTTPAdapterMountConfig] | None, optional
|
|
119
|
+
Adapter mount configuration(s) used to build a session lazily when
|
|
120
|
+
neither ``session`` nor ``session_factory`` is supplied.
|
|
121
|
+
|
|
122
|
+
Attributes
|
|
123
|
+
----------
|
|
124
|
+
base_url : Url
|
|
125
|
+
Absolute base URL.
|
|
126
|
+
endpoints : Mapping[str, str]
|
|
127
|
+
Read-only mapping of endpoint keys to relative paths
|
|
128
|
+
(``MappingProxyType``).
|
|
129
|
+
base_path : str | None
|
|
130
|
+
Optional base path prefix appended after ``base_url``.
|
|
131
|
+
retry : RetryPolicy | None
|
|
132
|
+
Retry policy reference (may be ``None``).
|
|
133
|
+
retry_network_errors : bool
|
|
134
|
+
Whether network errors are retried in addition to HTTP statuses.
|
|
135
|
+
rate_limit : RateLimitConfigMap | None
|
|
136
|
+
Client-wide rate limit configuration (may be ``None``).
|
|
137
|
+
session : requests.Session | None
|
|
138
|
+
Explicit HTTP session used for requests when provided.
|
|
139
|
+
session_factory : Callable[[], requests.Session] | None
|
|
140
|
+
Lazily invoked factory producing a session when needed.
|
|
141
|
+
session_adapters : Sequence[HTTPAdapterMountConfig] | None
|
|
142
|
+
Adapter mount configuration(s) for connection pooling / retries.
|
|
143
|
+
DEFAULT_PAGE_PARAM : ClassVar[str]
|
|
144
|
+
Default page parameter name.
|
|
145
|
+
DEFAULT_SIZE_PARAM : ClassVar[str]
|
|
146
|
+
Default page-size parameter name.
|
|
147
|
+
DEFAULT_START_PAGE : ClassVar[int]
|
|
148
|
+
Default starting page number.
|
|
149
|
+
DEFAULT_PAGE_SIZE : ClassVar[int]
|
|
150
|
+
Default records-per-page when unspecified.
|
|
151
|
+
DEFAULT_CURSOR_PARAM : ClassVar[str]
|
|
152
|
+
Default cursor parameter name.
|
|
153
|
+
DEFAULT_LIMIT_PARAM : ClassVar[str]
|
|
154
|
+
Default limit parameter name used for cursor pagination.
|
|
155
|
+
DEFAULT_RETRY_MAX_ATTEMPTS : ClassVar[int]
|
|
156
|
+
Fallback max attempts when retry policy omits it.
|
|
157
|
+
DEFAULT_RETRY_BACKOFF : ClassVar[float]
|
|
158
|
+
Fallback exponential backoff base seconds.
|
|
159
|
+
DEFAULT_RETRY_ON : ClassVar[tuple[int, ...]]
|
|
160
|
+
Default HTTP status codes eligible for retry.
|
|
161
|
+
DEFAULT_RETRY_CAP : ClassVar[float]
|
|
162
|
+
Maximum sleep seconds for jittered backoff.
|
|
163
|
+
DEFAULT_TIMEOUT : ClassVar[float]
|
|
164
|
+
Default timeout applied to HTTP requests when unspecified.
|
|
165
|
+
|
|
166
|
+
Raises
|
|
167
|
+
------
|
|
168
|
+
ValueError
|
|
169
|
+
If ``base_url`` is not absolute or endpoint keys/values are invalid.
|
|
170
|
+
|
|
171
|
+
Notes
|
|
172
|
+
-----
|
|
173
|
+
- Endpoint mapping is defensively copied and wrapped read-only.
|
|
174
|
+
- Pagination defaults (page size, start page, cursor param, etc.) are
|
|
175
|
+
centralized as class variables.
|
|
176
|
+
- Context manager support (``with EndpointClient(...) as client``)
|
|
177
|
+
manages session lifecycle; owned sessions are closed on exit.
|
|
178
|
+
- Retries use exponential backoff with jitter capped by
|
|
179
|
+
``DEFAULT_RETRY_CAP`` seconds.
|
|
180
|
+
|
|
181
|
+
Examples
|
|
182
|
+
--------
|
|
183
|
+
Basic URL composition
|
|
184
|
+
^^^^^^^^^^^^^^^^^^^^^
|
|
185
|
+
>>> client = EndpointClient(
|
|
186
|
+
... base_url="https://api.example.com/v1",
|
|
187
|
+
... endpoints={"list_users": "/users", "user": "/users/{id}"},
|
|
188
|
+
... )
|
|
189
|
+
>>> client.url("list_users", query_parameters={"active": "true"})
|
|
190
|
+
'https://api.example.com/v1/users?active=true'
|
|
191
|
+
>>> client.url("user", path_parameters={"id": 42})
|
|
192
|
+
'https://api.example.com/v1/users/42'
|
|
193
|
+
|
|
194
|
+
Page pagination with retries
|
|
195
|
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
196
|
+
>>> client = EndpointClient(
|
|
197
|
+
... base_url="https://api.example.com/v1",
|
|
198
|
+
... endpoints={"list": "/items"},
|
|
199
|
+
... retry={"max_attempts": 5, "backoff": 0.5, "retry_on": [429, 503]},
|
|
200
|
+
... retry_network_errors=True,
|
|
201
|
+
... )
|
|
202
|
+
>>> rows = client.paginate(
|
|
203
|
+
... "list",
|
|
204
|
+
... pagination={"type": "page", "page_size": 50},
|
|
205
|
+
... )
|
|
206
|
+
"""
|
|
207
|
+
|
|
208
|
+
# -- Attributes -- #
|
|
209
|
+
|
|
210
|
+
base_url: Url
|
|
211
|
+
endpoints: Mapping[str, str]
|
|
212
|
+
base_path: str | None = None
|
|
213
|
+
|
|
214
|
+
# Optional retry configuration (constructor parameter; object is frozen)
|
|
215
|
+
retry: RetryPolicy | None = None
|
|
216
|
+
retry_network_errors: bool = False
|
|
217
|
+
# Optional client-wide rate limit configuration
|
|
218
|
+
rate_limit: RateLimitConfigMap | None = None
|
|
219
|
+
|
|
220
|
+
# Optional HTTP session or factory
|
|
221
|
+
session: requests.Session | None = None
|
|
222
|
+
session_factory: Callable[[], requests.Session] | None = None
|
|
223
|
+
|
|
224
|
+
# Optional HTTPAdapter mount configuration(s) for transport-level retries
|
|
225
|
+
# and connection pooling. If provided and neither `session` nor
|
|
226
|
+
# `session_factory` is supplied, a factory is synthesized to create a
|
|
227
|
+
# Session and mount the configured adapters lazily.
|
|
228
|
+
session_adapters: Sequence[HTTPAdapterMountConfig] | None = None
|
|
229
|
+
|
|
230
|
+
# Internal: request manager that performs HTTP calls and backs the context-manager protocol.
|
|
231
|
+
_request_manager: RequestManager = field(
|
|
232
|
+
init=False,
|
|
233
|
+
repr=False,
|
|
234
|
+
compare=False,
|
|
235
|
+
)
|
|
236
|
+
|
|
237
|
+
# -- Class Defaults (Centralized) -- #
|
|
238
|
+
|
|
239
|
+
DEFAULT_PAGE_PARAM: ClassVar[str] = 'page'
|
|
240
|
+
DEFAULT_SIZE_PARAM: ClassVar[str] = 'per_page'
|
|
241
|
+
DEFAULT_START_PAGE: ClassVar[int] = 1
|
|
242
|
+
DEFAULT_PAGE_SIZE: ClassVar[int] = 100
|
|
243
|
+
DEFAULT_CURSOR_PARAM: ClassVar[str] = 'cursor'
|
|
244
|
+
DEFAULT_LIMIT_PARAM: ClassVar[str] = 'limit'
|
|
245
|
+
|
|
246
|
+
# Retry defaults (only used if a policy is provided)
|
|
247
|
+
DEFAULT_RETRY_MAX_ATTEMPTS: ClassVar[int] = RetryStrategy.DEFAULT_ATTEMPTS
|
|
248
|
+
DEFAULT_RETRY_BACKOFF: ClassVar[float] = RetryStrategy.DEFAULT_BACKOFF
|
|
249
|
+
DEFAULT_RETRY_ON: ClassVar[tuple[int, ...]] = tuple(
|
|
250
|
+
RetryManager.DEFAULT_STATUS_CODES,
|
|
251
|
+
)
|
|
252
|
+
|
|
253
|
+
# Cap for jittered backoff sleeps (seconds)
|
|
254
|
+
DEFAULT_RETRY_CAP: ClassVar[float] = RetryManager.DEFAULT_CAP
|
|
255
|
+
|
|
256
|
+
# Default timeout applied when callers do not explicitly provide one.
|
|
257
|
+
DEFAULT_TIMEOUT: ClassVar[float] = 10.0
|
|
258
|
+
|
|
259
|
+
# -- Magic Methods (Object Lifecycle) -- #
|
|
260
|
+
|
|
261
|
+
def __post_init__(self) -> None:
    """
    Check constructor arguments and freeze the derived state.

    Verifies that ``base_url`` carries both a scheme and a network
    location, copies ``endpoints`` and rejects entries that are not
    ``str`` keys mapped to non-empty ``str`` values, stores the copy
    behind a read-only proxy, drops ``session_factory`` when an explicit
    ``session`` is also given, converts ``session_adapters`` to a tuple
    (or ``None`` when empty), and finally builds the internal
    :class:`RequestManager` from these settings.

    Raises
    ------
    ValueError
        If ``base_url`` is not absolute or any endpoint entry is invalid.
    """
    # The dataclass is frozen, so every write goes through the base
    # ``object.__setattr__`` instead of normal attribute assignment.
    assign = object.__setattr__

    # An absolute URL needs both a scheme and a network location.
    split = urlsplit(self.base_url)
    if not (split.scheme and split.netloc):
        raise ValueError(
            'base_url must be absolute, e.g. "https://api.example.com"',
        )

    # Work on a private copy so later changes to the caller's mapping
    # cannot leak in, then gather every malformed entry.
    endpoint_map = dict(self.endpoints)
    rejected = []
    for key, path in endpoint_map.items():
        if isinstance(key, str) and isinstance(path, str) and path:
            continue
        rejected.append((key, path))
    if rejected:
        # Only the first few offenders are reported to keep the message
        # short.
        sample = rejected[:3]
        raise ValueError(
            'endpoints must map str -> non-empty str; '
            f'invalid entries: {sample}',
        )
    assign(self, 'endpoints', MappingProxyType(endpoint_map))

    # An explicit session wins over a factory when both are supplied.
    if self.session is not None and self.session_factory is not None:
        assign(self, 'session_factory', None)

    # Adapter configs become an immutable tuple; empty input becomes None.
    frozen_adapters = None
    if self.session_adapters:
        frozen_adapters = tuple(self.session_adapters)
    assign(self, 'session_adapters', frozen_adapters)

    # Hand the finalized settings to the request manager.
    assign(
        self,
        '_request_manager',
        RequestManager(
            retry=self.retry,
            retry_network_errors=self.retry_network_errors,
            default_timeout=self.DEFAULT_TIMEOUT,
            session=self.session,
            session_factory=self.session_factory,
            session_adapters=self.session_adapters,
            retry_cap=self.DEFAULT_RETRY_CAP,
        ),
    )
|
|
319
|
+
|
|
320
|
+
# -- Magic Methods (Context Manager Protocol) -- #
|
|
321
|
+
|
|
322
|
+
def __enter__(self) -> Self:
    """
    Start a ``with`` block for this client.

    Delegates to the internal request manager's ``__enter__`` and hands
    the client itself back to the caller.

    Returns
    -------
    Self
        This client instance.
    """
    manager = self._request_manager
    manager.__enter__()
    return self
|
|
333
|
+
|
|
334
|
+
def __exit__(
    self,
    exc_type: type[BaseException] | None,
    exc: BaseException | None,
    tb: TracebackType | None,
) -> None:
    """
    Finish a ``with`` block for this client.

    Forwards the exception triple to the internal request manager's
    ``__exit__``. The manager's return value is discarded, so this
    method always returns ``None``.

    Parameters
    ----------
    exc_type : type[BaseException] | None
        Type of the exception raised inside the block, or ``None``.
    exc : BaseException | None
        The exception instance raised inside the block, or ``None``.
    tb : TracebackType | None
        Traceback of the raised exception, or ``None``.
    """
    manager = self._request_manager
    manager.__exit__(exc_type, exc, tb)
|
|
353
|
+
|
|
354
|
+
# -- Internal Instance Methods -- #
|
|
355
|
+
|
|
356
|
+
def _build_pagination_client(
    self,
    *,
    pagination: PaginationInput,
    sleep_seconds: float,
    rate_limit_overrides: RateLimitOverrides,
) -> PaginationClient:
    """
    Assemble the :class:`PaginationClient` used to walk pages.

    The pause between requests is resolved from ``sleep_seconds``, the
    client's ``rate_limit`` and the per-call overrides via
    ``_resolve_sleep_seconds``. A fixed rate limiter is attached only
    when that pause is greater than zero.

    Parameters
    ----------
    pagination : PaginationInput
        Pagination configuration passed through to the pagination client.
    sleep_seconds : float
        Requested pause between requests, in seconds.
    rate_limit_overrides : RateLimitOverrides
        Per-call rate limit overrides used when resolving the pause.

    Returns
    -------
    PaginationClient
        Pagination helper that fetches pages through ``_fetch_page``.
    """
    pause = self._resolve_sleep_seconds(
        sleep_seconds,
        self.rate_limit,
        rate_limit_overrides,
    )

    # No limiter at all when there is nothing to wait for.
    limiter = None
    if pause > 0:
        limiter = RateLimiter.fixed(pause)

    return PaginationClient(
        pagination=pagination,
        fetch=self._fetch_page,
        rate_limiter=limiter,
    )
|
|
393
|
+
|
|
394
|
+
def _fetch_page(
    self,
    url_: Url,
    request: RequestOptions,
    page_index: int | None,
) -> JSONData:
    """
    Retrieve one page and translate request failures for pagination.

    Issues a GET for ``url_`` with the keyword arguments produced by
    ``request.as_kwargs()``. An :class:`ApiRequestError` raised by the
    GET is re-raised as :class:`PaginationError` carrying the same
    status, attempt and retry details plus the page index.

    Parameters
    ----------
    url_ : Url
        URL of the page to request.
    request : RequestOptions
        Request snapshot whose ``as_kwargs()`` supplies the call options.
    page_index : int | None
        Page index recorded on the error when the request fails.

    Returns
    -------
    JSONData
        Value returned by ``self.get`` for this page.

    Raises
    ------
    PaginationError
        If ``self.get`` raises :class:`ApiRequestError`.
    """
    # Built outside the ``try`` so only the GET itself is guarded.
    options = request.as_kwargs()
    try:
        payload = self.get(url_, **options)
    except ApiRequestError as err:
        raise PaginationError(
            url=url_,
            status=err.status,
            attempts=err.attempts,
            retried=err.retried,
            retry_policy=err.retry_policy,
            cause=err,
            page=page_index,
        ) from err
    return payload
|
|
435
|
+
|
|
436
|
+
# -- Instance Methods (HTTP Requests) -- #
|
|
437
|
+
|
|
438
|
+
def get(
    self,
    url: Url,
    **kwargs: Any,
) -> JSONData:
    """
    Send a GET request through the internal request manager.

    Parameters
    ----------
    url : Url
        URL to request; passed to the manager unchanged.
    **kwargs : Any
        Extra keyword arguments forwarded as-is to the manager's ``get``
        (e.g., ``params``, ``headers``).

    Returns
    -------
    JSONData
        Value returned by the request manager's ``get`` call.
    """
    manager = self._request_manager
    return manager.get(url, **kwargs)
|
|
461
|
+
|
|
462
|
+
def post(
    self,
    url: Url,
    **kwargs: Any,
) -> JSONData:
    """
    Send a POST request through the internal request manager.

    Parameters
    ----------
    url : Url
        URL to request; passed to the manager unchanged.
    **kwargs : Any
        Extra keyword arguments forwarded as-is to the manager's ``post``
        (e.g., ``params``, ``headers``, ``json``).

    Returns
    -------
    JSONData
        Value returned by the request manager's ``post`` call.
    """
    manager = self._request_manager
    return manager.post(url, **kwargs)
|
|
485
|
+
|
|
486
|
+
def request(
    self,
    method: str,
    url: Url,
    **kwargs: Any,
) -> JSONData:
    """
    Send a request with an arbitrary HTTP method via the request manager.

    Parameters
    ----------
    method : str
        HTTP method name (``'GET'``, ``'POST'``, etc.); passed through
        unchanged.
    url : Url
        URL to request; passed to the manager unchanged.
    **kwargs : Any
        Extra keyword arguments forwarded as-is to the manager's
        ``request`` (e.g., ``params``, ``headers``, ``json``).

    Returns
    -------
    JSONData
        Value returned by the request manager's ``request`` call.
    """
    manager = self._request_manager
    return manager.request(method, url, **kwargs)
|
|
512
|
+
|
|
513
|
+
# -- Instance Methods (HTTP Responses) -- #
|
|
514
|
+
|
|
515
|
+
def paginate(
    self,
    endpoint_key: str,
    *,
    path_parameters: Mapping[str, str] | None = None,
    query_parameters: Mapping[str, str] | None = None,
    pagination: PaginationInput = None,
    request: RequestOptions | None = None,
    sleep_seconds: float = 0.0,
    rate_limit_overrides: RateLimitOverrides = None,
) -> JSONData:
    """
    Fetch a registered endpoint, following pagination when configured.

    Resolves ``endpoint_key`` to a URL with ``self.url(...)`` and then
    hands that URL, together with the remaining options, to
    ``paginate_url``.

    Parameters
    ----------
    endpoint_key : str
        Name of the registered endpoint to call.
    path_parameters : Mapping[str, str] | None
        Placeholder values passed to ``self.url``.
    query_parameters : Mapping[str, str] | None
        Query parameters passed to ``self.url``.
    pagination : PaginationInput, optional
        Pagination configuration forwarded to ``paginate_url``.
    request : RequestOptions | None, optional
        Request snapshot forwarded to ``paginate_url``.
    sleep_seconds : float
        Pause between requests, forwarded to ``paginate_url``.
    rate_limit_overrides : RateLimitOverrides, optional
        Per-call rate limit overrides forwarded to ``paginate_url``.

    Returns
    -------
    JSONData
        Whatever ``paginate_url`` returns for the resolved URL.
    """
    target = self.url(
        endpoint_key,
        path_parameters=path_parameters,
        query_parameters=query_parameters,
    )
    result = self.paginate_url(
        target,
        pagination=pagination,
        request=request,
        sleep_seconds=sleep_seconds,
        rate_limit_overrides=rate_limit_overrides,
    )
    return result
|
|
569
|
+
|
|
570
|
+
def paginate_iter(
    self,
    endpoint_key: str,
    *,
    path_parameters: Mapping[str, str] | None = None,
    query_parameters: Mapping[str, str] | None = None,
    pagination: PaginationInput = None,
    request: RequestOptions | None = None,
    sleep_seconds: float = 0.0,
    rate_limit_overrides: RateLimitOverrides = None,
) -> Iterator[JSONDict]:
    """
    Lazily yield records from a registered endpoint.

    Summary
    -------
    Streaming counterpart of ``paginate``: the endpoint URL is resolved
    with ``self.url(...)`` and every record produced by
    ``paginate_url_iter`` is yielded in turn. Because this is a
    generator, nothing runs until iteration starts.

    Parameters
    ----------
    endpoint_key : str
        Name of the registered endpoint to call.
    path_parameters : Mapping[str, str] | None
        Placeholder values passed to ``self.url``.
    query_parameters : Mapping[str, str] | None
        Query parameters passed to ``self.url``.
    pagination : PaginationInput, optional
        Pagination configuration forwarded to ``paginate_url_iter``.
    request : RequestOptions | None, optional
        Request snapshot forwarded to ``paginate_url_iter``.
    sleep_seconds : float
        Pause between requests, forwarded to ``paginate_url_iter``.
    rate_limit_overrides : RateLimitOverrides, optional
        Per-call rate limit overrides forwarded to ``paginate_url_iter``.

    Yields
    ------
    JSONDict
        Each record produced by ``paginate_url_iter``.
    """
    target = self.url(
        endpoint_key,
        path_parameters=path_parameters,
        query_parameters=query_parameters,
    )
    records = self.paginate_url_iter(
        url=target,
        pagination=pagination,
        request=request,
        sleep_seconds=sleep_seconds,
        rate_limit_overrides=rate_limit_overrides,
    )
    yield from records
|
|
625
|
+
|
|
626
|
+
def paginate_url(
    self,
    url: Url,
    pagination: PaginationInput = None,
    *,
    request: RequestOptions | None = None,
    sleep_seconds: float = 0.0,
    rate_limit_overrides: RateLimitOverrides = None,
) -> JSONData:
    """
    Fetch ``url`` and collect records across pages when paginating.

    The pagination type is read from the ``type`` attribute of a
    non-mapping configuration, or detected with
    ``Paginator.detect_type`` for a mapping or ``None``. With no
    detected type a single GET is issued and its result is returned
    as-is; otherwise the records streamed by ``paginate_url_iter`` are
    gathered into a list.

    Parameters
    ----------
    url : Url
        URL to request or paginate.
    pagination : PaginationInput, optional
        Pagination configuration mapping or configuration object.
    request : RequestOptions | None, optional
        Request snapshot; a fresh ``RequestOptions()`` is used when this
        is falsy.
    sleep_seconds : float
        Pause between requests, forwarded to ``paginate_url_iter``.
    rate_limit_overrides : RateLimitOverrides, optional
        Per-call rate limit overrides forwarded to ``paginate_url_iter``.

    Returns
    -------
    JSONData
        The plain GET result when no pagination type is detected,
        otherwise a list of the records yielded across pages.
    """
    # Work out which pagination flavor, if any, was requested.
    if pagination is None or isinstance(pagination, Mapping):
        mapping_cfg = cast(Mapping[str, Any] | None, pagination)
        ptype = Paginator.detect_type(mapping_cfg, default=None)
    else:
        ptype = getattr(pagination, 'type', None)

    request_obj = request or RequestOptions()

    if ptype is not None:
        # Go through ``paginate_url_iter`` so subclass overrides of the
        # streaming method are honored; the request snapshot is passed
        # once rather than re-specifying its individual pieces.
        records = self.paginate_url_iter(
            url,
            pagination=pagination,
            request=request_obj,
            sleep_seconds=sleep_seconds,
            rate_limit_overrides=rate_limit_overrides,
        )
        return list(records)

    # No recognized pagination: return the single response untouched.
    return self.get(url, **request_obj.as_kwargs())
|
|
684
|
+
|
|
685
|
+
def paginate_url_iter(
    self,
    url: Url,
    pagination: PaginationInput = None,
    *,
    request: RequestOptions | None = None,
    sleep_seconds: float = 0.0,
    rate_limit_overrides: RateLimitOverrides = None,
) -> Iterator[JSONDict]:
    """
    Lazily yield records while paginating ``url``.

    Builds a pagination client with ``_build_pagination_client`` and
    yields everything its ``iterate`` method produces for ``url``.

    Parameters
    ----------
    url : Url
        URL to paginate.
    pagination : PaginationInput, optional
        Pagination configuration handed to the pagination client.
    request : RequestOptions | None, optional
        Request snapshot passed to ``iterate``; a fresh
        ``RequestOptions()`` is used when this is falsy.
    sleep_seconds : float
        Pause between requests used when building the pagination client.
    rate_limit_overrides : RateLimitOverrides, optional
        Per-call rate limit overrides used when building the pagination
        client.

    Yields
    ------
    JSONDict
        Each record produced by the pagination client.
    """
    snapshot = request or RequestOptions()
    pager = self._build_pagination_client(
        pagination=pagination,
        sleep_seconds=sleep_seconds,
        rate_limit_overrides=rate_limit_overrides,
    )
    yield from pager.iterate(url, request=snapshot)
|
|
727
|
+
|
|
728
|
+
# -- Instance Methods (Endpoints) -- #
|
|
729
|
+
|
|
730
|
+
def url(
|
|
731
|
+
self,
|
|
732
|
+
endpoint_key: str,
|
|
733
|
+
path_parameters: Mapping[str, Any] | None = None,
|
|
734
|
+
query_parameters: Mapping[str, Any] | None = None,
|
|
735
|
+
) -> str:
|
|
736
|
+
"""
|
|
737
|
+
Build an absolute URL for a registered endpoint.
|
|
738
|
+
|
|
739
|
+
Parameters
|
|
740
|
+
----------
|
|
741
|
+
endpoint_key : str
|
|
742
|
+
Key into the ``endpoints`` mapping whose relative path will be
|
|
743
|
+
resolved against ``base_url``.
|
|
744
|
+
path_parameters : Mapping[str, Any] | None, optional
|
|
745
|
+
Values to substitute into placeholders in the endpoint path.
|
|
746
|
+
Placeholders must be written as ``{placeholder}`` in the relative
|
|
747
|
+
path. Each substituted value is percent-encoded as a single path
|
|
748
|
+
segment (slashes are encoded) to prevent path traversal.
|
|
749
|
+
query_parameters : Mapping[str, Any] | None, optional
|
|
750
|
+
Query parameters to append (and merge with any already present on
|
|
751
|
+
``base_url``). Values are percent-encoded and combined using
|
|
752
|
+
``application/x-www-form-urlencoded`` rules.
|
|
753
|
+
|
|
754
|
+
Returns
|
|
755
|
+
-------
|
|
756
|
+
str
|
|
757
|
+
Constructed absolute URL.
|
|
758
|
+
|
|
759
|
+
Raises
|
|
760
|
+
------
|
|
761
|
+
KeyError
|
|
762
|
+
If ``endpoint_key`` is unknown or a required placeholder in the
|
|
763
|
+
path has no corresponding entry in ``path_parameters``.
|
|
764
|
+
ValueError
|
|
765
|
+
If the path template is invalid.
|
|
766
|
+
|
|
767
|
+
Examples
|
|
768
|
+
--------
|
|
769
|
+
>>> ep = EndpointClient(
|
|
770
|
+
... base_url='https://api.example.com/v1',
|
|
771
|
+
... endpoints={
|
|
772
|
+
... 'user': '/users/{id}',
|
|
773
|
+
... 'search': '/users'
|
|
774
|
+
... }
|
|
775
|
+
... )
|
|
776
|
+
>>> ep.url('user', path_parameters={'id': '42'})
|
|
777
|
+
'https://api.example.com/v1/users/42'
|
|
778
|
+
>>> ep.url('search', query_parameters={'q': 'Jane Doe', 'page': '2'})
|
|
779
|
+
'https://api.example.com/v1/users?q=Jane+Doe&page=2'
|
|
780
|
+
"""
|
|
781
|
+
if endpoint_key not in self.endpoints:
|
|
782
|
+
raise KeyError(f'Unknown endpoint_key: {endpoint_key!r}')
|
|
783
|
+
|
|
784
|
+
rel_path = self.endpoints[endpoint_key]
|
|
785
|
+
|
|
786
|
+
# Substitute path parameters if provided.
|
|
787
|
+
if '{' in rel_path:
|
|
788
|
+
try:
|
|
789
|
+
encoded = (
|
|
790
|
+
{
|
|
791
|
+
k: quote(str(v), safe='')
|
|
792
|
+
for k, v in path_parameters.items()
|
|
793
|
+
}
|
|
794
|
+
if path_parameters
|
|
795
|
+
else {}
|
|
796
|
+
)
|
|
797
|
+
rel_path = rel_path.format(**encoded)
|
|
798
|
+
except KeyError as e:
|
|
799
|
+
missing = e.args[0]
|
|
800
|
+
raise KeyError(
|
|
801
|
+
f'Missing path parameter for placeholder: {missing!r}',
|
|
802
|
+
) from None
|
|
803
|
+
except ValueError as e:
|
|
804
|
+
raise ValueError(
|
|
805
|
+
f'Invalid path template {rel_path!r}: {e}',
|
|
806
|
+
) from None
|
|
807
|
+
|
|
808
|
+
# Build final absolute URL, honoring any client base_path prefix.
|
|
809
|
+
parts = urlsplit(self.base_url)
|
|
810
|
+
base_url_path = parts.path.rstrip('/')
|
|
811
|
+
extra = self.base_path
|
|
812
|
+
extra_norm = ('/' + extra.lstrip('/')) if extra else ''
|
|
813
|
+
composed_base = (
|
|
814
|
+
base_url_path + extra_norm if (base_url_path or extra_norm) else ''
|
|
815
|
+
)
|
|
816
|
+
rel_norm = '/' + rel_path.lstrip('/')
|
|
817
|
+
path = (composed_base + rel_norm) if composed_base else rel_norm
|
|
818
|
+
|
|
819
|
+
# Merge base query with provided query_parameters.
|
|
820
|
+
base_q = parse_qsl(parts.query, keep_blank_values=True)
|
|
821
|
+
add_q = list((query_parameters or {}).items())
|
|
822
|
+
qs = urlencode(base_q + add_q, doseq=True)
|
|
823
|
+
|
|
824
|
+
return urlunsplit(
|
|
825
|
+
(parts.scheme, parts.netloc, path, qs, parts.fragment),
|
|
826
|
+
)
|
|
827
|
+
|
|
828
|
+
# -- Static Methods -- #
|
|
829
|
+
|
|
830
|
+
@staticmethod
|
|
831
|
+
def apply_sleep(
|
|
832
|
+
sleep_seconds: float,
|
|
833
|
+
*,
|
|
834
|
+
sleeper: Callable[[float], None] | None = None,
|
|
835
|
+
) -> None:
|
|
836
|
+
"""
|
|
837
|
+
Sleep for the specified seconds if positive.
|
|
838
|
+
|
|
839
|
+
The optional ``sleeper`` is useful for tests (e.g., pass
|
|
840
|
+
``lambda s: None``). Defaults to using time.sleep when not provided.
|
|
841
|
+
|
|
842
|
+
Parameters
|
|
843
|
+
----------
|
|
844
|
+
sleep_seconds : float
|
|
845
|
+
Number of seconds to sleep; no-op if non-positive.
|
|
846
|
+
sleeper : Callable[[float], None] | None, optional
|
|
847
|
+
Optional sleeper function taking seconds as input.
|
|
848
|
+
"""
|
|
849
|
+
if sleep_seconds and sleep_seconds > 0:
|
|
850
|
+
if sleeper is None:
|
|
851
|
+
time.sleep(sleep_seconds)
|
|
852
|
+
else:
|
|
853
|
+
sleeper(sleep_seconds)
|
|
854
|
+
|
|
855
|
+
# -- Internal Static Methods -- #
|
|
856
|
+
|
|
857
|
+
@staticmethod
def _resolve_sleep_seconds(
    explicit: float,
    rate_limit: RateLimitConfigMap | None,
    overrides: RateLimitOverrides = None,
) -> float:
    """
    Pick the sleep interval to use between requests.

    A positive ``explicit`` value is returned unchanged; anything else
    defers to :meth:`RateLimiter.resolve_sleep_seconds` with the supplied
    rate-limit inputs.

    Parameters
    ----------
    explicit : float
        Explicit sleep seconds provided by the caller.
    rate_limit : RateLimitConfigMap | None
        Client-wide rate limit configuration, forwarded to the rate
        limiter.
    overrides : RateLimitOverrides, optional
        Per-call overrides, forwarded to the rate limiter alongside
        ``rate_limit``.

    Returns
    -------
    float
        The resolved sleep seconds to apply between requests.
    """
    caller_supplied = bool(explicit) and explicit > 0
    if not caller_supplied:
        # No usable explicit value: derive pacing from rate-limit config.
        return RateLimiter.resolve_sleep_seconds(
            rate_limit=rate_limit,
            overrides=overrides,
        )
    return explicit
|