airbyte-cdk 6.34.0.dev1__py3-none-any.whl → 6.34.1.dev0__py3-none-any.whl
This diff compares the contents of two publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
- airbyte_cdk/connector_builder/connector_builder_handler.py +12 -16
- airbyte_cdk/connector_builder/message_grouper.py +448 -0
- airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +7 -7
- airbyte_cdk/sources/declarative/auth/jwt.py +11 -17
- airbyte_cdk/sources/declarative/auth/oauth.py +1 -6
- airbyte_cdk/sources/declarative/auth/token.py +8 -3
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +19 -30
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +85 -203
- airbyte_cdk/sources/declarative/declarative_stream.py +1 -3
- airbyte_cdk/sources/declarative/decoders/__init__.py +4 -0
- airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +2 -7
- airbyte_cdk/sources/declarative/decoders/json_decoder.py +58 -12
- airbyte_cdk/sources/declarative/extractors/record_selector.py +3 -12
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +25 -56
- airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +6 -12
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +0 -9
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +41 -150
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +84 -234
- airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +5 -5
- airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +2 -4
- airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +18 -26
- airbyte_cdk/sources/declarative/requesters/http_requester.py +1 -8
- airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +5 -16
- airbyte_cdk/sources/declarative/requesters/request_option.py +4 -83
- airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +6 -7
- airbyte_cdk/sources/declarative/retrievers/async_retriever.py +12 -6
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +1 -4
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +1 -2
- airbyte_cdk/sources/file_based/file_based_source.py +37 -70
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +12 -107
- airbyte_cdk/sources/file_based/stream/__init__.py +1 -10
- airbyte_cdk/sources/streams/call_rate.py +47 -185
- airbyte_cdk/sources/streams/http/http.py +2 -1
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +56 -217
- airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +73 -144
- airbyte_cdk/utils/datetime_helpers.py +66 -48
- airbyte_cdk/utils/mapping_helpers.py +26 -126
- {airbyte_cdk-6.34.0.dev1.dist-info → airbyte_cdk-6.34.1.dev0.dist-info}/METADATA +1 -1
- {airbyte_cdk-6.34.0.dev1.dist-info → airbyte_cdk-6.34.1.dev0.dist-info}/RECORD +43 -52
- airbyte_cdk/connector_builder/test_reader/__init__.py +0 -7
- airbyte_cdk/connector_builder/test_reader/helpers.py +0 -591
- airbyte_cdk/connector_builder/test_reader/message_grouper.py +0 -160
- airbyte_cdk/connector_builder/test_reader/reader.py +0 -441
- airbyte_cdk/connector_builder/test_reader/types.py +0 -75
- airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py +0 -81
- airbyte_cdk/sources/file_based/stream/identities_stream.py +0 -47
- airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +0 -85
- airbyte_cdk/sources/specs/transfer_modes.py +0 -26
- airbyte_cdk/sources/streams/permissions/identities_stream.py +0 -75
- {airbyte_cdk-6.34.0.dev1.dist-info → airbyte_cdk-6.34.1.dev0.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.34.0.dev1.dist-info → airbyte_cdk-6.34.1.dev0.dist-info}/LICENSE_SHORT +0 -0
- {airbyte_cdk-6.34.0.dev1.dist-info → airbyte_cdk-6.34.1.dev0.dist-info}/WHEEL +0 -0
- {airbyte_cdk-6.34.0.dev1.dist-info → airbyte_cdk-6.34.1.dev0.dist-info}/entry_points.txt +0 -0
airbyte_cdk/sources/file_based/file_based_stream_reader.py

@@ -13,11 +13,6 @@ from typing import Any, Dict, Iterable, List, Optional, Set
 from wcmatch.glob import GLOBSTAR, globmatch
 
 from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec
-from airbyte_cdk.sources.file_based.config.validate_config_transfer_modes import (
-    include_identities_stream,
-    preserve_directory_structure,
-    use_file_transfer,
-)
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
 
 
@@ -133,20 +128,24 @@ class AbstractFileBasedStreamReader(ABC):
 
     def use_file_transfer(self) -> bool:
         if self.config:
-            return use_file_transfer(self.config)
+            use_file_transfer = (
+                hasattr(self.config.delivery_method, "delivery_type")
+                and self.config.delivery_method.delivery_type == "use_file_transfer"
+            )
+            return use_file_transfer
         return False
 
     def preserve_directory_structure(self) -> bool:
         # fall back to preserve subdirectories if config is not present or incomplete
-        if self.use_file_transfer() and self.config:
-            return preserve_directory_structure(self.config)
+        if (
+            self.use_file_transfer()
+            and self.config
+            and hasattr(self.config.delivery_method, "preserve_directory_structure")
+            and self.config.delivery_method.preserve_directory_structure is not None
+        ):
+            return self.config.delivery_method.preserve_directory_structure
         return True
 
-    def include_identities_stream(self) -> bool:
-        if self.config:
-            return include_identities_stream(self.config)
-        return False
-
     @abstractmethod
     def get_file(
         self, file: RemoteFile, local_directory: str, logger: logging.Logger
@@ -184,97 +183,3 @@ class AbstractFileBasedStreamReader(ABC):
         makedirs(path.dirname(local_file_path), exist_ok=True)
         absolute_file_path = path.abspath(local_file_path)
         return [file_relative_path, local_file_path, absolute_file_path]
-
-    @abstractmethod
-    def get_file_acl_permissions(self, file: RemoteFile, logger: logging.Logger) -> Dict[str, Any]:
-        """
-        This function should return the allow list for a given file, i.e. the list of all identities and their permission levels associated with it
-
-        e.g.
-        def get_file_acl_permissions(self, file: RemoteFile, logger: logging.Logger):
-            api_conn = some_api.conn(credentials=SOME_CREDENTIALS)
-            result = api_conn.get_file_permissions_info(file.id)
-            return MyPermissionsModel(
-                id=result["id"],
-                access_control_list = result["access_control_list"],
-                is_public = result["is_public"],
-            ).dict()
-        """
-        raise NotImplementedError(
-            f"{self.__class__.__name__} does not implement get_file_acl_permissions(). To support ACL permissions, implement this method and update file_permissions_schema."
-        )
-
-    @abstractmethod
-    def load_identity_groups(self, logger: logging.Logger) -> Iterable[Dict[str, Any]]:
-        """
-        This function should return the Identities in a determined "space" or "domain" where the file metadata (ACLs) are fetched and ACLs items (Identities) exists.
-
-        e.g.
-        def load_identity_groups(self, logger: logging.Logger) -> Dict[str, Any]:
-            api_conn = some_api.conn(credentials=SOME_CREDENTIALS)
-            users_api = api_conn.users()
-            groups_api = api_conn.groups()
-            members_api = self.google_directory_service.members()
-            for user in users_api.list():
-                yield my_identity_model(id=user.id, name=user.name, email_address=user.email, type="user").dict()
-            for group in groups_api.list():
-                group_obj = my_identity_model(id=group.id, name=groups.name, email_address=user.email, type="group").dict()
-                for member in members_api.list(group=group):
-                    group_obj.member_email_addresses = group_obj.member_email_addresses or []
-                    group_obj.member_email_addresses.append(member.email)
-                yield group_obj.dict()
-        """
-        raise NotImplementedError(
-            f"{self.__class__.__name__} does not implement load_identity_groups(). To support identities, implement this method and update identities_schema."
-        )
-
-    @property
-    @abstractmethod
-    def file_permissions_schema(self) -> Dict[str, Any]:
-        """
-        This function should return the permissions schema for file permissions stream.
-
-        e.g.
-        def file_permissions_schema(self) -> Dict[str, Any]:
-            # you can also follow the patter we have for python connectors and have a json file and read from there e.g. schemas/identities.json
-            return {
-                "type": "object",
-                "properties": {
-                    "id": { "type": "string" },
-                    "file_path": { "type": "string" },
-                    "access_control_list": {
-                        "type": "array",
-                        "items": { "type": "string" }
-                    },
-                    "publicly_accessible": { "type": "boolean" }
-                }
-            }
-        """
-        raise NotImplementedError(
-            f"{self.__class__.__name__} does not implement file_permissions_schema, please return json schema for your permissions streams."
-        )
-
-    @property
-    @abstractmethod
-    def identities_schema(self) -> Dict[str, Any]:
-        """
-        This function should return the identities schema for file identity stream.
-
-        e.g.
-        def identities_schema(self) -> Dict[str, Any]:
-            # you can also follow the patter we have for python connectors and have a json file and read from there e.g. schemas/identities.json
-            return {
-                "type": "object",
-                "properties": {
-                    "id": { "type": "string" },
-                    "remote_id": { "type": "string" },
-                    "name": { "type": ["null", "string"] },
-                    "email_address": { "type": ["null", "string"] },
-                    "member_email_addresses": { "type": ["null", "array"] },
-                    "type": { "type": "string" },
-                }
-            }
-        """
-        raise NotImplementedError(
-            f"{self.__class__.__name__} does not implement identities_schema, please return json schema for your identities stream."
-        )
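For context, a minimal sketch of how the inlined delivery-method checks above behave. The `SimpleNamespace` stand-ins are illustrative only; a real file-based spec is a Pydantic model:

```python
from types import SimpleNamespace

# Illustrative stand-in for a file-based spec; only the attributes the
# inlined checks read are modeled here.
config = SimpleNamespace(
    delivery_method=SimpleNamespace(
        delivery_type="use_file_transfer",
        preserve_directory_structure=False,
    )
)

# Mirrors the new use_file_transfer(): a plain attribute check replaces the
# removed validate_config_transfer_modes helpers.
use_file_transfer = (
    hasattr(config.delivery_method, "delivery_type")
    and config.delivery_method.delivery_type == "use_file_transfer"
)
assert use_file_transfer

# Mirrors preserve_directory_structure(): an explicit setting wins only when
# file transfer is enabled and the attribute is set; otherwise the reader
# falls back to True.
preserve = True
if (
    use_file_transfer
    and hasattr(config.delivery_method, "preserve_directory_structure")
    and config.delivery_method.preserve_directory_structure is not None
):
    preserve = config.delivery_method.preserve_directory_structure
assert preserve is False
```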
airbyte_cdk/sources/file_based/stream/__init__.py

@@ -1,13 +1,4 @@
 from airbyte_cdk.sources.file_based.stream.abstract_file_based_stream import AbstractFileBasedStream
 from airbyte_cdk.sources.file_based.stream.default_file_based_stream import DefaultFileBasedStream
-from airbyte_cdk.sources.file_based.stream.identities_stream import FileIdentitiesStream
-from airbyte_cdk.sources.file_based.stream.permissions_file_based_stream import (
-    PermissionsFileBasedStream,
-)
 
-__all__ = [
-    "AbstractFileBasedStream",
-    "DefaultFileBasedStream",
-    "FileIdentitiesStream",
-    "PermissionsFileBasedStream",
-]
+__all__ = ["AbstractFileBasedStream", "DefaultFileBasedStream"]
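After this change, only two names are re-exported from the stream package; a quick sanity check (sketch):

```python
from airbyte_cdk.sources.file_based.stream import (
    AbstractFileBasedStream,
    DefaultFileBasedStream,
)

# FileIdentitiesStream and PermissionsFileBasedStream are no longer exported
# in 6.34.1.dev0; importing them now raises ImportError.
assert issubclass(DefaultFileBasedStream, AbstractFileBasedStream)
```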
airbyte_cdk/sources/streams/call_rate.py

@@ -6,7 +6,6 @@ import abc
 import dataclasses
 import datetime
 import logging
-import re
 import time
 from datetime import timedelta
 from threading import RLock
@@ -26,7 +25,6 @@ else:
     MIXIN_BASE = object
 
 logger = logging.getLogger("airbyte")
-logging.getLogger("pyrate_limiter").setLevel(logging.WARNING)
 
 
 @dataclasses.dataclass
@@ -100,7 +98,7 @@ class RequestMatcher(abc.ABC):
 
 
 class HttpRequestMatcher(RequestMatcher):
-    """Simple implementation of RequestMatcher for
+    """Simple implementation of RequestMatcher for http requests case"""
 
     def __init__(
         self,
@@ -111,94 +109,32 @@ class HttpRequestMatcher(RequestMatcher):
     ):
         """Constructor
 
-        :param method:
-        :param url:
-        :param params:
-        :param headers:
+        :param method:
+        :param url:
+        :param params:
+        :param headers:
         """
-
-        if url:
-            parsed_url = parse.urlsplit(url)
-            url_base = f"{parsed_url.scheme}://{parsed_url.netloc}"
-            url_path = parsed_url.path if parsed_url.path != "/" else None
-        else:
-            url_base = None
-            url_path = None
-
-        # Use HttpRequestRegexMatcher under the hood
-        self._regex_matcher = HttpRequestRegexMatcher(
-            method=method,
-            url_base=url_base,
-            url_path_pattern=re.escape(url_path) if url_path else None,
-            params=params,
-            headers=headers,
-        )
-
-    def __call__(self, request: Any) -> bool:
-        """
-        :param request: A requests.Request or requests.PreparedRequest instance.
-        :return: True if the request matches all provided criteria; False otherwise.
-        """
-        return self._regex_matcher(request)
-
-    def __str__(self) -> str:
-        return (
-            f"HttpRequestMatcher(method={self._regex_matcher._method}, "
-            f"url={self._regex_matcher._url_base}{self._regex_matcher._url_path_pattern.pattern if self._regex_matcher._url_path_pattern else ''}, "
-            f"params={self._regex_matcher._params}, headers={self._regex_matcher._headers})"
-        )
-
-
-class HttpRequestRegexMatcher(RequestMatcher):
-    """
-    Extended RequestMatcher for HTTP requests that supports matching on:
-      - HTTP method (case-insensitive)
-      - URL base (scheme + netloc) optionally
-      - URL path pattern (a regex applied to the path portion of the URL)
-      - Query parameters (must be present)
-      - Headers (header names compared case-insensitively)
-    """
-
-    def __init__(
-        self,
-        method: Optional[str] = None,
-        url_base: Optional[str] = None,
-        url_path_pattern: Optional[str] = None,
-        params: Optional[Mapping[str, Any]] = None,
-        headers: Optional[Mapping[str, Any]] = None,
-    ):
-        """
-        :param method: HTTP method (e.g. "GET", "POST"); compared case-insensitively.
-        :param url_base: Base URL (scheme://host) that must match.
-        :param url_path_pattern: A regex pattern that will be applied to the path portion of the URL.
-        :param params: Dictionary of query parameters that must be present in the request.
-        :param headers: Dictionary of headers that must be present (header keys are compared case-insensitively).
-        """
-        self._method = method.upper() if method else None
-
-        # Normalize the url_base if provided: remove trailing slash.
-        self._url_base = url_base.rstrip("/") if url_base else None
-
-        # Compile the URL path pattern if provided.
-        self._url_path_pattern = re.compile(url_path_pattern) if url_path_pattern else None
-
-        # Normalize query parameters to strings.
+        self._method = method
+        self._url = url
         self._params = {str(k): str(v) for k, v in (params or {}).items()}
-
-        # Normalize header keys to lowercase.
-        self._headers = {str(k).lower(): str(v) for k, v in (headers or {}).items()}
+        self._headers = {str(k): str(v) for k, v in (headers or {}).items()}
 
     @staticmethod
     def _match_dict(obj: Mapping[str, Any], pattern: Mapping[str, Any]) -> bool:
-        """Check that
+        """Check that all elements from pattern dict present and have the same values in obj dict
+
+        :param obj:
+        :param pattern:
+        :return:
+        """
         return pattern.items() <= obj.items()
 
     def __call__(self, request: Any) -> bool:
         """
-
-        :
+
+        :param request:
+        :return: True if matches the provided request object, False - otherwise
         """
-        # Prepare the request (if needed) and extract the URL details.
         if isinstance(request, requests.Request):
             prepared_request = request.prepare()
         elif isinstance(request, requests.PreparedRequest):
@@ -206,49 +142,23 @@ class HttpRequestRegexMatcher(RequestMatcher):
         else:
             return False
 
-        # Check HTTP method.
         if self._method is not None:
             if prepared_request.method != self._method:
                 return False
-
-
-        parsed_url = parse.urlsplit(prepared_request.url)
-        # Reconstruct the base: scheme://netloc
-        request_url_base = f"{str(parsed_url.scheme)}://{str(parsed_url.netloc)}"
-        # The path (without query parameters)
-        request_path = str(parsed_url.path).rstrip("/")
-
-        # If a base URL is provided, check that it matches.
-        if self._url_base is not None:
-            if request_url_base != self._url_base:
-                return False
-
-        # If a URL path pattern is provided, ensure the path matches the regex.
-        if self._url_path_pattern is not None:
-            if not self._url_path_pattern.search(request_path):
+        if self._url is not None and prepared_request.url is not None:
+            url_without_params = prepared_request.url.split("?")[0]
+            if url_without_params != self._url:
                 return False
-
-
-        if self._params:
-            query_params = dict(parse.parse_qsl(str(parsed_url.query)))
-            if not self._match_dict(query_params, self._params):
+        if self._params is not None:
+            parsed_url = parse.urlsplit(prepared_request.url)
+            params = dict(parse.parse_qsl(str(parsed_url.query)))
+            if not self._match_dict(params, self._params):
                 return False
-
-
-        if self._headers:
-            req_headers = {k.lower(): v for k, v in prepared_request.headers.items()}
-            if not self._match_dict(req_headers, self._headers):
+        if self._headers is not None:
+            if not self._match_dict(prepared_request.headers, self._headers):
                 return False
-
         return True
 
-    def __str__(self) -> str:
-        regex = self._url_path_pattern.pattern if self._url_path_pattern else None
-        return (
-            f"HttpRequestRegexMatcher(method={self._method}, url_base={self._url_base}, "
-            f"url_path_pattern={regex}, params={self._params}, headers={self._headers})"
-        )
-
 
 class BaseCallRatePolicy(AbstractCallRatePolicy, abc.ABC):
     def __init__(self, matchers: list[RequestMatcher]):
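To illustrate the restored matching semantics of `HttpRequestMatcher` (exact-URL comparison with the query string stripped, subset checks for params and headers), a minimal sketch; the endpoint and parameter values are invented:

```python
import requests

from airbyte_cdk.sources.streams.call_rate import HttpRequestMatcher

# Hypothetical endpoint; all values here are made up for illustration.
matcher = HttpRequestMatcher(
    method="GET",
    url="https://api.example.com/v1/items",
    params={"page": "1"},
)

request = requests.Request(
    "GET", "https://api.example.com/v1/items", params={"page": "1", "limit": "50"}
)
# Extra query params are allowed; the listed ones must be present and equal.
assert matcher(request)

# A different path no longer matches once the query string is stripped.
assert not matcher(requests.Request("GET", "https://api.example.com/v1/users"))
```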
@@ -347,14 +257,6 @@ class FixedWindowCallRatePolicy(BaseCallRatePolicy):
 
         self._calls_num += weight
 
-    def __str__(self) -> str:
-        matcher_str = ", ".join(f"{matcher}" for matcher in self._matchers)
-        return (
-            f"FixedWindowCallRatePolicy(call_limit={self._call_limit}, period={self._offset}, "
-            f"calls_used={self._calls_num}, next_reset={self._next_reset_ts}, "
-            f"matchers=[{matcher_str}])"
-        )
-
     def update(
         self, available_calls: Optional[int], call_reset_ts: Optional[datetime.datetime]
     ) -> None:
@@ -461,19 +363,6 @@ class MovingWindowCallRatePolicy(BaseCallRatePolicy):
         # if available_calls is not None and call_reset_ts is not None:
         #     ts = call_reset_ts.timestamp()
 
-    def __str__(self) -> str:
-        """Return a human-friendly description of the moving window rate policy for logging purposes."""
-        rates_info = ", ".join(
-            f"{rate.limit} per {timedelta(milliseconds=rate.interval)}"
-            for rate in self._bucket.rates
-        )
-        current_bucket_count = self._bucket.count()
-        matcher_str = ", ".join(f"{matcher}" for matcher in self._matchers)
-        return (
-            f"MovingWindowCallRatePolicy(rates=[{rates_info}], current_bucket_count={current_bucket_count}, "
-            f"matchers=[{matcher_str}])"
-        )
-
 
 class AbstractAPIBudget(abc.ABC):
     """Interface to some API where a client allowed to have N calls per T interval.
@@ -526,23 +415,6 @@ class APIBudget(AbstractAPIBudget):
         self._policies = policies
         self._maximum_attempts_to_acquire = maximum_attempts_to_acquire
 
-    def _extract_endpoint(self, request: Any) -> str:
-        """Extract the endpoint URL from the request if available."""
-        endpoint = None
-        try:
-            # If the request is already a PreparedRequest, it should have a URL.
-            if isinstance(request, requests.PreparedRequest):
-                endpoint = request.url
-            # If it's a requests.Request, we call prepare() to extract the URL.
-            elif isinstance(request, requests.Request):
-                prepared = request.prepare()
-                endpoint = prepared.url
-        except Exception as e:
-            logger.debug(f"Error extracting endpoint: {e}")
-        if endpoint:
-            return endpoint
-        return "unknown endpoint"
-
     def get_matching_policy(self, request: Any) -> Optional[AbstractCallRatePolicy]:
         for policy in self._policies:
             if policy.matches(request):
@@ -556,24 +428,20 @@ class APIBudget(AbstractAPIBudget):
         Matchers will be called sequentially in the same order they were added.
         The first matcher that returns True will
 
-        :param request:
-        :param block: when
-        :param timeout: if provided
-        :raises: CallRateLimitHit if
+        :param request:
+        :param block: when true (default) will block the current thread until call credit is available
+        :param timeout: if provided will limit maximum time in block, otherwise will wait until credit is available
+        :raises: CallRateLimitHit - when no calls left and if timeout was set the waiting time exceed the timeout
         """
 
         policy = self.get_matching_policy(request)
-        endpoint = self._extract_endpoint(request)
         if policy:
-            logger.debug(f"Acquiring call for endpoint {endpoint} using policy: {policy}")
             self._do_acquire(request=request, policy=policy, block=block, timeout=timeout)
         elif self._policies:
-            logger.
-                f"No policies matched for endpoint {endpoint} (request: {request}). Allowing call by default."
-            )
+            logger.info("no policies matched with requests, allow call by default")
 
     def update_from_response(self, request: Any, response: Any) -> None:
-        """Update budget information based on
+        """Update budget information based on response from API
 
         :param request: the initial request that triggered this response
         :param response: response from the API
|
@@ -583,17 +451,15 @@ class APIBudget(AbstractAPIBudget):
|
|
583
451
|
def _do_acquire(
|
584
452
|
self, request: Any, policy: AbstractCallRatePolicy, block: bool, timeout: Optional[float]
|
585
453
|
) -> None:
|
586
|
-
"""Internal method to try to acquire a call credit
|
454
|
+
"""Internal method to try to acquire a call credit
|
587
455
|
|
588
|
-
:param request:
|
589
|
-
:param policy:
|
590
|
-
:param block:
|
591
|
-
:param timeout:
|
592
|
-
:raises: CallRateLimitHit if unable to acquire a call credit
|
456
|
+
:param request:
|
457
|
+
:param policy:
|
458
|
+
:param block:
|
459
|
+
:param timeout:
|
593
460
|
"""
|
594
461
|
last_exception = None
|
595
|
-
|
596
|
-
# sometimes we spend all budget before a second attempt, so we have a few more attempts
|
462
|
+
# sometimes we spend all budget before a second attempt, so we have few more here
|
597
463
|
for attempt in range(1, self._maximum_attempts_to_acquire):
|
598
464
|
try:
|
599
465
|
policy.try_acquire(request, weight=1)
|
@@ -605,24 +471,20 @@ class APIBudget(AbstractAPIBudget):
                     time_to_wait = min(timedelta(seconds=timeout), exc.time_to_wait)
                 else:
                     time_to_wait = exc.time_to_wait
-
-                time_to_wait = max(
-
-
-
+
+                time_to_wait = max(
+                    timedelta(0), time_to_wait
+                )  # sometimes we get negative duration
+                logger.info(
+                    "reached call limit %s. going to sleep for %s", exc.rate, time_to_wait
                 )
                 time.sleep(time_to_wait.total_seconds())
             else:
-                logger.debug(
-                    f"Policy {policy} reached call limit for endpoint {endpoint} ({exc.rate}) "
-                    f"and blocking is disabled."
-                )
                 raise
 
         if last_exception:
-            logger.
-
-                f"using policy: {policy}"
+            logger.info(
+                "we used all %s attempts to acquire and failed", self._maximum_attempts_to_acquire
             )
             raise last_exception
 
@@ -634,7 +496,7 @@ class HttpAPIBudget(APIBudget):
         self,
         ratelimit_reset_header: str = "ratelimit-reset",
         ratelimit_remaining_header: str = "ratelimit-remaining",
-        status_codes_for_ratelimit_hit:
+        status_codes_for_ratelimit_hit: tuple[int] = (429,),
         **kwargs: Any,
     ):
         """Constructor
airbyte_cdk/sources/streams/http/http.py

@@ -423,6 +423,8 @@ class HttpStream(Stream, CheckpointMixin, ABC):
         stream_slice: Optional[Mapping[str, Any]] = None,
         stream_state: Optional[Mapping[str, Any]] = None,
     ) -> Iterable[StreamData]:
+        partition, _, _ = self._extract_slice_fields(stream_slice=stream_slice)
+
         stream_state = stream_state or {}
         pagination_complete = False
         next_page_token = None
@@ -436,7 +438,6 @@ class HttpStream(Stream, CheckpointMixin, ABC):
 
         cursor = self.get_cursor()
         if cursor and isinstance(cursor, SubstreamResumableFullRefreshCursor):
-            partition, _, _ = self._extract_slice_fields(stream_slice=stream_slice)
             # Substreams checkpoint state by marking an entire parent partition as completed so that on the subsequent attempt
             # after a failure, completed parents are skipped and the sync can make progress
             cursor.close_slice(StreamSlice(cursor_slice={}, partition=partition))