airbyte-cdk 6.34.1.dev0__py3-none-any.whl → 6.35.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. airbyte_cdk/connector_builder/connector_builder_handler.py +16 -12
  2. airbyte_cdk/connector_builder/test_reader/__init__.py +7 -0
  3. airbyte_cdk/connector_builder/test_reader/helpers.py +591 -0
  4. airbyte_cdk/connector_builder/test_reader/message_grouper.py +160 -0
  5. airbyte_cdk/connector_builder/test_reader/reader.py +441 -0
  6. airbyte_cdk/connector_builder/test_reader/types.py +75 -0
  7. airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +7 -7
  8. airbyte_cdk/sources/declarative/auth/jwt.py +17 -11
  9. airbyte_cdk/sources/declarative/auth/oauth.py +6 -1
  10. airbyte_cdk/sources/declarative/auth/token.py +3 -8
  11. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +30 -79
  12. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +213 -100
  13. airbyte_cdk/sources/declarative/declarative_stream.py +3 -1
  14. airbyte_cdk/sources/declarative/decoders/__init__.py +0 -4
  15. airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +18 -3
  16. airbyte_cdk/sources/declarative/decoders/json_decoder.py +12 -58
  17. airbyte_cdk/sources/declarative/extractors/record_selector.py +12 -3
  18. airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +56 -25
  19. airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +12 -6
  20. airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +6 -2
  21. airbyte_cdk/sources/declarative/interpolation/__init__.py +1 -1
  22. airbyte_cdk/sources/declarative/interpolation/filters.py +2 -1
  23. airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +1 -1
  24. airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +1 -1
  25. airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +1 -1
  26. airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +1 -1
  27. airbyte_cdk/sources/declarative/interpolation/interpolation.py +2 -1
  28. airbyte_cdk/sources/declarative/interpolation/jinja.py +14 -1
  29. airbyte_cdk/sources/declarative/interpolation/macros.py +19 -4
  30. airbyte_cdk/sources/declarative/manifest_declarative_source.py +9 -0
  31. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +150 -41
  32. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +234 -84
  33. airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +5 -5
  34. airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +4 -2
  35. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +26 -18
  36. airbyte_cdk/sources/declarative/requesters/http_requester.py +8 -2
  37. airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +16 -5
  38. airbyte_cdk/sources/declarative/requesters/request_option.py +83 -4
  39. airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +7 -6
  40. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +1 -4
  41. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +0 -3
  42. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +2 -47
  43. airbyte_cdk/sources/declarative/retrievers/async_retriever.py +6 -12
  44. airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +4 -3
  45. airbyte_cdk/sources/declarative/transformations/add_fields.py +4 -4
  46. airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +2 -1
  47. airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py +81 -0
  48. airbyte_cdk/sources/file_based/file_based_source.py +70 -37
  49. airbyte_cdk/sources/file_based/file_based_stream_reader.py +107 -12
  50. airbyte_cdk/sources/file_based/stream/__init__.py +10 -1
  51. airbyte_cdk/sources/file_based/stream/identities_stream.py +47 -0
  52. airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +85 -0
  53. airbyte_cdk/sources/specs/transfer_modes.py +26 -0
  54. airbyte_cdk/sources/streams/call_rate.py +185 -47
  55. airbyte_cdk/sources/streams/http/http.py +1 -2
  56. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +217 -56
  57. airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +144 -73
  58. airbyte_cdk/sources/streams/permissions/identities_stream.py +75 -0
  59. airbyte_cdk/test/mock_http/mocker.py +9 -1
  60. airbyte_cdk/test/mock_http/response.py +6 -3
  61. airbyte_cdk/utils/datetime_helpers.py +48 -66
  62. airbyte_cdk/utils/mapping_helpers.py +126 -26
  63. {airbyte_cdk-6.34.1.dev0.dist-info → airbyte_cdk-6.35.0.dist-info}/METADATA +1 -1
  64. {airbyte_cdk-6.34.1.dev0.dist-info → airbyte_cdk-6.35.0.dist-info}/RECORD +68 -59
  65. airbyte_cdk/connector_builder/message_grouper.py +0 -448
  66. {airbyte_cdk-6.34.1.dev0.dist-info → airbyte_cdk-6.35.0.dist-info}/LICENSE.txt +0 -0
  67. {airbyte_cdk-6.34.1.dev0.dist-info → airbyte_cdk-6.35.0.dist-info}/LICENSE_SHORT +0 -0
  68. {airbyte_cdk-6.34.1.dev0.dist-info → airbyte_cdk-6.35.0.dist-info}/WHEEL +0 -0
  69. {airbyte_cdk-6.34.1.dev0.dist-info → airbyte_cdk-6.35.0.dist-info}/entry_points.txt +0 -0
@@ -6,6 +6,7 @@ import abc
6
6
  import dataclasses
7
7
  import datetime
8
8
  import logging
9
+ import re
9
10
  import time
10
11
  from datetime import timedelta
11
12
  from threading import RLock
@@ -25,6 +26,7 @@ else:
25
26
  MIXIN_BASE = object
26
27
 
27
28
  logger = logging.getLogger("airbyte")
29
+ logging.getLogger("pyrate_limiter").setLevel(logging.WARNING)
28
30
 
29
31
 
30
32
  @dataclasses.dataclass
@@ -98,7 +100,7 @@ class RequestMatcher(abc.ABC):
98
100
 
99
101
 
100
102
  class HttpRequestMatcher(RequestMatcher):
101
- """Simple implementation of RequestMatcher for http requests case"""
103
+ """Simple implementation of RequestMatcher for HTTP requests using HttpRequestRegexMatcher under the hood."""
102
104
 
103
105
  def __init__(
104
106
  self,
@@ -109,32 +111,94 @@ class HttpRequestMatcher(RequestMatcher):
109
111
  ):
110
112
  """Constructor
111
113
 
112
- :param method:
113
- :param url:
114
- :param params:
115
- :param headers:
114
+ :param method: HTTP method (e.g., "GET", "POST").
115
+ :param url: Full URL to match.
116
+ :param params: Dictionary of query parameters to match.
117
+ :param headers: Dictionary of headers to match.
116
118
  """
117
- self._method = method
118
- self._url = url
119
+ # Parse the URL to extract the base and path
120
+ if url:
121
+ parsed_url = parse.urlsplit(url)
122
+ url_base = f"{parsed_url.scheme}://{parsed_url.netloc}"
123
+ url_path = parsed_url.path if parsed_url.path != "/" else None
124
+ else:
125
+ url_base = None
126
+ url_path = None
127
+
128
+ # Use HttpRequestRegexMatcher under the hood
129
+ self._regex_matcher = HttpRequestRegexMatcher(
130
+ method=method,
131
+ url_base=url_base,
132
+ url_path_pattern=re.escape(url_path) if url_path else None,
133
+ params=params,
134
+ headers=headers,
135
+ )
136
+
137
+ def __call__(self, request: Any) -> bool:
138
+ """
139
+ :param request: A requests.Request or requests.PreparedRequest instance.
140
+ :return: True if the request matches all provided criteria; False otherwise.
141
+ """
142
+ return self._regex_matcher(request)
143
+
144
+ def __str__(self) -> str:
145
+ return (
146
+ f"HttpRequestMatcher(method={self._regex_matcher._method}, "
147
+ f"url={self._regex_matcher._url_base}{self._regex_matcher._url_path_pattern.pattern if self._regex_matcher._url_path_pattern else ''}, "
148
+ f"params={self._regex_matcher._params}, headers={self._regex_matcher._headers})"
149
+ )
150
+
151
+
152
+ class HttpRequestRegexMatcher(RequestMatcher):
153
+ """
154
+ Extended RequestMatcher for HTTP requests that supports matching on:
155
+ - HTTP method (case-insensitive)
156
+ - URL base (scheme + netloc) optionally
157
+ - URL path pattern (a regex applied to the path portion of the URL)
158
+ - Query parameters (must be present)
159
+ - Headers (header names compared case-insensitively)
160
+ """
161
+
162
+ def __init__(
163
+ self,
164
+ method: Optional[str] = None,
165
+ url_base: Optional[str] = None,
166
+ url_path_pattern: Optional[str] = None,
167
+ params: Optional[Mapping[str, Any]] = None,
168
+ headers: Optional[Mapping[str, Any]] = None,
169
+ ):
170
+ """
171
+ :param method: HTTP method (e.g. "GET", "POST"); compared case-insensitively.
172
+ :param url_base: Base URL (scheme://host) that must match.
173
+ :param url_path_pattern: A regex pattern that will be applied to the path portion of the URL.
174
+ :param params: Dictionary of query parameters that must be present in the request.
175
+ :param headers: Dictionary of headers that must be present (header keys are compared case-insensitively).
176
+ """
177
+ self._method = method.upper() if method else None
178
+
179
+ # Normalize the url_base if provided: remove trailing slash.
180
+ self._url_base = url_base.rstrip("/") if url_base else None
181
+
182
+ # Compile the URL path pattern if provided.
183
+ self._url_path_pattern = re.compile(url_path_pattern) if url_path_pattern else None
184
+
185
+ # Normalize query parameters to strings.
119
186
  self._params = {str(k): str(v) for k, v in (params or {}).items()}
120
- self._headers = {str(k): str(v) for k, v in (headers or {}).items()}
187
+
188
+ # Normalize header keys to lowercase.
189
+ self._headers = {str(k).lower(): str(v) for k, v in (headers or {}).items()}
121
190
 
122
191
  @staticmethod
123
192
  def _match_dict(obj: Mapping[str, Any], pattern: Mapping[str, Any]) -> bool:
124
- """Check that all elements from pattern dict present and have the same values in obj dict
125
-
126
- :param obj:
127
- :param pattern:
128
- :return:
129
- """
193
+ """Check that every key/value in the pattern exists in the object."""
130
194
  return pattern.items() <= obj.items()
131
195
 
132
196
  def __call__(self, request: Any) -> bool:
133
197
  """
134
-
135
- :param request:
136
- :return: True if matches the provided request object, False - otherwise
198
+ :param request: A requests.Request or requests.PreparedRequest instance.
199
+ :return: True if the request matches all provided criteria; False otherwise.
137
200
  """
201
+ # Prepare the request (if needed) and extract the URL details.
138
202
  if isinstance(request, requests.Request):
139
203
  prepared_request = request.prepare()
140
204
  elif isinstance(request, requests.PreparedRequest):
@@ -142,23 +206,49 @@ class HttpRequestMatcher(RequestMatcher):
142
206
  else:
143
207
  return False
144
208
 
209
+ # Check HTTP method.
145
210
  if self._method is not None:
146
211
  if prepared_request.method != self._method:
147
212
  return False
148
- if self._url is not None and prepared_request.url is not None:
149
- url_without_params = prepared_request.url.split("?")[0]
150
- if url_without_params != self._url:
213
+
214
+ # Parse the URL.
215
+ parsed_url = parse.urlsplit(prepared_request.url)
216
+ # Reconstruct the base: scheme://netloc
217
+ request_url_base = f"{str(parsed_url.scheme)}://{str(parsed_url.netloc)}"
218
+ # The path (without query parameters)
219
+ request_path = str(parsed_url.path).rstrip("/")
220
+
221
+ # If a base URL is provided, check that it matches.
222
+ if self._url_base is not None:
223
+ if request_url_base != self._url_base:
224
+ return False
225
+
226
+ # If a URL path pattern is provided, ensure the path matches the regex.
227
+ if self._url_path_pattern is not None:
228
+ if not self._url_path_pattern.search(request_path):
151
229
  return False
152
- if self._params is not None:
153
- parsed_url = parse.urlsplit(prepared_request.url)
154
- params = dict(parse.parse_qsl(str(parsed_url.query)))
155
- if not self._match_dict(params, self._params):
230
+
231
+ # Check query parameters.
232
+ if self._params:
233
+ query_params = dict(parse.parse_qsl(str(parsed_url.query)))
234
+ if not self._match_dict(query_params, self._params):
156
235
  return False
157
- if self._headers is not None:
158
- if not self._match_dict(prepared_request.headers, self._headers):
236
+
237
+ # Check headers (normalize keys to lower-case).
238
+ if self._headers:
239
+ req_headers = {k.lower(): v for k, v in prepared_request.headers.items()}
240
+ if not self._match_dict(req_headers, self._headers):
159
241
  return False
242
+
160
243
  return True
161
244
 
245
+ def __str__(self) -> str:
246
+ regex = self._url_path_pattern.pattern if self._url_path_pattern else None
247
+ return (
248
+ f"HttpRequestRegexMatcher(method={self._method}, url_base={self._url_base}, "
249
+ f"url_path_pattern={regex}, params={self._params}, headers={self._headers})"
250
+ )
251
+
162
252
 
163
253
  class BaseCallRatePolicy(AbstractCallRatePolicy, abc.ABC):
164
254
  def __init__(self, matchers: list[RequestMatcher]):
@@ -257,6 +347,14 @@ class FixedWindowCallRatePolicy(BaseCallRatePolicy):
257
347
 
258
348
  self._calls_num += weight
259
349
 
350
+ def __str__(self) -> str:
351
+ matcher_str = ", ".join(f"{matcher}" for matcher in self._matchers)
352
+ return (
353
+ f"FixedWindowCallRatePolicy(call_limit={self._call_limit}, period={self._offset}, "
354
+ f"calls_used={self._calls_num}, next_reset={self._next_reset_ts}, "
355
+ f"matchers=[{matcher_str}])"
356
+ )
357
+
260
358
  def update(
261
359
  self, available_calls: Optional[int], call_reset_ts: Optional[datetime.datetime]
262
360
  ) -> None:
@@ -363,6 +461,19 @@ class MovingWindowCallRatePolicy(BaseCallRatePolicy):
363
461
  # if available_calls is not None and call_reset_ts is not None:
364
462
  # ts = call_reset_ts.timestamp()
365
463
 
464
+ def __str__(self) -> str:
465
+ """Return a human-friendly description of the moving window rate policy for logging purposes."""
466
+ rates_info = ", ".join(
467
+ f"{rate.limit} per {timedelta(milliseconds=rate.interval)}"
468
+ for rate in self._bucket.rates
469
+ )
470
+ current_bucket_count = self._bucket.count()
471
+ matcher_str = ", ".join(f"{matcher}" for matcher in self._matchers)
472
+ return (
473
+ f"MovingWindowCallRatePolicy(rates=[{rates_info}], current_bucket_count={current_bucket_count}, "
474
+ f"matchers=[{matcher_str}])"
475
+ )
476
+
366
477
 
367
478
  class AbstractAPIBudget(abc.ABC):
368
479
  """Interface to some API where a client allowed to have N calls per T interval.
@@ -415,6 +526,23 @@ class APIBudget(AbstractAPIBudget):
415
526
  self._policies = policies
416
527
  self._maximum_attempts_to_acquire = maximum_attempts_to_acquire
417
528
 
529
+ def _extract_endpoint(self, request: Any) -> str:
530
+ """Extract the endpoint URL from the request if available."""
531
+ endpoint = None
532
+ try:
533
+ # If the request is already a PreparedRequest, it should have a URL.
534
+ if isinstance(request, requests.PreparedRequest):
535
+ endpoint = request.url
536
+ # If it's a requests.Request, we call prepare() to extract the URL.
537
+ elif isinstance(request, requests.Request):
538
+ prepared = request.prepare()
539
+ endpoint = prepared.url
540
+ except Exception as e:
541
+ logger.debug(f"Error extracting endpoint: {e}")
542
+ if endpoint:
543
+ return endpoint
544
+ return "unknown endpoint"
545
+
418
546
  def get_matching_policy(self, request: Any) -> Optional[AbstractCallRatePolicy]:
419
547
  for policy in self._policies:
420
548
  if policy.matches(request):
@@ -428,20 +556,24 @@ class APIBudget(AbstractAPIBudget):
428
556
  Matchers will be called sequentially in the same order they were added.
429
557
  The first matcher that returns True will
430
558
 
431
- :param request:
432
- :param block: when true (default) will block the current thread until call credit is available
433
- :param timeout: if provided will limit maximum time in block, otherwise will wait until credit is available
434
- :raises: CallRateLimitHit - when no calls left and if timeout was set the waiting time exceed the timeout
559
+ :param request: the API request
560
+ :param block: when True (default) will block until a call credit is available
561
+ :param timeout: if provided, limits maximum waiting time; otherwise, waits indefinitely
562
+ :raises: CallRateLimitHit if the call credit cannot be acquired within the timeout
435
563
  """
436
564
 
437
565
  policy = self.get_matching_policy(request)
566
+ endpoint = self._extract_endpoint(request)
438
567
  if policy:
568
+ logger.debug(f"Acquiring call for endpoint {endpoint} using policy: {policy}")
439
569
  self._do_acquire(request=request, policy=policy, block=block, timeout=timeout)
440
570
  elif self._policies:
441
- logger.info("no policies matched with requests, allow call by default")
571
+ logger.debug(
572
+ f"No policies matched for endpoint {endpoint} (request: {request}). Allowing call by default."
573
+ )
442
574
 
443
575
  def update_from_response(self, request: Any, response: Any) -> None:
444
- """Update budget information based on response from API
576
+ """Update budget information based on the API response.
445
577
 
446
578
  :param request: the initial request that triggered this response
447
579
  :param response: response from the API
@@ -451,15 +583,17 @@ class APIBudget(AbstractAPIBudget):
451
583
  def _do_acquire(
452
584
  self, request: Any, policy: AbstractCallRatePolicy, block: bool, timeout: Optional[float]
453
585
  ) -> None:
454
- """Internal method to try to acquire a call credit
586
+ """Internal method to try to acquire a call credit.
455
587
 
456
- :param request:
457
- :param policy:
458
- :param block:
459
- :param timeout:
588
+ :param request: the API request
589
+ :param policy: the matching rate-limiting policy
590
+ :param block: indicates whether to block until a call credit is available
591
+ :param timeout: maximum time to wait if blocking
592
+ :raises: CallRateLimitHit if unable to acquire a call credit
460
593
  """
461
594
  last_exception = None
462
- # sometimes we spend all budget before a second attempt, so we have few more here
595
+ endpoint = self._extract_endpoint(request)
596
+ # sometimes we spend all budget before a second attempt, so we have a few more attempts
463
597
  for attempt in range(1, self._maximum_attempts_to_acquire):
464
598
  try:
465
599
  policy.try_acquire(request, weight=1)
@@ -471,20 +605,24 @@ class APIBudget(AbstractAPIBudget):
471
605
  time_to_wait = min(timedelta(seconds=timeout), exc.time_to_wait)
472
606
  else:
473
607
  time_to_wait = exc.time_to_wait
474
-
475
- time_to_wait = max(
476
- timedelta(0), time_to_wait
477
- ) # sometimes we get negative duration
478
- logger.info(
479
- "reached call limit %s. going to sleep for %s", exc.rate, time_to_wait
608
+ # Ensure we never sleep for a negative duration.
609
+ time_to_wait = max(timedelta(0), time_to_wait)
610
+ logger.debug(
611
+ f"Policy {policy} reached call limit for endpoint {endpoint} ({exc.rate}). "
612
+ f"Sleeping for {time_to_wait} on attempt {attempt}."
480
613
  )
481
614
  time.sleep(time_to_wait.total_seconds())
482
615
  else:
616
+ logger.debug(
617
+ f"Policy {policy} reached call limit for endpoint {endpoint} ({exc.rate}) "
618
+ f"and blocking is disabled."
619
+ )
483
620
  raise
484
621
 
485
622
  if last_exception:
486
- logger.info(
487
- "we used all %s attempts to acquire and failed", self._maximum_attempts_to_acquire
623
+ logger.debug(
624
+ f"Exhausted all {self._maximum_attempts_to_acquire} attempts to acquire a call for endpoint {endpoint} "
625
+ f"using policy: {policy}"
488
626
  )
489
627
  raise last_exception
490
628
 
@@ -496,7 +634,7 @@ class HttpAPIBudget(APIBudget):
496
634
  self,
497
635
  ratelimit_reset_header: str = "ratelimit-reset",
498
636
  ratelimit_remaining_header: str = "ratelimit-remaining",
499
- status_codes_for_ratelimit_hit: tuple[int] = (429,),
637
+ status_codes_for_ratelimit_hit: list[int] = [429],
500
638
  **kwargs: Any,
501
639
  ):
502
640
  """Constructor
@@ -423,8 +423,6 @@ class HttpStream(Stream, CheckpointMixin, ABC):
423
423
  stream_slice: Optional[Mapping[str, Any]] = None,
424
424
  stream_state: Optional[Mapping[str, Any]] = None,
425
425
  ) -> Iterable[StreamData]:
426
- partition, _, _ = self._extract_slice_fields(stream_slice=stream_slice)
427
-
428
426
  stream_state = stream_state or {}
429
427
  pagination_complete = False
430
428
  next_page_token = None
@@ -438,6 +436,7 @@ class HttpStream(Stream, CheckpointMixin, ABC):
438
436
 
439
437
  cursor = self.get_cursor()
440
438
  if cursor and isinstance(cursor, SubstreamResumableFullRefreshCursor):
439
+ partition, _, _ = self._extract_slice_fields(stream_slice=stream_slice)
441
440
  # Substreams checkpoint state by marking an entire parent partition as completed so that on the subsequent attempt
442
441
  # after a failure, completed parents are skipped and the sync can make progress
443
442
  cursor.close_slice(StreamSlice(cursor_slice={}, partition=partition))