airbyte-cdk 6.12.4.dev0__py3-none-any.whl → 6.13.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. airbyte_cdk/__init__.py +93 -34
  2. airbyte_cdk/cli/source_declarative_manifest/__init__.py +0 -1
  3. airbyte_cdk/models/__init__.py +10 -11
  4. airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +1 -1
  5. airbyte_cdk/sources/declarative/auth/__init__.py +2 -5
  6. airbyte_cdk/sources/declarative/auth/oauth.py +27 -12
  7. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +25 -65
  8. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +78 -1
  9. airbyte_cdk/sources/declarative/decoders/__init__.py +21 -3
  10. airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +97 -0
  11. airbyte_cdk/sources/declarative/extractors/__init__.py +10 -2
  12. airbyte_cdk/sources/declarative/extractors/record_filter.py +5 -3
  13. airbyte_cdk/sources/declarative/incremental/__init__.py +10 -6
  14. airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +0 -14
  15. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +49 -2
  16. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +96 -80
  17. airbyte_cdk/sources/declarative/partition_routers/__init__.py +23 -5
  18. airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +65 -0
  19. airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py +19 -5
  20. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +3 -1
  21. airbyte_cdk/sources/declarative/requesters/paginators/__init__.py +14 -3
  22. airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +52 -35
  23. airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +10 -7
  24. airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +9 -4
  25. airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py +9 -3
  26. airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +11 -6
  27. airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +16 -5
  28. airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +14 -13
  29. airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +7 -8
  30. airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +10 -7
  31. airbyte_cdk/sources/declarative/requesters/request_options/__init__.py +12 -3
  32. airbyte_cdk/sources/declarative/resolvers/__init__.py +31 -8
  33. airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +20 -14
  34. airbyte_cdk/sources/declarative/retrievers/__init__.py +5 -2
  35. airbyte_cdk/sources/declarative/retrievers/async_retriever.py +9 -32
  36. airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +72 -65
  37. airbyte_cdk/sources/declarative/schema/__init__.py +14 -2
  38. airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +20 -3
  39. airbyte_cdk/sources/file_based/availability_strategy/__init__.py +9 -2
  40. airbyte_cdk/sources/file_based/discovery_policy/__init__.py +6 -2
  41. airbyte_cdk/sources/file_based/file_types/__init__.py +12 -3
  42. airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py +3 -1
  43. airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +5 -1
  44. airbyte_cdk/sources/message/__init__.py +7 -1
  45. airbyte_cdk/sources/streams/__init__.py +1 -1
  46. airbyte_cdk/sources/streams/checkpoint/__init__.py +2 -3
  47. airbyte_cdk/sources/streams/concurrent/cursor.py +0 -1
  48. airbyte_cdk/sources/streams/http/__init__.py +2 -2
  49. airbyte_cdk/sources/streams/http/error_handlers/__init__.py +2 -2
  50. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +8 -3
  51. airbyte_cdk/test/mock_http/__init__.py +1 -1
  52. airbyte_cdk/test/mock_http/mocker.py +3 -1
  53. airbyte_cdk/test/mock_http/response_builder.py +1 -1
  54. airbyte_cdk/utils/__init__.py +1 -1
  55. {airbyte_cdk-6.12.4.dev0.dist-info → airbyte_cdk-6.13.0.dev0.dist-info}/METADATA +2 -2
  56. {airbyte_cdk-6.12.4.dev0.dist-info → airbyte_cdk-6.13.0.dev0.dist-info}/RECORD +59 -58
  57. airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +0 -344
  58. {airbyte_cdk-6.12.4.dev0.dist-info → airbyte_cdk-6.13.0.dev0.dist-info}/LICENSE.txt +0 -0
  59. {airbyte_cdk-6.12.4.dev0.dist-info → airbyte_cdk-6.13.0.dev0.dist-info}/WHEEL +0 -0
  60. {airbyte_cdk-6.12.4.dev0.dist-info → airbyte_cdk-6.13.0.dev0.dist-info}/entry_points.txt +0 -0
@@ -1,344 +0,0 @@
1
- import copy
2
- import logging
3
-
4
- #
5
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
6
- #
7
- import threading
8
- from collections import OrderedDict
9
- from copy import deepcopy
10
- from typing import Any, Callable, Iterable, Mapping, MutableMapping, Optional
11
-
12
- from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
13
- from airbyte_cdk.sources.declarative.incremental.declarative_cursor import DeclarativeCursor
14
- from airbyte_cdk.sources.declarative.incremental.global_substream_cursor import (
15
- Timer,
16
- iterate_with_last_flag_and_state,
17
- )
18
- from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter
19
- from airbyte_cdk.sources.message import MessageRepository
20
- from airbyte_cdk.sources.streams.checkpoint.per_partition_key_serializer import (
21
- PerPartitionKeySerializer,
22
- )
23
- from airbyte_cdk.sources.streams.concurrent.cursor import Cursor, CursorField
24
- from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
25
- from airbyte_cdk.sources.types import Record, StreamSlice, StreamState
26
-
27
- logger = logging.getLogger("airbyte")
28
-
29
-
30
class ConcurrentCursorFactory:
    """Wrap a callable that builds a `Cursor` for a given stream state."""

    def __init__(self, create_function: Callable[..., Cursor]):
        # The callable is stored untouched; `create` invokes it with a single
        # `stream_state` keyword argument.
        self._create_function = create_function

    def create(self, stream_state: Mapping[str, Any]) -> Cursor:
        """Return the cursor produced by the wrapped callable for `stream_state`."""
        build_cursor = self._create_function
        return build_cursor(stream_state=stream_state)
36
-
37
-
38
class ConcurrentPerPartitionCursor(Cursor):
    """
    Manages state per partition when a stream has many partitions, to prevent data loss or duplication.

    **Partition Limitation and Limit Reached Logic**

    - **DEFAULT_MAX_PARTITIONS_NUMBER**: The maximum number of partitions to keep in memory (default is 10,000).
    - **_cursor_per_partition**: An ordered dictionary that stores cursors for each partition.
    - **_over_limit**: A counter that increments each time an oldest partition is removed when the limit is exceeded.

    The class ensures that the number of partitions tracked does not exceed the `DEFAULT_MAX_PARTITIONS_NUMBER` to prevent excessive memory usage.

    - When the number of partitions exceeds the limit, the oldest partitions are removed from `_cursor_per_partition`, and `_over_limit` is incremented accordingly.
    - The `limit_reached` method returns `True` when `_over_limit` exceeds `DEFAULT_MAX_PARTITIONS_NUMBER`, indicating that the global cursor should be used instead of per-partition cursors.

    This approach avoids unnecessary switching to a global cursor due to temporary spikes in partition counts, ensuring that switching is only done when a sustained high number of partitions is observed.
    """

    DEFAULT_MAX_PARTITIONS_NUMBER = 10000
    _NO_STATE: Mapping[str, Any] = {}
    _NO_CURSOR_STATE: Mapping[str, Any] = {}
    _KEY = 0
    _VALUE = 1

    def __init__(
        self,
        cursor_factory: ConcurrentCursorFactory,
        partition_router: PartitionRouter,
        stream_name: str,
        stream_namespace: Optional[str],
        stream_state: Any,
        message_repository: MessageRepository,
        connector_state_manager: ConnectorStateManager,
        cursor_field: CursorField,
    ) -> None:
        """
        Store the collaborators, create empty bookkeeping structures and apply `stream_state`.

        Args:
            cursor_factory: Used to create one cursor per partition.
            partition_router: Source of the partitions; also receives the initial state.
            stream_name: Name used when updating and emitting the stream state.
            stream_namespace: Namespace used when updating and emitting the stream state.
            stream_state: Initial state; see `_set_initial_state` for the accepted formats.
            message_repository: Receives the state messages emitted by this cursor.
            connector_state_manager: Updated with `self.state` before a state message is created.
            cursor_field: Cursor field shared by all per-partition cursors.
        """
        self._global_cursor: Optional[Mapping[str, Any]] = {}
        self._stream_name = stream_name
        self._stream_namespace = stream_namespace
        self._message_repository = message_repository
        self._connector_state_manager = connector_state_manager
        self._cursor_field = cursor_field

        self._cursor_factory = cursor_factory
        self._partition_router = partition_router

        # The dict is ordered to ensure that once the maximum number of partitions is reached,
        # the oldest partitions can be efficiently removed, maintaining the most recent partitions.
        self._cursor_per_partition: OrderedDict[str, Cursor] = OrderedDict()
        self._state = {"states": []}
        # One semaphore per partition key: released once per generated slice and acquired
        # once per closed slice, so a value of 0 means no generated slice is still open.
        self._semaphore_per_partition: OrderedDict[str, threading.Semaphore] = OrderedDict()
        # Keys of the partitions whose last slice has already been generated.
        self._finished_partitions: set[str] = set()
        self._lock = threading.Lock()
        self._timer = Timer()
        self._new_global_cursor: Optional[Any] = None
        self._lookback_window = 0
        self._parent_state = None
        self._over_limit = 0
        self._partition_serializer = PerPartitionKeySerializer()

        self._set_initial_state(stream_state)

    @property
    def cursor_field(self) -> CursorField:
        """Cursor field passed at construction time."""
        return self._cursor_field

    @property
    def state(self) -> MutableMapping[str, Any]:
        """
        Build the state mapping from the per-partition cursors.

        The result always contains `"states"` (one entry per partition with a non-empty
        cursor state) and, when set, `"state"` (global cursor), `"lookback_window"` and
        `"parent_state"`.
        """
        states = []
        for partition_key, cursor in self._cursor_per_partition.items():
            cursor_state = cursor._connector_state_converter.convert_to_state_message(
                self.cursor_field, cursor.state
            )
            # Partitions without any cursor state are left out of the emitted state.
            if cursor_state:
                states.append(
                    {
                        "partition": self._to_dict(partition_key),
                        "cursor": copy.deepcopy(cursor_state),
                    }
                )
        state: dict[str, Any] = {"states": states}

        if self._global_cursor:
            state["state"] = self._global_cursor
        if self._lookback_window is not None:
            state["lookback_window"] = self._lookback_window
        if self._parent_state is not None:
            state["parent_state"] = self._parent_state
        return state

    def close_partition(self, partition: Partition) -> None:
        """
        Close `partition` on its per-partition cursor and update the candidate global cursor.

        Once every generated slice of the partition has been closed (its semaphore is back
        to 0) and the partition's last slice has been generated, the partition's cursor state
        replaces `_new_global_cursor` if it is greater for the cursor field key.

        Raises:
            KeyError: If no cursor or semaphore is registered for the partition.
        """
        partition_key = self._to_partition_key(partition._stream_slice.partition)
        # Diagnostics go through the logger rather than `print`, which writes to stdout.
        logger.debug("Closing partition %s", partition_key)

        cursor = self._cursor_per_partition[partition_key]
        cursor.close_partition(partition=partition)
        with self._lock:
            semaphore = self._semaphore_per_partition[partition_key]
            semaphore.acquire()
            cursor_state = cursor._connector_state_converter.convert_to_state_message(
                cursor._cursor_field, cursor.state
            )
            if partition_key in self._finished_partitions and semaphore._value == 0:
                cursor_key = self.cursor_field.cursor_field_key
                if (
                    self._new_global_cursor is None
                    or self._new_global_cursor[cursor_key] < cursor_state[cursor_key]
                ):
                    self._new_global_cursor = copy.deepcopy(cursor_state)

    def ensure_at_least_one_state_emitted(self) -> None:
        """
        The platform expect to have at least one state message on successful syncs. Hence, whatever happens, we expect this method to be
        called.
        """
        # Only promote the candidate global cursor when no partition has open slices left.
        if not any(semaphore._value for semaphore in self._semaphore_per_partition.values()):
            self._global_cursor = self._new_global_cursor
            self._lookback_window = self._timer.finish()
            self._parent_state = self._partition_router.get_stream_state()
        self._emit_state_message()

    def _emit_state_message(self) -> None:
        """Push `self.state` to the state manager and emit the resulting state message."""
        self._connector_state_manager.update_state_for_stream(
            self._stream_name,
            self._stream_namespace,
            self.state,
        )
        state_message = self._connector_state_manager.create_state_message(
            self._stream_name, self._stream_namespace
        )
        self._message_repository.emit_message(state_message)

    def stream_slices(self) -> Iterable[StreamSlice]:
        """Yield the slices of every partition returned by the partition router."""
        slices = self._partition_router.stream_slices()
        self._timer.start()
        for partition in slices:
            yield from self.generate_slices_from_partition(partition)

    def generate_slices_from_partition(self, partition: StreamSlice) -> Iterable[StreamSlice]:
        """
        Yield one `StreamSlice` per cursor slice of `partition`.

        A cursor (seeded from the global cursor when there is one) and a semaphore are
        created for partitions seen for the first time.
        """
        # Ensure the maximum number of partitions is not exceeded
        self._ensure_partition_limit()

        partition_key = self._to_partition_key(partition.partition)
        cursor = self._cursor_per_partition.get(partition_key)
        if not cursor:
            partition_state = self._global_cursor or self._NO_CURSOR_STATE
            cursor = self._create_cursor(partition_state)
            self._cursor_per_partition[partition_key] = cursor
            self._semaphore_per_partition[partition_key] = threading.Semaphore(0)

        for cursor_slice, is_last_slice, _ in iterate_with_last_flag_and_state(
            cursor.stream_slices(),
            lambda: None,
        ):
            # Count the slice as open before handing it out; `close_partition` acquires it back.
            self._semaphore_per_partition[partition_key].release()
            if is_last_slice:
                self._finished_partitions.add(partition_key)
            yield StreamSlice(
                partition=partition, cursor_slice=cursor_slice, extra_fields=partition.extra_fields
            )

    def _ensure_partition_limit(self) -> None:
        """
        Ensure the maximum number of partitions is not exceeded. If so, the oldest added partition will be dropped.
        """
        # NOTE(review): only `_cursor_per_partition` is pruned here. The matching entries in
        # `_semaphore_per_partition` and `_finished_partitions` are kept, and `close_partition`
        # / `observe` index `_cursor_per_partition` directly, so they raise `KeyError` for a
        # dropped partition. Confirm whether that is intended.
        while len(self._cursor_per_partition) > self.DEFAULT_MAX_PARTITIONS_NUMBER - 1:
            self._over_limit += 1
            # Remove the oldest partition
            oldest_partition = self._cursor_per_partition.popitem(last=False)[0]
            logger.warning(
                "The maximum number of partitions has been reached. "
                "Dropping the oldest partition: %s. Over limit: %s.",
                oldest_partition,
                self._over_limit,
            )

    def limit_reached(self) -> bool:
        """Return True once more than `DEFAULT_MAX_PARTITIONS_NUMBER` partitions were dropped."""
        return self._over_limit > self.DEFAULT_MAX_PARTITIONS_NUMBER

    def _set_initial_state(self, stream_state: StreamState) -> None:
        """
        Set the initial state for the cursors.

        This method initializes the state for each partition cursor using the provided stream state.
        If a partition state is provided in the stream state, it will update the corresponding partition cursor with this state.

        Additionally, it sets the parent state for partition routers that are based on parent streams. If a partition router
        does not have parent streams, this step will be skipped due to the default PartitionRouter implementation.

        Args:
            stream_state (StreamState): The state of the streams to be set. The format of the stream state should be:
                {
                    "states": [
                        {
                            "partition": {
                                "partition_key": "value"
                            },
                            "cursor": {
                                "last_updated": "2023-05-27T00:00:00Z"
                            }
                        }
                    ],
                    "parent_state": {
                        "parent_stream_name": {
                            "last_updated": "2023-05-27T00:00:00Z"
                        }
                    }
                }
        """
        if not stream_state:
            return

        if "states" not in stream_state:
            # We assume that `stream_state` is in a global format that can be applied to all partitions.
            # Example: {"global_state_format_key": "global_state_format_value"}
            self._global_cursor = deepcopy(stream_state)
            self._new_global_cursor = deepcopy(stream_state)
        else:
            for state in stream_state["states"]:
                partition_key = self._to_partition_key(state["partition"])
                self._cursor_per_partition[partition_key] = self._create_cursor(state["cursor"])
                self._semaphore_per_partition[partition_key] = threading.Semaphore(0)

            # set default state for missing partitions if it is per partition with fallback to global
            if "state" in stream_state:
                self._global_cursor = deepcopy(stream_state["state"])
                self._new_global_cursor = deepcopy(stream_state["state"])

        # Set parent state for partition routers based on parent streams
        self._partition_router.set_initial_state(stream_state)

    def observe(self, record: Record) -> None:
        """
        Forward `record` to the cursor of the partition it belongs to.

        Raises:
            KeyError: If no cursor is registered for the record's partition.
        """
        # Called for every record, so no per-record output is produced here.
        partition_key = self._to_partition_key(record.associated_slice.partition)
        self._cursor_per_partition[partition_key].observe(record)

    def _to_partition_key(self, partition: Mapping[str, Any]) -> str:
        """Serialize a partition mapping into the string key used by the internal dicts."""
        return self._partition_serializer.to_partition_key(partition)

    def _to_dict(self, partition_key: str) -> Mapping[str, Any]:
        """Turn a serialized partition key back into its partition mapping."""
        return self._partition_serializer.to_partition(partition_key)

    def _create_cursor(self, cursor_state: Any) -> Cursor:
        """Create a cursor from a deep copy of `cursor_state` so the caller's object is not shared."""
        return self._cursor_factory.create(stream_state=deepcopy(cursor_state))

    def should_be_synced(self, record: Record) -> bool:
        """Delegate to the cursor of the record's partition."""
        return self._get_cursor(record).should_be_synced(record)

    def is_greater_than_or_equal(self, first: Record, second: Record) -> bool:
        """
        Compare two records of the same partition using that partition's cursor.

        Raises:
            ValueError: If either record has no associated slice or the partitions differ.
        """
        if not first.associated_slice or not second.associated_slice:
            raise ValueError(
                f"Both records should have an associated slice but got {first.associated_slice} and {second.associated_slice}"
            )
        if first.associated_slice.partition != second.associated_slice.partition:
            raise ValueError(
                f"To compare records, partition should be the same but got {first.associated_slice.partition} and {second.associated_slice.partition}"
            )

        return self._get_cursor(first).is_greater_than_or_equal(
            self._convert_record_to_cursor_record(first),
            self._convert_record_to_cursor_record(second),
        )

    @staticmethod
    def _convert_record_to_cursor_record(record: Record) -> Record:
        """Copy `record` with an empty partition, keeping only the cursor slice (if any)."""
        return Record(
            record.data,
            StreamSlice(partition={}, cursor_slice=record.associated_slice.cursor_slice)
            if record.associated_slice
            else None,
        )

    def _get_cursor(self, record: Record) -> Cursor:
        """
        Return the cursor registered for the record's partition.

        Raises:
            ValueError: If the record has no associated slice or its partition is unknown.
        """
        if not record.associated_slice:
            raise ValueError(
                "Invalid state as stream slices that are emitted should refer to an existing cursor"
            )
        partition_key = self._to_partition_key(record.associated_slice.partition)
        if partition_key not in self._cursor_per_partition:
            raise ValueError(
                "Invalid state as stream slices that are emitted should refer to an existing cursor"
            )
        return self._cursor_per_partition[partition_key]