airbyte-cdk 6.8.1rc9__py3-none-any.whl → 6.8.2.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. airbyte_cdk/cli/source_declarative_manifest/_run.py +11 -5
  2. airbyte_cdk/config_observation.py +1 -1
  3. airbyte_cdk/connector_builder/main.py +1 -1
  4. airbyte_cdk/connector_builder/message_grouper.py +10 -10
  5. airbyte_cdk/destinations/destination.py +1 -1
  6. airbyte_cdk/destinations/vector_db_based/embedder.py +2 -2
  7. airbyte_cdk/destinations/vector_db_based/writer.py +12 -4
  8. airbyte_cdk/entrypoint.py +7 -6
  9. airbyte_cdk/logger.py +2 -2
  10. airbyte_cdk/sources/abstract_source.py +1 -1
  11. airbyte_cdk/sources/config.py +1 -1
  12. airbyte_cdk/sources/connector_state_manager.py +9 -4
  13. airbyte_cdk/sources/declarative/auth/oauth.py +1 -1
  14. airbyte_cdk/sources/declarative/auth/selective_authenticator.py +6 -1
  15. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +76 -28
  16. airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +10 -4
  17. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +16 -17
  18. airbyte_cdk/sources/declarative/decoders/noop_decoder.py +4 -1
  19. airbyte_cdk/sources/declarative/extractors/record_filter.py +3 -5
  20. airbyte_cdk/sources/declarative/incremental/__init__.py +3 -0
  21. airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +270 -0
  22. airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +8 -6
  23. airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +9 -0
  24. airbyte_cdk/sources/declarative/interpolation/jinja.py +35 -36
  25. airbyte_cdk/sources/declarative/interpolation/macros.py +1 -1
  26. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +71 -17
  27. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +13 -7
  28. airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +1 -1
  29. airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +8 -6
  30. airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +1 -1
  31. airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +2 -2
  32. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +1 -1
  33. airbyte_cdk/sources/declarative/retrievers/async_retriever.py +5 -2
  34. airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +1 -1
  35. airbyte_cdk/sources/declarative/spec/spec.py +1 -1
  36. airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +0 -1
  37. airbyte_cdk/sources/embedded/base_integration.py +3 -2
  38. airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +12 -4
  39. airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +18 -7
  40. airbyte_cdk/sources/file_based/file_types/avro_parser.py +14 -11
  41. airbyte_cdk/sources/file_based/file_types/csv_parser.py +3 -3
  42. airbyte_cdk/sources/file_based/file_types/excel_parser.py +11 -5
  43. airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +1 -1
  44. airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +2 -2
  45. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +6 -3
  46. airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +1 -1
  47. airbyte_cdk/sources/http_logger.py +3 -3
  48. airbyte_cdk/sources/streams/concurrent/abstract_stream.py +5 -2
  49. airbyte_cdk/sources/streams/concurrent/adapters.py +6 -3
  50. airbyte_cdk/sources/streams/concurrent/availability_strategy.py +9 -3
  51. airbyte_cdk/sources/streams/concurrent/cursor.py +10 -1
  52. airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +2 -2
  53. airbyte_cdk/sources/streams/core.py +17 -14
  54. airbyte_cdk/sources/streams/http/http.py +19 -19
  55. airbyte_cdk/sources/streams/http/http_client.py +4 -48
  56. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +2 -1
  57. airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +62 -33
  58. airbyte_cdk/sources/utils/record_helper.py +1 -1
  59. airbyte_cdk/sources/utils/schema_helpers.py +1 -1
  60. airbyte_cdk/sources/utils/transform.py +34 -15
  61. airbyte_cdk/test/entrypoint_wrapper.py +11 -6
  62. airbyte_cdk/test/mock_http/response_builder.py +1 -1
  63. airbyte_cdk/utils/airbyte_secrets_utils.py +1 -1
  64. airbyte_cdk/utils/event_timing.py +10 -10
  65. airbyte_cdk/utils/message_utils.py +4 -3
  66. airbyte_cdk/utils/spec_schema_transformations.py +3 -2
  67. airbyte_cdk/utils/traced_exception.py +14 -12
  68. airbyte_cdk-6.8.2.dev1.dist-info/METADATA +111 -0
  69. {airbyte_cdk-6.8.1rc9.dist-info → airbyte_cdk-6.8.2.dev1.dist-info}/RECORD +72 -71
  70. airbyte_cdk-6.8.1rc9.dist-info/METADATA +0 -307
  71. {airbyte_cdk-6.8.1rc9.dist-info → airbyte_cdk-6.8.2.dev1.dist-info}/LICENSE.txt +0 -0
  72. {airbyte_cdk-6.8.1rc9.dist-info → airbyte_cdk-6.8.2.dev1.dist-info}/WHEEL +0 -0
  73. {airbyte_cdk-6.8.1rc9.dist-info → airbyte_cdk-6.8.2.dev1.dist-info}/entry_points.txt +0 -0
airbyte_cdk/sources/declarative/extractors/record_filter.py
@@ -59,13 +59,11 @@ class ClientSideIncrementalRecordFilterDecorator(RecordFilter):
 
     def __init__(
         self,
-        date_time_based_cursor: DatetimeBasedCursor,
-        substream_cursor: Optional[Union[PerPartitionWithGlobalCursor, GlobalSubstreamCursor]],
+        cursor: Union[DatetimeBasedCursor, PerPartitionWithGlobalCursor, GlobalSubstreamCursor],
         **kwargs: Any,
     ):
         super().__init__(**kwargs)
-        self._date_time_based_cursor = date_time_based_cursor
-        self._substream_cursor = substream_cursor
+        self._cursor = cursor
 
     def filter_records(
         self,
@@ -77,7 +75,7 @@ class ClientSideIncrementalRecordFilterDecorator(RecordFilter):
         records = (
             record
             for record in records
-            if (self._substream_cursor or self._date_time_based_cursor).should_be_synced(
+            if self._cursor.should_be_synced(
                 # Record is created on the fly to align with the cursors interface; stream name is ignored as we don't need it here
                 # Record stream name is empty because it is not used during the filtering
                 Record(data=record, associated_slice=stream_slice, stream_name="")
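
With the two constructor arguments collapsed into a single `cursor`, callers no longer need the `substream_cursor or date_time_based_cursor` fallback. A minimal wiring sketch (the cursor instance, `config`, and `parameters` values are illustrative, and the keyword names assume the fields `RecordFilter` already defines):

    # Hypothetical wiring; `datetime_cursor` and `config` are built elsewhere.
    record_filter = ClientSideIncrementalRecordFilterDecorator(
        cursor=datetime_cursor,  # DatetimeBasedCursor, PerPartitionWithGlobalCursor, or GlobalSubstreamCursor
        config=config,           # connector config, forwarded to RecordFilter
        parameters={},
        condition="",            # optional interpolated filter condition inherited from RecordFilter
    )
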
airbyte_cdk/sources/declarative/incremental/__init__.py
@@ -2,6 +2,7 @@
 # Copyright (c) 2022 Airbyte, Inc., all rights reserved.
 #
 
+from airbyte_cdk.sources.declarative.incremental.concurrent_partition_cursor import ConcurrentCursorFactory, ConcurrentPerPartitionCursor
 from airbyte_cdk.sources.declarative.incremental.datetime_based_cursor import DatetimeBasedCursor
 from airbyte_cdk.sources.declarative.incremental.declarative_cursor import DeclarativeCursor
 from airbyte_cdk.sources.declarative.incremental.global_substream_cursor import GlobalSubstreamCursor
@@ -14,6 +15,8 @@ from airbyte_cdk.sources.declarative.incremental.resumable_full_refresh_cursor i
 
 __all__ = [
     "CursorFactory",
+    "ConcurrentCursorFactory",
+    "ConcurrentPerPartitionCursor",
     "DatetimeBasedCursor",
     "DeclarativeCursor",
     "GlobalSubstreamCursor",
airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py (new file)
@@ -0,0 +1,270 @@
+import copy
+
+#
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+#
+import logging
+from collections import OrderedDict
+from typing import Any, Callable, Iterable, Mapping, MutableMapping, Optional
+
+from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
+from airbyte_cdk.sources.declarative.incremental.declarative_cursor import DeclarativeCursor
+from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter
+from airbyte_cdk.sources.message import MessageRepository
+from airbyte_cdk.sources.streams.checkpoint.per_partition_key_serializer import (
+    PerPartitionKeySerializer,
+)
+from airbyte_cdk.sources.streams.concurrent.cursor import Cursor, CursorField
+from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
+from airbyte_cdk.sources.types import Record, StreamSlice, StreamState
+
+logger = logging.getLogger("airbyte")
+
+
+class ConcurrentCursorFactory:
+    def __init__(self, create_function: Callable[..., Cursor]):
+        self._create_function = create_function
+
+    def create(self, stream_state: Mapping[str, Any]) -> Cursor:
+        return self._create_function(stream_state=stream_state)
+
+
+class ConcurrentPerPartitionCursor(Cursor):
+    """
+    Manages state per partition when a stream has many partitions, to prevent data loss or duplication.
+
+    **Partition Limitation and Limit Reached Logic**
+
+    - **DEFAULT_MAX_PARTITIONS_NUMBER**: The maximum number of partitions to keep in memory (default is 10,000).
+    - **_cursor_per_partition**: An ordered dictionary that stores cursors for each partition.
+    - **_over_limit**: A counter that increments each time the oldest partition is removed because the limit was exceeded.
+
+    The class ensures that the number of partitions tracked does not exceed `DEFAULT_MAX_PARTITIONS_NUMBER` to prevent excessive memory usage.
+
+    - When the number of partitions exceeds the limit, the oldest partitions are removed from `_cursor_per_partition`, and `_over_limit` is incremented accordingly.
+    - The `limit_reached` method returns `True` when `_over_limit` exceeds `DEFAULT_MAX_PARTITIONS_NUMBER`, indicating that the global cursor should be used instead of per-partition cursors.
+
+    This approach avoids unnecessary switching to a global cursor due to temporary spikes in partition counts, ensuring that switching happens only when a sustained high number of partitions is observed.
+    """
+
+    DEFAULT_MAX_PARTITIONS_NUMBER = 10000
+    _NO_STATE: Mapping[str, Any] = {}
+    _NO_CURSOR_STATE: Mapping[str, Any] = {}
+    _KEY = 0
+    _VALUE = 1
+    _state_to_migrate_from: Mapping[str, Any] = {}
+
+    def __init__(
+        self,
+        cursor_factory: ConcurrentCursorFactory,
+        partition_router: PartitionRouter,
+        stream_name: str,
+        stream_namespace: Optional[str],
+        stream_state: Any,
+        message_repository: MessageRepository,
+        connector_state_manager: ConnectorStateManager,
+        cursor_field: CursorField,
+    ) -> None:
+        self._stream_name = stream_name
+        self._stream_namespace = stream_namespace
+        self._message_repository = message_repository
+        self._connector_state_manager = connector_state_manager
+        self._cursor_field = cursor_field
+
+        self._cursor_factory = cursor_factory
+        self._partition_router = partition_router
+
+        # The dict is ordered to ensure that once the maximum number of partitions is reached,
+        # the oldest partitions can be efficiently removed, maintaining the most recent partitions.
+        self._cursor_per_partition: OrderedDict[str, Cursor] = OrderedDict()
+        self._over_limit = 0
+        self._partition_serializer = PerPartitionKeySerializer()
+
+        self._set_initial_state(stream_state)
+
+    @property
+    def cursor_field(self) -> CursorField:
+        return self._cursor_field
+
+    @property
+    def state(self) -> MutableMapping[str, Any]:
+        states = []
+        for partition_tuple, cursor in self._cursor_per_partition.items():
+            cursor_state = cursor._connector_state_converter.convert_to_state_message(
+                cursor._cursor_field, cursor.state
+            )
+            if cursor_state:
+                states.append(
+                    {
+                        "partition": self._to_dict(partition_tuple),
+                        "cursor": copy.deepcopy(cursor_state),
+                    }
+                )
+        state: dict[str, Any] = {"states": states}
+        return state
+
+    def close_partition(self, partition: Partition) -> None:
+        self._cursor_per_partition[
+            self._to_partition_key(partition._stream_slice.partition)
+        ].close_partition_without_emit(partition=partition)
+
+    def ensure_at_least_one_state_emitted(self) -> None:
+        """
+        The platform expects at least one state message on successful syncs. Hence, whatever happens, we expect this method to be
+        called.
+        """
+        self._emit_state_message()
+
+    def _emit_state_message(self) -> None:
+        self._connector_state_manager.update_state_for_stream(
+            self._stream_name,
+            self._stream_namespace,
+            self.state,
+        )
+        state_message = self._connector_state_manager.create_state_message(
+            self._stream_name, self._stream_namespace
+        )
+        self._message_repository.emit_message(state_message)
+
+    def stream_slices(self) -> Iterable[StreamSlice]:
+        slices = self._partition_router.stream_slices()
+        for partition in slices:
+            yield from self.generate_slices_from_partition(partition)
+
+    def generate_slices_from_partition(self, partition: StreamSlice) -> Iterable[StreamSlice]:
+        # Ensure the maximum number of partitions is not exceeded
+        self._ensure_partition_limit()
+
+        cursor = self._cursor_per_partition.get(self._to_partition_key(partition.partition))
+        if not cursor:
+            partition_state = (
+                self._state_to_migrate_from
+                if self._state_to_migrate_from
+                else self._NO_CURSOR_STATE
+            )
+            cursor = self._create_cursor(partition_state)
+            self._cursor_per_partition[self._to_partition_key(partition.partition)] = cursor
+
+        for cursor_slice in cursor.stream_slices():
+            yield StreamSlice(
+                partition=partition, cursor_slice=cursor_slice, extra_fields=partition.extra_fields
+            )
+
+    def _ensure_partition_limit(self) -> None:
+        """
+        Ensure the maximum number of partitions is not exceeded. If so, the oldest added partition will be dropped.
+        """
+        while len(self._cursor_per_partition) > self.DEFAULT_MAX_PARTITIONS_NUMBER - 1:
+            self._over_limit += 1
+            oldest_partition = self._cursor_per_partition.popitem(last=False)[
+                0
+            ]  # Remove the oldest partition
+            logger.warning(
+                f"The maximum number of partitions has been reached. Dropping the oldest partition: {oldest_partition}. Over limit: {self._over_limit}."
+            )
+
+    def limit_reached(self) -> bool:
+        return self._over_limit > self.DEFAULT_MAX_PARTITIONS_NUMBER
+
+    def _set_initial_state(self, stream_state: StreamState) -> None:
+        """
+        Set the initial state for the cursors.
+
+        This method initializes the state for each partition cursor using the provided stream state.
+        If a partition state is provided in the stream state, it will update the corresponding partition cursor with this state.
+
+        Additionally, it sets the parent state for partition routers that are based on parent streams. If a partition router
+        does not have parent streams, this step will be skipped due to the default PartitionRouter implementation.
+
+        Args:
+            stream_state (StreamState): The state of the streams to be set. The format of the stream state should be:
+                {
+                    "states": [
+                        {
+                            "partition": {
+                                "partition_key": "value"
+                            },
+                            "cursor": {
+                                "last_updated": "2023-05-27T00:00:00Z"
+                            }
+                        }
+                    ],
+                    "parent_state": {
+                        "parent_stream_name": {
+                            "last_updated": "2023-05-27T00:00:00Z"
+                        }
+                    }
+                }
+        """
+        if not stream_state:
+            return
+
+        if "states" not in stream_state:
+            # We assume that `stream_state` is in a global format that can be applied to all partitions.
+            # Example: {"global_state_format_key": "global_state_format_value"}
+            self._state_to_migrate_from = stream_state
+
+        else:
+            for state in stream_state["states"]:
+                self._cursor_per_partition[self._to_partition_key(state["partition"])] = (
+                    self._create_cursor(state["cursor"])
+                )
+
+            # Set the default state for missing partitions if the state is per partition with a fallback to global
+            if "state" in stream_state:
+                self._state_to_migrate_from = stream_state["state"]
+
+        # Set parent state for partition routers based on parent streams
+        self._partition_router.set_initial_state(stream_state)
+
+    def observe(self, record: Record) -> None:
+        self._cursor_per_partition[
+            self._to_partition_key(record.associated_slice.partition)
+        ].observe(record)
+
+    def _to_partition_key(self, partition: Mapping[str, Any]) -> str:
+        return self._partition_serializer.to_partition_key(partition)
+
+    def _to_dict(self, partition_key: str) -> Mapping[str, Any]:
+        return self._partition_serializer.to_partition(partition_key)
+
+    def _create_cursor(self, cursor_state: Any) -> DeclarativeCursor:
+        cursor = self._cursor_factory.create(stream_state=cursor_state)
+        return cursor
+
+    def should_be_synced(self, record: Record) -> bool:
+        return self._get_cursor(record).should_be_synced(record)
+
+    def is_greater_than_or_equal(self, first: Record, second: Record) -> bool:
+        if not first.associated_slice or not second.associated_slice:
+            raise ValueError(
+                f"Both records should have an associated slice but got {first.associated_slice} and {second.associated_slice}"
+            )
+        if first.associated_slice.partition != second.associated_slice.partition:
+            raise ValueError(
+                f"To compare records, partition should be the same but got {first.associated_slice.partition} and {second.associated_slice.partition}"
+            )
+
+        return self._get_cursor(first).is_greater_than_or_equal(
+            self._convert_record_to_cursor_record(first),
+            self._convert_record_to_cursor_record(second),
+        )
+
+    @staticmethod
+    def _convert_record_to_cursor_record(record: Record) -> Record:
+        return Record(
+            record.data,
+            StreamSlice(partition={}, cursor_slice=record.associated_slice.cursor_slice)
+            if record.associated_slice
+            else None,
+        )
+
+    def _get_cursor(self, record: Record) -> Cursor:
+        if not record.associated_slice:
+            raise ValueError(
+                "Invalid state as stream slices that are emitted should refer to an existing cursor"
+            )
+        partition_key = self._to_partition_key(record.associated_slice.partition)
+        if partition_key not in self._cursor_per_partition:
+            raise ValueError(
+                "Invalid state as stream slices that are emitted should refer to an existing cursor"
+            )
+        cursor = self._cursor_per_partition[partition_key]
+        return cursor
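
The partition-limit behaviour documented in the class docstring can be seen in isolation with a plain OrderedDict. This standalone sketch mirrors the eviction rule of `_ensure_partition_limit` and the `limit_reached` check, with a limit of 3 instead of 10,000; it is not the CDK class itself:

    from collections import OrderedDict

    MAX_PARTITIONS = 3  # stand-in for DEFAULT_MAX_PARTITIONS_NUMBER (10,000 in the CDK)
    cursors: OrderedDict = OrderedDict()
    over_limit = 0

    for key in ["a", "b", "c", "d", "e"]:
        # Evict the oldest entries before inserting, mirroring _ensure_partition_limit().
        while len(cursors) > MAX_PARTITIONS - 1:
            over_limit += 1
            dropped, _ = cursors.popitem(last=False)  # FIFO: drop the oldest partition
            print(f"dropped partition {dropped!r}; over_limit={over_limit}")
        cursors[key] = f"cursor-for-{key}"

    # limit_reached() only returns True after sustained overflow, i.e. when the
    # eviction count itself exceeds the maximum:
    print(over_limit > MAX_PARTITIONS)  # False here: only two evictions happened
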
airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py
@@ -133,8 +133,8 @@ class DatetimeBasedCursor(DeclarativeCursor):
         :param stream_state: The state of the stream as returned by get_stream_state
         """
         self._cursor = (
-            stream_state.get(self.cursor_field.eval(self.config)) if stream_state else None
-        )  # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__
+            stream_state.get(self.cursor_field.eval(self.config)) if stream_state else None  # type: ignore [union-attr]
+        )
 
     def observe(self, stream_slice: StreamSlice, record: Record) -> None:
         """
@@ -158,8 +158,10 @@ class DatetimeBasedCursor(DeclarativeCursor):
             )
             if (
                 self._is_within_daterange_boundaries(
-                    record, stream_slice.get(start_field), stream_slice.get(end_field)
-                )  # type: ignore # we know that stream_slices for these cursors will use a string representing an unparsed date
+                    record,
+                    stream_slice.get(start_field),  # type: ignore [arg-type]
+                    stream_slice.get(end_field),  # type: ignore [arg-type]
+                )
                 and is_highest_observed_cursor_value
             ):
                 self._highest_observed_cursor_field_value = record_cursor_value
@@ -368,9 +370,9 @@ class DatetimeBasedCursor(DeclarativeCursor):
                 self._partition_field_start.eval(self.config)
             )
         if self.end_time_option and self.end_time_option.inject_into == option_type:
-            options[self.end_time_option.field_name.eval(config=self.config)] = stream_slice.get(
+            options[self.end_time_option.field_name.eval(config=self.config)] = stream_slice.get(  # type: ignore [union-attr]
                 self._partition_field_end.eval(self.config)
-            )  # type: ignore # field_name is always casted to an interpolated string
+            )
         return options
 
     def should_be_synced(self, record: Record) -> bool:
airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py
@@ -303,6 +303,15 @@ class PerPartitionCursor(DeclarativeCursor):
         raise ValueError("A partition needs to be provided in order to get request body json")
 
     def should_be_synced(self, record: Record) -> bool:
+        if self._to_partition_key(record.associated_slice.partition) not in self._cursor_per_partition:
+            partition_state = (
+                self._state_to_migrate_from
+                if self._state_to_migrate_from
+                else self._NO_CURSOR_STATE
+            )
+            cursor = self._create_cursor(partition_state)
+
+            self._cursor_per_partition[self._to_partition_key(record.associated_slice.partition)] = cursor
         return self._get_cursor(record).should_be_synced(
             self._convert_record_to_cursor_record(record)
         )
airbyte_cdk/sources/declarative/interpolation/jinja.py
@@ -4,7 +4,7 @@
 
 import ast
 from functools import cache
-from typing import Any, Mapping, Optional, Set, Tuple, Type
+from typing import Any, Mapping, Optional, Tuple, Type
 
 from jinja2 import meta
 from jinja2.environment import Template
@@ -27,35 +27,7 @@ class StreamPartitionAccessEnvironment(SandboxedEnvironment):
     def is_safe_attribute(self, obj: Any, attr: str, value: Any) -> bool:
         if attr in ["_partition"]:
             return True
-        return super().is_safe_attribute(obj, attr, value)  # type: ignore # for some reason, mypy says 'Returning Any from function declared to return "bool"'
-
-
-# These aliases are used to deprecate existing keywords without breaking all existing connectors.
-_ALIASES = {
-    "stream_interval": "stream_slice",  # Use stream_interval to access incremental_sync values
-    "stream_partition": "stream_slice",  # Use stream_partition to access partition router's values
-}
-
-# These extensions are not installed so they're not currently a problem,
-# but we're still explicitely removing them from the jinja context.
-# At worst, this is documentation that we do NOT want to include these extensions because of the potential security risks
-_RESTRICTED_EXTENSIONS = ["jinja2.ext.loopcontrols"]  # Adds support for break continue in loops
-
-# By default, these Python builtin functions are available in the Jinja context.
-# We explicitely remove them because of the potential security risk.
-# Please add a unit test to test_jinja.py when adding a restriction.
-_RESTRICTED_BUILTIN_FUNCTIONS = [
-    "range"
-]  # The range function can cause very expensive computations
-
-_ENVIRONMENT = StreamPartitionAccessEnvironment()
-_ENVIRONMENT.filters.update(**filters)
-_ENVIRONMENT.globals.update(**macros)
-
-for extension in _RESTRICTED_EXTENSIONS:
-    _ENVIRONMENT.extensions.pop(extension, None)
-for builtin in _RESTRICTED_BUILTIN_FUNCTIONS:
-    _ENVIRONMENT.globals.pop(builtin, None)
+        return super().is_safe_attribute(obj, attr, value)
 
 
 class JinjaInterpolation(Interpolation):
@@ -76,6 +48,34 @@ class JinjaInterpolation(Interpolation):
     Additional information on jinja templating can be found at https://jinja.palletsprojects.com/en/3.1.x/templates/#
     """
 
+    # These aliases are used to deprecate existing keywords without breaking all existing connectors.
+    ALIASES = {
+        "stream_interval": "stream_slice",  # Use stream_interval to access incremental_sync values
+        "stream_partition": "stream_slice",  # Use stream_partition to access partition router's values
+    }
+
+    # These extensions are not installed so they're not currently a problem,
+    # but we're still explicitly removing them from the jinja context.
+    # At worst, this is documentation that we do NOT want to include these extensions because of the potential security risks
+    RESTRICTED_EXTENSIONS = ["jinja2.ext.loopcontrols"]  # Adds support for break and continue in loops
+
+    # By default, these Python builtin functions are available in the Jinja context.
+    # We explicitly remove them because of the potential security risk.
+    # Please add a unit test to test_jinja.py when adding a restriction.
+    RESTRICTED_BUILTIN_FUNCTIONS = [
+        "range"
+    ]  # The range function can cause very expensive computations
+
+    def __init__(self) -> None:
+        self._environment = StreamPartitionAccessEnvironment()
+        self._environment.filters.update(**filters)
+        self._environment.globals.update(**macros)
+
+        for extension in self.RESTRICTED_EXTENSIONS:
+            self._environment.extensions.pop(extension, None)
+        for builtin in self.RESTRICTED_BUILTIN_FUNCTIONS:
+            self._environment.globals.pop(builtin, None)
+
     def eval(
         self,
         input_str: str,
@@ -86,7 +86,7 @@ class JinjaInterpolation(Interpolation):
     ) -> Any:
         context = {"config": config, **additional_parameters}
 
-        for alias, equivalent in _ALIASES.items():
+        for alias, equivalent in self.ALIASES.items():
             if alias in context:
                 # This is unexpected. We could ignore or log a warning, but failing loudly should result in fewer surprises
                 raise ValueError(
@@ -105,7 +105,6 @@ class JinjaInterpolation(Interpolation):
                 raise Exception(f"Expected a string, got {input_str}")
         except UndefinedError:
             pass
-
         # If result is empty or resulted in an undefined error, evaluate and return the default string
         return self._literal_eval(self._eval(default, context), valid_types)
 
@@ -133,16 +132,16 @@ class JinjaInterpolation(Interpolation):
         return s
 
     @cache
-    def _find_undeclared_variables(self, s: Optional[str]) -> Set[str]:
+    def _find_undeclared_variables(self, s: Optional[str]) -> set[str]:
         """
         Find undeclared variables and cache them
         """
-        ast = _ENVIRONMENT.parse(s)  # type: ignore # parse is able to handle None
+        ast = self._environment.parse(s)  # type: ignore # parse is able to handle None
         return meta.find_undeclared_variables(ast)
 
     @cache
-    def _compile(self, s: str) -> Template:
+    def _compile(self, s: Optional[str]) -> Template:
         """
         We must cache the Jinja Template ourselves because we're using `from_string` instead of a template loader
         """
-        return _ENVIRONMENT.from_string(s)
+        return self._environment.from_string(s)  # type: ignore [arg-type]  # Expected `str | Template` but passed `str | None`
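
Because the sandboxed environment now lives on each instance instead of at module level, the alias and restriction setup travels with the object. A minimal usage sketch (the template string and config values are illustrative):

    interpolation = JinjaInterpolation()
    result = interpolation.eval(
        "{{ config['api_base'] }}",  # template to render
        config={"api_base": "https://api.example.com"},
    )
    print(result)  # https://api.example.com
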
airbyte_cdk/sources/declarative/interpolation/macros.py
@@ -116,7 +116,7 @@ def duration(datestring: str) -> Union[datetime.timedelta, isodate.Duration]:
     Usage:
     `"{{ now_utc() - duration('P1D') }}"`
     """
-    return parse_duration(datestring)  # type: ignore # mypy thinks this returns Any for some reason
+    return parse_duration(datestring)
 
 
 def format_datetime(
airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py
@@ -81,6 +81,8 @@ from airbyte_cdk.sources.declarative.extractors.record_selector import (
 )
 from airbyte_cdk.sources.declarative.incremental import (
     ChildPartitionResumableFullRefreshCursor,
+    ConcurrentCursorFactory,
+    ConcurrentPerPartitionCursor,
     CursorFactory,
     DatetimeBasedCursor,
     DeclarativeCursor,
@@ -396,7 +398,7 @@ class ModelToComponentFactory:
         self._disable_retries = disable_retries
         self._disable_cache = disable_cache
         self._disable_resumable_full_refresh = disable_resumable_full_refresh
-        self._message_repository = message_repository or InMemoryMessageRepository(  # type: ignore
+        self._message_repository = message_repository or InMemoryMessageRepository(
             self._evaluate_log_level(emit_connector_builder_messages)
         )
 
@@ -644,7 +646,7 @@ class ModelToComponentFactory:
             declarative_stream.incremental_sync,  # type: ignore # was already checked. Migration can be applied only to incremental streams.
             config,
             declarative_stream.parameters,  # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any]
-        )  # type: ignore # The retriever type was already checked
+        )
 
     def create_session_token_authenticator(
         self, model: SessionTokenAuthenticatorModel, config: Config, name: str, **kwargs: Any
@@ -674,7 +676,7 @@ class ModelToComponentFactory:
             return ModelToComponentFactory.create_bearer_authenticator(
                 BearerAuthenticatorModel(type="BearerAuthenticator", api_token=""),  # type: ignore # $parameters has a default value
                 config,
-                token_provider=token_provider,  # type: ignore # $parameters defaults to None
+                token_provider=token_provider,
             )
         else:
             return ModelToComponentFactory.create_api_key_authenticator(
@@ -821,7 +823,6 @@ class ModelToComponentFactory:
             input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats,
             is_sequential_state=True,
             cursor_granularity=cursor_granularity,
-            # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
         )
 
         start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime]
@@ -894,7 +895,7 @@ class ModelToComponentFactory:
             stream_name=stream_name,
             stream_namespace=stream_namespace,
             stream_state=stream_state,
-            message_repository=self._message_repository,  # type: ignore # message_repository is always instantiated with a value by factory
+            message_repository=self._message_repository,
             connector_state_manager=state_manager,
             connector_state_converter=connector_state_converter,
             cursor_field=cursor_field,
@@ -906,6 +907,62 @@ class ModelToComponentFactory:
             cursor_granularity=cursor_granularity,
         )
 
+    def create_concurrent_cursor_from_perpartition_cursor(
+        self,
+        state_manager: ConnectorStateManager,
+        model_type: Type[BaseModel],
+        component_definition: ComponentDefinition,
+        stream_name: str,
+        stream_namespace: Optional[str],
+        config: Config,
+        stream_state: MutableMapping[str, Any],
+        partition_router: PartitionRouter,
+        **kwargs: Any,
+    ) -> ConcurrentPerPartitionCursor:
+        component_type = component_definition.get("type")
+        if component_definition.get("type") != model_type.__name__:
+            raise ValueError(
+                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
+            )
+
+        datetime_based_cursor_model = model_type.parse_obj(component_definition)
+
+        if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel):
+            raise ValueError(
+                f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}"
+            )
+
+        interpolated_cursor_field = InterpolatedString.create(
+            datetime_based_cursor_model.cursor_field,
+            parameters=datetime_based_cursor_model.parameters or {},
+        )
+        cursor_field = CursorField(interpolated_cursor_field.eval(config=config))
+
+        # Create the cursor factory
+        cursor_factory = ConcurrentCursorFactory(
+            partial(
+                self.create_concurrent_cursor_from_datetime_based_cursor,
+                state_manager=state_manager,
+                model_type=model_type,
+                component_definition=component_definition,
+                stream_name=stream_name,
+                stream_namespace=stream_namespace,
+                config=config,
+            )
+        )
+
+        # Return the concurrent cursor and state converter
+        return ConcurrentPerPartitionCursor(
+            cursor_factory=cursor_factory,
+            partition_router=partition_router,
+            stream_name=stream_name,
+            stream_namespace=stream_namespace,
+            stream_state=stream_state,
+            message_repository=self._message_repository,  # type: ignore
+            connector_state_manager=state_manager,
+            cursor_field=cursor_field,
+        )
+
     @staticmethod
     def create_constant_backoff_strategy(
         model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any
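
A hypothetical call site for the new factory method (the `factory`, `stream_definition`, and `partition_router` names are illustrative; the component definition must describe a `DatetimeBasedCursor`):

    concurrent_cursor = factory.create_concurrent_cursor_from_perpartition_cursor(
        state_manager=state_manager,
        model_type=DatetimeBasedCursorModel,
        component_definition=stream_definition["incremental_sync"],
        stream_name="orders",
        stream_namespace=None,
        config=config,
        stream_state=state_manager.get_stream_state("orders", None),
        partition_router=partition_router,
    )
    # One slice per partition and datetime window; cursors are created lazily per partition.
    slices = list(concurrent_cursor.stream_slices())
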
@@ -1188,17 +1245,14 @@ class ModelToComponentFactory:
                 raise ValueError(
                     "Unsupported Slicer is used. PerPartitionWithGlobalCursor should be used here instead"
                 )
+            cursor = (
+                combined_slicers
+                if isinstance(combined_slicers, (PerPartitionWithGlobalCursor, GlobalSubstreamCursor))
+                else self._create_component_from_model(model=model.incremental_sync, config=config)
+            )
+
             client_side_incremental_sync = {
-                "date_time_based_cursor": self._create_component_from_model(
-                    model=model.incremental_sync, config=config
-                ),
-                "substream_cursor": (
-                    combined_slicers
-                    if isinstance(
-                        combined_slicers, (PerPartitionWithGlobalCursor, GlobalSubstreamCursor)
-                    )
-                    else None
-                ),
+                "cursor": cursor
             }
 
             if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel):
@@ -1705,7 +1759,7 @@ class ModelToComponentFactory:
             refresh_token=model.refresh_token,
             scopes=model.scopes,
             token_expiry_date=model.token_expiry_date,
-            token_expiry_date_format=model.token_expiry_date_format,  # type: ignore
+            token_expiry_date_format=model.token_expiry_date_format,
             token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format),
             token_refresh_endpoint=model.token_refresh_endpoint,
             config=config,
@@ -1912,7 +1966,7 @@ class ModelToComponentFactory:
         if (
             not isinstance(stream_slicer, DatetimeBasedCursor)
             or type(stream_slicer) is not DatetimeBasedCursor
-        ):
+        ) and not isinstance(stream_slicer, PerPartitionWithGlobalCursor):
             # Many of the custom component implementations of DatetimeBasedCursor override get_request_params() (or other methods).
             # Because we're decoupling RequestOptionsProvider from the Cursor, custom components will eventually need to reimplement
             # their own RequestOptionsProvider. However, right now the existing StreamSlicer/Cursor still can act as the SimpleRetriever's