airbyte-cdk 6.54.10__py3-none-any.whl → 6.54.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,7 +9,7 @@ import time
9
9
  from collections import OrderedDict
10
10
  from copy import deepcopy
11
11
  from datetime import timedelta
12
- from typing import Any, Callable, Iterable, Mapping, MutableMapping, Optional
12
+ from typing import Any, Callable, Iterable, List, Mapping, MutableMapping, Optional
13
13
 
14
14
  from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
15
15
  from airbyte_cdk.sources.declarative.incremental.global_substream_cursor import (
@@ -66,8 +66,8 @@ class ConcurrentPerPartitionCursor(Cursor):
66
66
  _GLOBAL_STATE_KEY = "state"
67
67
  _PERPARTITION_STATE_KEY = "states"
68
68
  _IS_PARTITION_DUPLICATION_LOGGED = False
69
- _KEY = 0
70
- _VALUE = 1
69
+ _PARENT_STATE = 0
70
+ _GENERATION_SEQUENCE = 1
71
71
 
72
72
  def __init__(
73
73
  self,
@@ -99,19 +99,29 @@ class ConcurrentPerPartitionCursor(Cursor):
99
99
  self._semaphore_per_partition: OrderedDict[str, threading.Semaphore] = OrderedDict()
100
100
 
101
101
  # Parent-state tracking: store each partition’s parent state in creation order
102
- self._partition_parent_state_map: OrderedDict[str, Mapping[str, Any]] = OrderedDict()
102
+ self._partition_parent_state_map: OrderedDict[str, tuple[Mapping[str, Any], int]] = (
103
+ OrderedDict()
104
+ )
105
+ self._parent_state: Optional[StreamState] = None
106
+
107
+ # Tracks when the last slice for partition is emitted
108
+ self._partitions_done_generating_stream_slices: set[str] = set()
109
+ # Used to track the index of partitions that are not closed yet
110
+ self._processing_partitions_indexes: List[int] = list()
111
+ self._generated_partitions_count: int = 0
112
+ # Dictionary to map partition keys to their index
113
+ self._partition_key_to_index: dict[str, int] = {}
103
114
 
104
- self._finished_partitions: set[str] = set()
105
115
  self._lock = threading.Lock()
106
- self._timer = Timer()
107
- self._new_global_cursor: Optional[StreamState] = None
108
116
  self._lookback_window: int = 0
109
- self._parent_state: Optional[StreamState] = None
117
+ self._new_global_cursor: Optional[StreamState] = None
110
118
  self._number_of_partitions: int = 0
111
119
  self._use_global_cursor: bool = use_global_cursor
112
120
  self._partition_serializer = PerPartitionKeySerializer()
121
+
113
122
  # Track the last time a state message was emitted
114
123
  self._last_emission_time: float = 0.0
124
+ self._timer = Timer()
115
125
 
116
126
  self._set_initial_state(stream_state)
117
127
 
@@ -157,60 +167,37 @@ class ConcurrentPerPartitionCursor(Cursor):
157
167
  self._cursor_per_partition[partition_key].close_partition(partition=partition)
158
168
  cursor = self._cursor_per_partition[partition_key]
159
169
  if (
160
- partition_key in self._finished_partitions
170
+ partition_key in self._partitions_done_generating_stream_slices
161
171
  and self._semaphore_per_partition[partition_key]._value == 0
162
172
  ):
163
173
  self._update_global_cursor(cursor.state[self.cursor_field.cursor_field_key])
164
174
 
175
+ # Clean up the partition if it is fully processed
176
+ self._cleanup_if_done(partition_key)
177
+
165
178
  self._check_and_update_parent_state()
166
179
 
167
180
  self._emit_state_message()
168
181
 
169
182
  def _check_and_update_parent_state(self) -> None:
170
- """
171
- Pop the leftmost partition state from _partition_parent_state_map only if
172
- *all partitions* up to (and including) that partition key in _semaphore_per_partition
173
- are fully finished (i.e. in _finished_partitions and semaphore._value == 0).
174
- Additionally, delete finished semaphores with a value of 0 to free up memory,
175
- as they are only needed to track errors and completion status.
176
- """
177
183
  last_closed_state = None
178
184
 
179
185
  while self._partition_parent_state_map:
180
- # Look at the earliest partition key in creation order
181
- earliest_key = next(iter(self._partition_parent_state_map))
182
-
183
- # Verify ALL partitions from the left up to earliest_key are finished
184
- all_left_finished = True
185
- for p_key, sem in list(
186
- self._semaphore_per_partition.items()
187
- ): # Use list to allow modification during iteration
188
- # If any earlier partition is still not finished, we must stop
189
- if p_key not in self._finished_partitions or sem._value != 0:
190
- all_left_finished = False
191
- break
192
- # Once we've reached earliest_key in the semaphore order, we can stop checking
193
- if p_key == earliest_key:
194
- break
195
-
196
- # If the partitions up to earliest_key are not all finished, break the while-loop
197
- if not all_left_finished:
198
- break
186
+ earliest_key, (candidate_state, candidate_seq) = next(
187
+ iter(self._partition_parent_state_map.items())
188
+ )
199
189
 
200
- # Pop the leftmost entry from parent-state map
201
- _, closed_parent_state = self._partition_parent_state_map.popitem(last=False)
202
- last_closed_state = closed_parent_state
190
+ # if any partition that started <= candidate_seq is still open, we must wait
191
+ if (
192
+ self._processing_partitions_indexes
193
+ and self._processing_partitions_indexes[0] <= candidate_seq
194
+ ):
195
+ break
203
196
 
204
- # Clean up finished semaphores with value 0 up to and including earliest_key
205
- for p_key in list(self._semaphore_per_partition.keys()):
206
- sem = self._semaphore_per_partition[p_key]
207
- if p_key in self._finished_partitions and sem._value == 0:
208
- del self._semaphore_per_partition[p_key]
209
- logger.debug(f"Deleted finished semaphore for partition {p_key} with value 0")
210
- if p_key == earliest_key:
211
- break
197
+ # safe to pop
198
+ self._partition_parent_state_map.popitem(last=False)
199
+ last_closed_state = candidate_state
212
200
 
213
- # Update _parent_state if we popped at least one partition
214
201
  if last_closed_state is not None:
215
202
  self._parent_state = last_closed_state
216
203
 
@@ -289,18 +276,24 @@ class ConcurrentPerPartitionCursor(Cursor):
289
276
  if not self._IS_PARTITION_DUPLICATION_LOGGED:
290
277
  logger.warning(f"Partition duplication detected for stream {self._stream_name}")
291
278
  self._IS_PARTITION_DUPLICATION_LOGGED = True
279
+ return
292
280
  else:
293
281
  self._semaphore_per_partition[partition_key] = threading.Semaphore(0)
294
282
 
295
283
  with self._lock:
284
+ seq = self._generated_partitions_count
285
+ self._generated_partitions_count += 1
286
+ self._processing_partitions_indexes.append(seq)
287
+ self._partition_key_to_index[partition_key] = seq
288
+
296
289
  if (
297
290
  len(self._partition_parent_state_map) == 0
298
291
  or self._partition_parent_state_map[
299
292
  next(reversed(self._partition_parent_state_map))
300
- ]
293
+ ][self._PARENT_STATE]
301
294
  != parent_state
302
295
  ):
303
- self._partition_parent_state_map[partition_key] = deepcopy(parent_state)
296
+ self._partition_parent_state_map[partition_key] = (deepcopy(parent_state), seq)
304
297
 
305
298
  for cursor_slice, is_last_slice, _ in iterate_with_last_flag_and_state(
306
299
  cursor.stream_slices(),
@@ -308,7 +301,7 @@ class ConcurrentPerPartitionCursor(Cursor):
308
301
  ):
309
302
  self._semaphore_per_partition[partition_key].release()
310
303
  if is_last_slice:
311
- self._finished_partitions.add(partition_key)
304
+ self._partitions_done_generating_stream_slices.add(partition_key)
312
305
  yield StreamSlice(
313
306
  partition=partition, cursor_slice=cursor_slice, extra_fields=partition.extra_fields
314
307
  )
@@ -338,14 +331,11 @@ class ConcurrentPerPartitionCursor(Cursor):
338
331
  while len(self._cursor_per_partition) > self.DEFAULT_MAX_PARTITIONS_NUMBER - 1:
339
332
  # Try removing finished partitions first
340
333
  for partition_key in list(self._cursor_per_partition.keys()):
341
- if partition_key in self._finished_partitions and (
342
- partition_key not in self._semaphore_per_partition
343
- or self._semaphore_per_partition[partition_key]._value == 0
344
- ):
334
+ if partition_key not in self._partition_key_to_index:
345
335
  oldest_partition = self._cursor_per_partition.pop(
346
336
  partition_key
347
337
  ) # Remove the oldest partition
348
- logger.warning(
338
+ logger.debug(
349
339
  f"The maximum number of partitions has been reached. Dropping the oldest finished partition: {oldest_partition}. Over limit: {self._number_of_partitions - self.DEFAULT_MAX_PARTITIONS_NUMBER}."
350
340
  )
351
341
  break
@@ -474,6 +464,25 @@ class ConcurrentPerPartitionCursor(Cursor):
474
464
  ):
475
465
  self._new_global_cursor = {self.cursor_field.cursor_field_key: copy.deepcopy(value)}
476
466
 
467
+ def _cleanup_if_done(self, partition_key: str) -> None:
468
+ """
469
+ Free every in-memory structure that belonged to a completed partition:
470
+ cursor, semaphore, flag inside `_finished_partitions`
471
+ """
472
+ if not (
473
+ partition_key in self._partitions_done_generating_stream_slices
474
+ and self._semaphore_per_partition[partition_key]._value == 0
475
+ ):
476
+ return
477
+
478
+ self._semaphore_per_partition.pop(partition_key, None)
479
+ self._partitions_done_generating_stream_slices.discard(partition_key)
480
+
481
+ seq = self._partition_key_to_index.pop(partition_key)
482
+ self._processing_partitions_indexes.remove(seq)
483
+
484
+ logger.debug(f"Partition {partition_key} fully processed and cleaned up.")
485
+
477
486
  def _to_partition_key(self, partition: Mapping[str, Any]) -> str:
478
487
  return self._partition_serializer.to_partition_key(partition)
479
488
 
@@ -147,16 +147,18 @@ class JinjaInterpolation(Interpolation):
147
147
  # It can be returned as is
148
148
  return s
149
149
 
150
+ @staticmethod
150
151
  @cache
151
- def _find_undeclared_variables(self, s: Optional[str]) -> Set[str]:
152
+ def _find_undeclared_variables(s: Optional[str]) -> Set[str]:
152
153
  """
153
154
  Find undeclared variables and cache them
154
155
  """
155
156
  ast = _ENVIRONMENT.parse(s) # type: ignore # parse is able to handle None
156
157
  return meta.find_undeclared_variables(ast)
157
158
 
159
+ @staticmethod
158
160
  @cache
159
- def _compile(self, s: str) -> Template:
161
+ def _compile(s: str) -> Template:
160
162
  """
161
163
  We must cache the Jinja Template ourselves because we're using `from_string` instead of a template loader
162
164
  """
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: airbyte-cdk
3
- Version: 6.54.10
3
+ Version: 6.54.11
4
4
  Summary: A framework for writing Airbyte Connectors.
5
5
  Home-page: https://airbyte.com
6
6
  License: MIT
@@ -111,7 +111,7 @@ airbyte_cdk/sources/declarative/extractors/record_selector.py,sha256=vCpwX1PVRFP
111
111
  airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py,sha256=WJyA2OYIEgFpVP5Y3o0tIj69AV6IKkn9B16MeXaEItI,6513
112
112
  airbyte_cdk/sources/declarative/extractors/type_transformer.py,sha256=d6Y2Rfg8pMVEEnHllfVksWZdNVOU55yk34O03dP9muY,1626
113
113
  airbyte_cdk/sources/declarative/incremental/__init__.py,sha256=U1oZKtBaEC6IACmvziY9Wzg7Z8EgF4ZuR7NwvjlB_Sk,1255
114
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py,sha256=8zxfM1aXSwdfTbpL9PHwgg43ZRbNUQRcX038D7YTI_0,22768
114
+ airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py,sha256=araWk039M89c6lQHEUltfM1VI_xGw9gZIDXRWWF6SkM,22591
115
115
  airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py,sha256=Rbe6lJLTtZ5en33MwZiB9-H9-AwDMNHgwBZs8EqhYqk,22172
116
116
  airbyte_cdk/sources/declarative/incremental/declarative_cursor.py,sha256=5Bhw9VRPyIuCaD0wmmq_L3DZsa-rJgtKSEUzSd8YYD0,536
117
117
  airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py,sha256=2tsE6FgXzemf4fZZ4uGtd8QpRBl9GJ2CRqSNJE5p0EI,16077
@@ -125,7 +125,7 @@ airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py,sha256=h36
125
125
  airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py,sha256=myVaNtFqxOAwrbp93rgd1dhkqyuvXvET9rsimQ89ktc,1873
126
126
  airbyte_cdk/sources/declarative/interpolation/interpolated_string.py,sha256=CQkHqGlfa87G6VYMtBAQWin7ECKpfMdrDcg0JO5_rhc,3212
127
127
  airbyte_cdk/sources/declarative/interpolation/interpolation.py,sha256=9IoeuWam3L6GyN10L6U8xNWXmkt9cnahSDNkez1OmFY,982
128
- airbyte_cdk/sources/declarative/interpolation/jinja.py,sha256=wm_W3_usYU_3eUdSpzrj21qsk8-vcK8TtWEHQsH7SY4,7245
128
+ airbyte_cdk/sources/declarative/interpolation/jinja.py,sha256=oFGKs3oX0xO6DOL4E9x8rhxwbEoRcgx4HJVIL1RQ9c4,7269
129
129
  airbyte_cdk/sources/declarative/interpolation/macros.py,sha256=xRcmjape4_WGmKMJpmBsKY0k4OHJDM46Hv3V-dlSz3w,5640
130
130
  airbyte_cdk/sources/declarative/manifest_declarative_source.py,sha256=ciXtM7Qhus170ZwP8B9Ac4VScX2FPBYvlbZRv_r376U,24692
131
131
  airbyte_cdk/sources/declarative/migrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -421,9 +421,9 @@ airbyte_cdk/utils/slice_hasher.py,sha256=EDxgROHDbfG-QKQb59m7h_7crN1tRiawdf5uU7G
421
421
  airbyte_cdk/utils/spec_schema_transformations.py,sha256=-5HTuNsnDBAhj-oLeQXwpTGA0HdcjFOf2zTEMUTTg_Y,816
422
422
  airbyte_cdk/utils/stream_status_utils.py,sha256=ZmBoiy5HVbUEHAMrUONxZvxnvfV9CesmQJLDTAIWnWw,1171
423
423
  airbyte_cdk/utils/traced_exception.py,sha256=C8uIBuCL_E4WnBAOPSxBicD06JAldoN9fGsQDp463OY,6292
424
- airbyte_cdk-6.54.10.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
425
- airbyte_cdk-6.54.10.dist-info/LICENSE_SHORT,sha256=aqF6D1NcESmpn-cqsxBtszTEnHKnlsp8L4x9wAh3Nxg,55
426
- airbyte_cdk-6.54.10.dist-info/METADATA,sha256=D26T_w15_WHbsqtSJzrHNySmgZg9FnltqLhbESGxdDg,6344
427
- airbyte_cdk-6.54.10.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
428
- airbyte_cdk-6.54.10.dist-info/entry_points.txt,sha256=AKWbEkHfpzzk9nF9tqBUaw1MbvTM4mGtEzmZQm0ZWvM,139
429
- airbyte_cdk-6.54.10.dist-info/RECORD,,
424
+ airbyte_cdk-6.54.11.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
425
+ airbyte_cdk-6.54.11.dist-info/LICENSE_SHORT,sha256=aqF6D1NcESmpn-cqsxBtszTEnHKnlsp8L4x9wAh3Nxg,55
426
+ airbyte_cdk-6.54.11.dist-info/METADATA,sha256=GAc-NcvDVzTVP4sUrNphva1hP0kyZt-ANEM5Qz5CgPY,6344
427
+ airbyte_cdk-6.54.11.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
428
+ airbyte_cdk-6.54.11.dist-info/entry_points.txt,sha256=AKWbEkHfpzzk9nF9tqBUaw1MbvTM4mGtEzmZQm0ZWvM,139
429
+ airbyte_cdk-6.54.11.dist-info/RECORD,,