airbyte-cdk 6.17.1.dev1__py3-none-any.whl → 6.18.0.dev1__py3-none-any.whl

@@ -20,9 +20,6 @@ from airbyte_cdk.sources.declarative.extractors.record_filter import (
     ClientSideIncrementalRecordFilterDecorator,
 )
 from airbyte_cdk.sources.declarative.incremental.datetime_based_cursor import DatetimeBasedCursor
-from airbyte_cdk.sources.declarative.incremental.per_partition_with_global import (
-    PerPartitionWithGlobalCursor,
-)
 from airbyte_cdk.sources.declarative.interpolation import InterpolatedString
 from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
@@ -307,72 +304,6 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                             cursor=final_state_cursor,
                         )
                     )
-                elif (
-                    incremental_sync_component_definition
-                    and incremental_sync_component_definition.get("type", "")
-                    == DatetimeBasedCursorModel.__name__
-                    and self._stream_supports_concurrent_partition_processing(
-                        declarative_stream=declarative_stream
-                    )
-                    and hasattr(declarative_stream.retriever, "stream_slicer")
-                    and isinstance(
-                        declarative_stream.retriever.stream_slicer, PerPartitionWithGlobalCursor
-                    )
-                ):
-                    stream_state = state_manager.get_stream_state(
-                        stream_name=declarative_stream.name, namespace=declarative_stream.namespace
-                    )
-                    partition_router = declarative_stream.retriever.stream_slicer._partition_router
-
-                    cursor = self._constructor.create_concurrent_cursor_from_perpartition_cursor(
-                        state_manager=state_manager,
-                        model_type=DatetimeBasedCursorModel,
-                        component_definition=incremental_sync_component_definition,
-                        stream_name=declarative_stream.name,
-                        stream_namespace=declarative_stream.namespace,
-                        config=config or {},
-                        stream_state=stream_state,
-                        partition_router=partition_router,
-                    )
-
-                    retriever = declarative_stream.retriever
-
-                    # This is an optimization so that we don't invoke any cursor or state management flows within the
-                    # low-code framework because state management is handled through the ConcurrentCursor.
-                    if declarative_stream and isinstance(retriever, SimpleRetriever):
-                        # Also a temporary hack. In the legacy Stream implementation, as part of the read,
-                        # set_initial_state() is called to instantiate incoming state on the cursor. Although we no
-                        # longer rely on the legacy low-code cursor for concurrent checkpointing, low-code components
-                        # like StopConditionPaginationStrategyDecorator and ClientSideIncrementalRecordFilterDecorator
-                        # still rely on a DatetimeBasedCursor that is properly initialized with state.
-                        if retriever.cursor:
-                            retriever.cursor.set_initial_state(stream_state=stream_state)
-                        # We zero it out here, but since this is a cursor reference, the state is still properly
-                        # instantiated for the other components that reference it
-                        retriever.cursor = None
-
-                    partition_generator = StreamSlicerPartitionGenerator(
-                        DeclarativePartitionFactory(
-                            declarative_stream.name,
-                            declarative_stream.get_json_schema(),
-                            retriever,
-                            self.message_repository,
-                        ),
-                        cursor,
-                    )
-
-                    concurrent_streams.append(
-                        DefaultStream(
-                            partition_generator=partition_generator,
-                            name=declarative_stream.name,
-                            json_schema=declarative_stream.get_json_schema(),
-                            availability_strategy=AlwaysAvailableAvailabilityStrategy(),
-                            primary_key=get_primary_key_from_stream(declarative_stream.primary_key),
-                            cursor_field=cursor.cursor_field.cursor_field_key,
-                            logger=self.logger,
-                            cursor=cursor,
-                        )
-                    )
                 else:
                     synchronous_streams.append(declarative_stream)
             else:
@@ -678,7 +678,7 @@ definitions:
     properties:
       type:
         type: string
-        enum: [ CustomSchemaNormalization ]
+        enum: [CustomSchemaNormalization]
      class_name:
        title: Class Name
        description: Fully-qualified name of the class that will be implementing the custom normalization. The format is `source_<name>.<package>.<class_name>`.
@@ -2014,6 +2014,20 @@ definitions:
     $parameters:
       type: object
       additionalProperties: true
+  JsonParser:
+    title: JsonParser
+    description: Parser used for parsing str, bytes, or bytearray data and returning data in a dictionary format.
+    type: object
+    additionalProperties: true
+    required:
+      - type
+    properties:
+      type:
+        type: string
+        enum: [JsonParser]
+      encoding:
+        type: string
+        default: utf-8
   ListPartitionRouter:
     title: List Partition Router
     description: A Partition router that specifies a list of attributes where each attribute describes a portion of the complete data set for a stream. During a sync, each value is iterated over and can be used as input to outbound API requests.
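For orientation, this is roughly the component definition the new JsonParser schema entry validates, written out as the Python dict a manifest parser would see. A hedged illustration, not taken from the diff; how the parser is nested under a decoder in a full manifest is an assumption.

    # Hypothetical manifest fragment matching the JsonParser schema above.
    # "encoding" is optional and defaults to "utf-8" per the schema.
    json_parser_definition = {
        "type": "JsonParser",
        "encoding": "utf-8",
    }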
@@ -7,9 +7,12 @@ from dataclasses import dataclass
 from io import BufferedIOBase, TextIOWrapper
 from typing import Any, Generator, MutableMapping, Optional

+import orjson
 import requests

+from airbyte_cdk.models import FailureType
 from airbyte_cdk.sources.declarative.decoders.decoder import Decoder
+from airbyte_cdk.utils import AirbyteTracedException

 logger = logging.getLogger("airbyte")

@@ -42,6 +45,31 @@ class GzipParser(Parser):
         yield from self.inner_parser.parse(gzipobj)


+@dataclass
+class JsonParser(Parser):
+    encoding: str = "utf-8"
+
+    def parse(self, data: BufferedIOBase) -> Generator[MutableMapping[str, Any], None, None]:
+        raw_data = data.read()
+        try:
+            body_json = orjson.loads(raw_data.decode(self.encoding))
+        except Exception:
+            try:
+                body_json = json.loads(raw_data.decode(self.encoding))
+            except Exception as exc:
+                raise AirbyteTracedException(
+                    message="Response JSON data failed to be parsed. See logs for more information.",
+                    internal_message=f"Response JSON data failed to be parsed: {exc=}, {raw_data=}",
+                    failure_type=FailureType.system_error,
+                    exception=exc,
+                )
+
+        if isinstance(body_json, list):
+            yield from body_json
+        else:
+            yield from [body_json]
+
+
 @dataclass
 class JsonLineParser(Parser):
     encoding: Optional[str] = "utf-8"
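A minimal usage sketch (not part of the diff) of the new parser, assuming only what the class above shows: a JSON object yields one mapping, a JSON array yields one mapping per element.

    # Feed bytes through JsonParser via an in-memory buffer; BytesIO stands in
    # for the response body a decoder would normally supply.
    from io import BytesIO

    from airbyte_cdk.sources.declarative.decoders.composite_raw_decoder import JsonParser

    parser = JsonParser(encoding="utf-8")
    assert list(parser.parse(BytesIO(b'{"id": 1}'))) == [{"id": 1}]
    assert list(parser.parse(BytesIO(b'[{"id": 1}, {"id": 2}]'))) == [{"id": 1}, {"id": 2}]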
@@ -59,11 +59,13 @@ class ClientSideIncrementalRecordFilterDecorator(RecordFilter):

     def __init__(
         self,
-        cursor: Union[DatetimeBasedCursor, PerPartitionWithGlobalCursor, GlobalSubstreamCursor],
+        date_time_based_cursor: DatetimeBasedCursor,
+        substream_cursor: Optional[Union[PerPartitionWithGlobalCursor, GlobalSubstreamCursor]],
         **kwargs: Any,
     ):
         super().__init__(**kwargs)
-        self._cursor = cursor
+        self._date_time_based_cursor = date_time_based_cursor
+        self._substream_cursor = substream_cursor

     def filter_records(
         self,
@@ -75,7 +77,7 @@ class ClientSideIncrementalRecordFilterDecorator(RecordFilter):
         records = (
             record
             for record in records
-            if self._cursor.should_be_synced(
+            if (self._substream_cursor or self._date_time_based_cursor).should_be_synced(
                 # Record is created on the fly to align with the cursor interface; stream name is ignored as we don't need it here
                 # Record stream name is empty because it is not used during the filtering
                 Record(data=record, associated_slice=stream_slice, stream_name="")
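The new filtering signature boils down to a fallback: use the substream cursor when one is configured, otherwise the datetime cursor. A toy sketch of that selection pattern (illustrative names only, not CDK code):

    # `or` picks the substream cursor when present, else the datetime cursor.
    class FakeCursor:
        def __init__(self, name: str) -> None:
            self.name = name

        def should_be_synced(self, record: dict) -> bool:
            return True  # a real cursor would compare the record's cursor value

    date_time_based_cursor = FakeCursor("datetime")
    substream_cursor = None  # e.g. no PerPartitionWithGlobalCursor configured

    chosen = substream_cursor or date_time_based_cursor
    assert chosen.name == "datetime"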
@@ -2,10 +2,6 @@
 # Copyright (c) 2022 Airbyte, Inc., all rights reserved.
 #

-from airbyte_cdk.sources.declarative.incremental.concurrent_partition_cursor import (
-    ConcurrentCursorFactory,
-    ConcurrentPerPartitionCursor,
-)
 from airbyte_cdk.sources.declarative.incremental.datetime_based_cursor import DatetimeBasedCursor
 from airbyte_cdk.sources.declarative.incremental.declarative_cursor import DeclarativeCursor
 from airbyte_cdk.sources.declarative.incremental.global_substream_cursor import (
@@ -25,8 +21,6 @@ from airbyte_cdk.sources.declarative.incremental.resumable_full_refresh_cursor import (

 __all__ = [
     "CursorFactory",
-    "ConcurrentCursorFactory",
-    "ConcurrentPerPartitionCursor",
     "DatetimeBasedCursor",
     "DeclarativeCursor",
     "GlobalSubstreamCursor",
@@ -303,21 +303,6 @@ class PerPartitionCursor(DeclarativeCursor):
         raise ValueError("A partition needs to be provided in order to get request body json")

     def should_be_synced(self, record: Record) -> bool:
-        if (
-            record.associated_slice
-            and self._to_partition_key(record.associated_slice.partition)
-            not in self._cursor_per_partition
-        ):
-            partition_state = (
-                self._state_to_migrate_from
-                if self._state_to_migrate_from
-                else self._NO_CURSOR_STATE
-            )
-            cursor = self._create_cursor(partition_state)
-
-            self._cursor_per_partition[
-                self._to_partition_key(record.associated_slice.partition)
-            ] = cursor
         return self._get_cursor(record).should_be_synced(
             self._convert_record_to_cursor_record(record)
         )
@@ -737,33 +737,43 @@ class KeysToSnakeCase(BaseModel):
     parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")


+class FlattenFields(BaseModel):
+    type: Literal["FlattenFields"]
+    flatten_lists: Optional[bool] = Field(
+        True,
+        description="Whether to flatten lists or leave it as is. Default is True.",
+        title="Flatten Lists",
+    )
+    parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
+
+
 class KeysReplace(BaseModel):
     type: Literal["KeysReplace"]
     old: str = Field(
         ...,
         description="Old value to replace.",
-        examples=[" ", "{{ record.id }}", "{{ config['id'] }}", "{{ stream_slice['id'] }}"],
+        examples=[
+            " ",
+            "{{ record.id }}",
+            "{{ config['id'] }}",
+            "{{ stream_slice['id'] }}",
+        ],
         title="Old value",
     )
     new: str = Field(
         ...,
         description="New value to set.",
-        examples=["_", "{{ record.id }}", "{{ config['id'] }}", "{{ stream_slice['id'] }}"],
+        examples=[
+            "_",
+            "{{ record.id }}",
+            "{{ config['id'] }}",
+            "{{ stream_slice['id'] }}",
+        ],
         title="New value",
     )
     parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")


-class FlattenFields(BaseModel):
-    type: Literal["FlattenFields"]
-    flatten_lists: Optional[bool] = Field(
-        True,
-        description="Whether to flatten lists or leave it as is. Default is True.",
-        title="Flatten Lists",
-    )
-    parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
-
-
 class IterableDecoder(BaseModel):
     type: Literal["IterableDecoder"]

@@ -795,6 +805,14 @@ class GzipJsonDecoder(BaseModel):
     parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")


+class JsonParser(BaseModel):
+    class Config:
+        extra = Extra.allow
+
+    type: Literal["JsonParser"]
+    encoding: Optional[str] = "utf-8"
+
+
 class MinMaxDatetime(BaseModel):
     type: Literal["MinMaxDatetime"]
     datetime: str = Field(
@@ -72,6 +72,7 @@ from airbyte_cdk.sources.declarative.decoders.composite_raw_decoder import (
     CsvParser,
     GzipParser,
     JsonLineParser,
+    JsonParser,
 )
 from airbyte_cdk.sources.declarative.extractors import (
     DpathExtractor,
@@ -84,8 +85,6 @@ from airbyte_cdk.sources.declarative.extractors.record_filter import (
 )
 from airbyte_cdk.sources.declarative.incremental import (
     ChildPartitionResumableFullRefreshCursor,
-    ConcurrentCursorFactory,
-    ConcurrentPerPartitionCursor,
     CursorFactory,
     DatetimeBasedCursor,
     DeclarativeCursor,
@@ -249,6 +248,9 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     JsonLineParser as JsonLineParserModel,
 )
+from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
+    JsonParser as JsonParserModel,
+)
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     JwtAuthenticator as JwtAuthenticatorModel,
 )
@@ -440,7 +442,6 @@ from airbyte_cdk.sources.message import (
     InMemoryMessageRepository,
     LogAppenderMessageRepositoryDecorator,
     MessageRepository,
-    NoopMessageRepository,
 )
 from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, CursorField
 from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import (
@@ -525,6 +526,7 @@ class ModelToComponentFactory:
             JsonDecoderModel: self.create_json_decoder,
             JsonlDecoderModel: self.create_jsonl_decoder,
             JsonLineParserModel: self.create_json_line_parser,
+            JsonParserModel: self.create_json_parser,
             GzipJsonDecoderModel: self.create_gzipjson_decoder,
             GzipParserModel: self.create_gzip_parser,
             KeysToLowerModel: self.create_keys_to_lower_transformation,
@@ -874,8 +876,6 @@ class ModelToComponentFactory:
         stream_namespace: Optional[str],
         config: Config,
         stream_state: MutableMapping[str, Any],
-        message_repository: Optional[MessageRepository] = None,
-        runtime_lookback_window: Optional[int] = None,
         **kwargs: Any,
     ) -> ConcurrentCursor:
         component_type = component_definition.get("type")
@@ -933,11 +933,6 @@ class ModelToComponentFactory:
         if evaluated_lookback_window:
             lookback_window = parse_duration(evaluated_lookback_window)

-        if runtime_lookback_window and lookback_window:
-            lookback_window = max(lookback_window, runtime_lookback_window)
-        elif runtime_lookback_window:
-            lookback_window = runtime_lookback_window
-
         connector_state_converter: DateTimeStreamStateConverter
         connector_state_converter = CustomFormatConcurrentStreamStateConverter(
             datetime_format=datetime_format,
@@ -1016,7 +1011,7 @@
             stream_name=stream_name,
             stream_namespace=stream_namespace,
             stream_state=stream_state,
-            message_repository=message_repository or self._message_repository,
+            message_repository=self._message_repository,
             connector_state_manager=state_manager,
             connector_state_converter=connector_state_converter,
             cursor_field=cursor_field,
@@ -1028,63 +1023,6 @@
             cursor_granularity=cursor_granularity,
         )

-    def create_concurrent_cursor_from_perpartition_cursor(
-        self,
-        state_manager: ConnectorStateManager,
-        model_type: Type[BaseModel],
-        component_definition: ComponentDefinition,
-        stream_name: str,
-        stream_namespace: Optional[str],
-        config: Config,
-        stream_state: MutableMapping[str, Any],
-        partition_router,
-        **kwargs: Any,
-    ) -> ConcurrentPerPartitionCursor:
-        component_type = component_definition.get("type")
-        if component_definition.get("type") != model_type.__name__:
-            raise ValueError(
-                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
-            )
-
-        datetime_based_cursor_model = model_type.parse_obj(component_definition)
-
-        if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel):
-            raise ValueError(
-                f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}"
-            )
-
-        interpolated_cursor_field = InterpolatedString.create(
-            datetime_based_cursor_model.cursor_field,
-            parameters=datetime_based_cursor_model.parameters or {},
-        )
-        cursor_field = CursorField(interpolated_cursor_field.eval(config=config))
-
-        # Create the cursor factory
-        cursor_factory = ConcurrentCursorFactory(
-            partial(
-                self.create_concurrent_cursor_from_datetime_based_cursor,
-                state_manager=state_manager,
-                model_type=model_type,
-                component_definition=component_definition,
-                stream_name=stream_name,
-                stream_namespace=stream_namespace,
-                config=config,
-                message_repository=NoopMessageRepository(),
-            )
-        )
-
-        # Return the concurrent cursor and state converter
-        return ConcurrentPerPartitionCursor(
-            cursor_factory=cursor_factory,
-            partition_router=partition_router,
-            stream_name=stream_name,
-            stream_namespace=stream_namespace,
-            stream_state=stream_state,
-            message_repository=self._message_repository,  # type: ignore
-            connector_state_manager=state_manager,
-            cursor_field=cursor_field,
-        )
-
     @staticmethod
     def create_constant_backoff_strategy(
         model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any
@@ -1367,15 +1305,18 @@ class ModelToComponentFactory:
                 raise ValueError(
                     "Unsupported Slicer is used. PerPartitionWithGlobalCursor should be used here instead"
                 )
-            cursor = (
-                combined_slicers
-                if isinstance(
-                    combined_slicers, (PerPartitionWithGlobalCursor, GlobalSubstreamCursor)
-                )
-                else self._create_component_from_model(model=model.incremental_sync, config=config)
-            )
-
-            client_side_incremental_sync = {"cursor": cursor}
+            client_side_incremental_sync = {
+                "date_time_based_cursor": self._create_component_from_model(
+                    model=model.incremental_sync, config=config
+                ),
+                "substream_cursor": (
+                    combined_slicers
+                    if isinstance(
+                        combined_slicers, (PerPartitionWithGlobalCursor, GlobalSubstreamCursor)
+                    )
+                    else None
+                ),
+            }

         if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel):
             cursor_model = model.incremental_sync
@@ -1812,6 +1753,11 @@ class ModelToComponentFactory:
     def create_json_decoder(model: JsonDecoderModel, config: Config, **kwargs: Any) -> JsonDecoder:
         return JsonDecoder(parameters={})

+    @staticmethod
+    def create_json_parser(model: JsonParserModel, config: Config, **kwargs: Any) -> JsonParser:
+        encoding = model.encoding or "utf-8"
+        return JsonParser(encoding=encoding)
+
     @staticmethod
     def create_jsonl_decoder(
         model: JsonlDecoderModel, config: Config, **kwargs: Any
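A short, hedged sketch of the new factory hook in isolation; constructing the Pydantic model directly like this is for illustration only:

    from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
        JsonParser as JsonParserModel,
    )
    from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import (
        ModelToComponentFactory,
    )

    # The model's encoding defaults to utf-8, and create_json_parser also
    # falls back to utf-8 when it is unset.
    parser = ModelToComponentFactory().create_json_parser(
        JsonParserModel(type="JsonParser"), config={}
    )
    assert parser.encoding == "utf-8"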
@@ -2191,7 +2137,7 @@ class ModelToComponentFactory:
         if (
             not isinstance(stream_slicer, DatetimeBasedCursor)
             or type(stream_slicer) is not DatetimeBasedCursor
-        ) and not isinstance(stream_slicer, PerPartitionWithGlobalCursor):
+        ):
             # Many of the custom component implementations of DatetimeBasedCursor override get_request_params() (or other methods).
             # Because we're decoupling RequestOptionsProvider from the Cursor, custom components will eventually need to reimplement
             # their own RequestOptionsProvider. However, right now the existing StreamSlicer/Cursor still can act as the SimpleRetriever's
@@ -160,7 +160,7 @@ class SimpleRetriever(Retriever):
             stream_slice,
             next_page_token,
             self._paginator.get_request_headers,
-            self.request_option_provider.get_request_headers,
+            self.stream_slicer.get_request_headers,
         )
         if isinstance(headers, str):
             raise ValueError("Request headers cannot be a string")
@@ -196,9 +196,7 @@ class ConcurrentCursor(Cursor):

     @property
     def state(self) -> MutableMapping[str, Any]:
-        return self._connector_state_converter.convert_to_state_message(
-            self.cursor_field, self._concurrent_state
-        )
+        return self._concurrent_state

     @property
     def cursor_field(self) -> CursorField:
@@ -243,10 +241,10 @@ class ConcurrentCursor(Cursor):
         return self._connector_state_converter.parse_value(self._cursor_field.extract_value(record))

     def close_partition(self, partition: Partition) -> None:
-        slice_count_before = len(self._concurrent_state.get("slices", []))
+        slice_count_before = len(self.state.get("slices", []))
         self._add_slice_to_state(partition)
         if slice_count_before < len(
-            self._concurrent_state["slices"]
+            self.state["slices"]
         ):  # only emit if at least one slice has been processed
             self._merge_partitions()
             self._emit_state_message()
@@ -258,11 +256,11 @@ class ConcurrentCursor(Cursor):
         )

         if self._slice_boundary_fields:
-            if "slices" not in self._concurrent_state:
+            if "slices" not in self.state:
                 raise RuntimeError(
                     f"The state for stream {self._stream_name} should have at least one slice to delineate the sync start time, but no slices are present. This is unexpected. Please contact Support."
                 )
-            self._concurrent_state["slices"].append(
+            self.state["slices"].append(
                 {
                     self._connector_state_converter.START_KEY: self._extract_from_slice(
                         partition, self._slice_boundary_fields[self._START_BOUNDARY]
@@ -290,7 +288,7 @@ class ConcurrentCursor(Cursor):
                 "expected. Please contact the Airbyte team."
             )

-        self._concurrent_state["slices"].append(
+        self.state["slices"].append(
             {
                 self._connector_state_converter.START_KEY: self.start,
                 self._connector_state_converter.END_KEY: most_recent_cursor_value,
@@ -302,7 +300,9 @@ class ConcurrentCursor(Cursor):
         self._connector_state_manager.update_state_for_stream(
             self._stream_name,
             self._stream_namespace,
-            self.state,
+            self._connector_state_converter.convert_to_state_message(
+                self._cursor_field, self.state
+            ),
         )
         state_message = self._connector_state_manager.create_state_message(
             self._stream_name, self._stream_namespace
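Taken together with the `state` property change above, this hunk moves format conversion to the edge: `state` now returns the internal concurrent-format mapping (with its "slices" list), and `convert_to_state_message` runs only when a state message is actually emitted. A toy illustration of that split, with simplified state shapes assumed:

    # Illustrative only, not CDK code: internal state keeps raw slice
    # boundaries; the emitted message is derived from it at emit time.
    concurrent_state = {"slices": [{"start": "2024-01-01", "end": "2024-02-01"}]}

    def convert_to_state_message(state: dict) -> dict:
        # stand-in for the connector state converter
        return {"updated_at": state["slices"][-1]["end"]}

    emitted = convert_to_state_message(concurrent_state)  # {"updated_at": "2024-02-01"}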
@@ -310,9 +310,7 @@ class ConcurrentCursor(Cursor):
         self._message_repository.emit_message(state_message)

     def _merge_partitions(self) -> None:
-        self._concurrent_state["slices"] = self._connector_state_converter.merge_intervals(
-            self._concurrent_state["slices"]
-        )
+        self.state["slices"] = self._connector_state_converter.merge_intervals(self.state["slices"])

     def _extract_from_slice(self, partition: Partition, key: str) -> CursorValueType:
         try:
@@ -349,42 +347,36 @@ class ConcurrentCursor(Cursor):
         if self._start is not None and self._is_start_before_first_slice():
             yield from self._split_per_slice_range(
                 self._start,
-                self._concurrent_state["slices"][0][self._connector_state_converter.START_KEY],
+                self.state["slices"][0][self._connector_state_converter.START_KEY],
                 False,
             )

-        if len(self._concurrent_state["slices"]) == 1:
+        if len(self.state["slices"]) == 1:
             yield from self._split_per_slice_range(
                 self._calculate_lower_boundary_of_last_slice(
-                    self._concurrent_state["slices"][0][self._connector_state_converter.END_KEY]
+                    self.state["slices"][0][self._connector_state_converter.END_KEY]
                 ),
                 self._end_provider(),
                 True,
             )
-        elif len(self._concurrent_state["slices"]) > 1:
-            for i in range(len(self._concurrent_state["slices"]) - 1):
+        elif len(self.state["slices"]) > 1:
+            for i in range(len(self.state["slices"]) - 1):
                 if self._cursor_granularity:
                     yield from self._split_per_slice_range(
-                        self._concurrent_state["slices"][i][self._connector_state_converter.END_KEY]
+                        self.state["slices"][i][self._connector_state_converter.END_KEY]
                         + self._cursor_granularity,
-                        self._concurrent_state["slices"][i + 1][
-                            self._connector_state_converter.START_KEY
-                        ],
+                        self.state["slices"][i + 1][self._connector_state_converter.START_KEY],
                         False,
                     )
                 else:
                     yield from self._split_per_slice_range(
-                        self._concurrent_state["slices"][i][
-                            self._connector_state_converter.END_KEY
-                        ],
-                        self._concurrent_state["slices"][i + 1][
-                            self._connector_state_converter.START_KEY
-                        ],
+                        self.state["slices"][i][self._connector_state_converter.END_KEY],
+                        self.state["slices"][i + 1][self._connector_state_converter.START_KEY],
                         False,
                     )
             yield from self._split_per_slice_range(
                 self._calculate_lower_boundary_of_last_slice(
-                    self._concurrent_state["slices"][-1][self._connector_state_converter.END_KEY]
+                    self.state["slices"][-1][self._connector_state_converter.END_KEY]
                 ),
                 self._end_provider(),
                 True,
@@ -395,8 +387,7 @@ class ConcurrentCursor(Cursor):
     def _is_start_before_first_slice(self) -> bool:
         return (
             self._start is not None
-            and self._start
-            < self._concurrent_state["slices"][0][self._connector_state_converter.START_KEY]
+            and self._start < self.state["slices"][0][self._connector_state_converter.START_KEY]
         )

     def _calculate_lower_boundary_of_last_slice(
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: airbyte-cdk
-Version: 6.17.1.dev1
+Version: 6.18.0.dev1
 Summary: A framework for writing Airbyte Connectors.
 License: MIT
 Keywords: airbyte,connector-development-kit,cdk
@@ -62,15 +62,15 @@ airbyte_cdk/sources/declarative/checks/check_stream.py,sha256=dAA-UhmMj0WLXCkRQr
 airbyte_cdk/sources/declarative/checks/connection_checker.py,sha256=MBRJo6WJlZQHpIfOGaNOkkHUmgUl_4wDM6VPo41z5Ss,1383
 airbyte_cdk/sources/declarative/concurrency_level/__init__.py,sha256=5XUqrmlstYlMM0j6crktlKQwALek0uiz2D3WdM46MyA,191
 airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py,sha256=YIwCTCpOr_QSNW4ltQK0yUGWInI8PKNY216HOOegYLk,2101
-airbyte_cdk/sources/declarative/concurrent_declarative_source.py,sha256=VfDvff6ionjGScMbEpMGlZ0TfOyIQpMUZiuV6pkI9Os,26557
+airbyte_cdk/sources/declarative/concurrent_declarative_source.py,sha256=tSTCSmyMCu1qoGsne1Ooz3c1da-8EDZk6Suiy2gIq9Q,22475
 airbyte_cdk/sources/declarative/datetime/__init__.py,sha256=l9LG7Qm6e5r_qgqfVKnx3mXYtg1I9MmMjomVIPfU4XA,177
 airbyte_cdk/sources/declarative/datetime/datetime_parser.py,sha256=SX9JjdesN1edN2WVUVMzU_ptqp2QB1OnsnjZ4mwcX7w,2579
 airbyte_cdk/sources/declarative/datetime/min_max_datetime.py,sha256=0BHBtDNQZfvwM45-tY5pNlTcKAFSGGNxemoi0Jic-0E,5785
-airbyte_cdk/sources/declarative/declarative_component_schema.yaml,sha256=GfZlk9EvYQiWDx3AipNLf1us1e986q2mgqcbHbeZU0k,133172
+airbyte_cdk/sources/declarative/declarative_component_schema.yaml,sha256=QDqDyKmkYDDW3fXA8ImE61p4v_sBNQnqnV-uX_qNHNM,133531
 airbyte_cdk/sources/declarative/declarative_source.py,sha256=nF7wBqFd3AQmEKAm4CnIo29CJoQL562cJGSCeL8U8bA,1531
 airbyte_cdk/sources/declarative/declarative_stream.py,sha256=JRyNeOIpsFu4ztVZsN6sncqUEIqIE-bUkD2TPgbMgk0,10375
 airbyte_cdk/sources/declarative/decoders/__init__.py,sha256=edGj4fGxznBk4xzRQyCA1rGfbpqe7z-RE0K3kQQWbgA,858
-airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py,sha256=-aO3ujXX9YTP2ZDvI2BP-x0VOKdAq2TlHo4zG8DCTlY,2748
+airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py,sha256=rSvqdGsVgBT3ZfY_bthjZl_OmxY84iKz8g9GQIWyq8k,3766
 airbyte_cdk/sources/declarative/decoders/decoder.py,sha256=sl-Gt8lXi7yD2Q-sD8je5QS2PbgrgsYjxRLWsay7DMc,826
 airbyte_cdk/sources/declarative/decoders/json_decoder.py,sha256=qdbjeR6RffKaah_iWvMsOcDolYuxJY5DaI3b9AMTZXg,3327
 airbyte_cdk/sources/declarative/decoders/noop_decoder.py,sha256=iZh0yKY_JzgBnJWiubEusf5c0o6Khd-8EWFWT-8EgFo,542
@@ -81,16 +81,15 @@ airbyte_cdk/sources/declarative/extractors/__init__.py,sha256=RmV-IkO1YLj0PSOrrq
 airbyte_cdk/sources/declarative/extractors/dpath_extractor.py,sha256=wR4Ol4MG2lt5UlqXF5EU_k7qa5cN4_-luu3PJ1PlO3A,3131
 airbyte_cdk/sources/declarative/extractors/http_selector.py,sha256=2zWZ4ewTqQC8VwkjS0xD_u350Km3SiYP7hpOOgiLg5o,1169
 airbyte_cdk/sources/declarative/extractors/record_extractor.py,sha256=XJELMjahAsaomlvQgN2zrNO0DJX0G0fr9r682gUz7Pg,691
-airbyte_cdk/sources/declarative/extractors/record_filter.py,sha256=yTdEkyDUSW2KbFkEwJJMlS963C955LgCCOVfTmmScpQ,3367
+airbyte_cdk/sources/declarative/extractors/record_filter.py,sha256=OJ9xmhNWNwwzxYOeIrDy1GINb1zH9MBy6suC5tm2LSk,3545
 airbyte_cdk/sources/declarative/extractors/record_selector.py,sha256=tjNwcURmlyD-TGCScXvW95ThNKyPGcx2SiWbG1-H-sc,6552
 airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py,sha256=LhqGDfX06_dDYLKsIVnwQ_nAWCln-v8PV7Wgt_QVeTI,6533
 airbyte_cdk/sources/declarative/extractors/type_transformer.py,sha256=d6Y2Rfg8pMVEEnHllfVksWZdNVOU55yk34O03dP9muY,1626
-airbyte_cdk/sources/declarative/incremental/__init__.py,sha256=U1oZKtBaEC6IACmvziY9Wzg7Z8EgF4ZuR7NwvjlB_Sk,1255
-airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py,sha256=vU6bcVgjDFou7szl5UKxv2-theKSsV78oSME84-C78A,15043
+airbyte_cdk/sources/declarative/incremental/__init__.py,sha256=huRz3KQJSUFmJCg5GPE9TckEBsB5TMsCa_THhJAhPVI,1037
 airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py,sha256=_UzUnSIUsDbRgbFTXgSyZEFb4ws-KdhdQPWO8mFbV7U,22028
 airbyte_cdk/sources/declarative/incremental/declarative_cursor.py,sha256=5Bhw9VRPyIuCaD0wmmq_L3DZsa-rJgtKSEUzSd8YYD0,536
 airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py,sha256=3_EEZop94bMitZaJd2PF5Q2Xt9v94tYg7p7YJz8tAFc,15869
-airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py,sha256=_FSJjAwL4Zu-i2CngnhTtx8j-NPVSBKj5LwDSPta3Cg,16305
+airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py,sha256=hElcYijbOHjdLKOMA7W7aizEbf22r7OSApXALP875uI,15749
 airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py,sha256=2YBOA2NnwAeIKlIhSwUB_W-FaGnPcmrG_liY7b4mV2Y,8365
 airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py,sha256=10LFv1QPM-agVKl6eaANmEBOfd7gZgBrkoTcMggsieQ,4809
 airbyte_cdk/sources/declarative/interpolation/__init__.py,sha256=tjUJkn3B-iZ-p7RP2c3dVZejrGiQeooGmS5ibWTuUL4,437
@@ -107,12 +106,12 @@ airbyte_cdk/sources/declarative/migrations/__init__.py,sha256=47DEQpj8HBSa-_TImW
 airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py,sha256=iemy3fKLczcU0-Aor7tx5jcT6DRedKMqyK7kCOp01hg,3924
 airbyte_cdk/sources/declarative/migrations/state_migration.py,sha256=KWPjealMLKSMtajXgkdGgKg7EmTLR-CqqD7UIh0-eDU,794
 airbyte_cdk/sources/declarative/models/__init__.py,sha256=nUFxNCiKeYRVXuZEKA7GD-lTHxsiKcQ8FitZjKhPIvE,100
-airbyte_cdk/sources/declarative/models/declarative_component_schema.py,sha256=IZFT1m4d-zp5hQ0ayU06Vdxm6r3MEq-X2sOCo9SuG-k,93270
+airbyte_cdk/sources/declarative/models/declarative_component_schema.py,sha256=SpMwCe-6NZTxICSFIXzwlAnAwNLlC8xS12ncEC1NcbA,93536
 airbyte_cdk/sources/declarative/parsers/__init__.py,sha256=ZnqYNxHsKCgO38IwB34RQyRMXTs4GTvlRi3ImKnIioo,61
 airbyte_cdk/sources/declarative/parsers/custom_exceptions.py,sha256=Rir9_z3Kcd5Es0-LChrzk-0qubAsiK_RSEnLmK2OXm8,553
 airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py,sha256=CXwTfD3wSQq3okcqwigpprbHhSURUokh4GK2OmOyKC8,9132
 airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py,sha256=IWUOdF03o-aQn0Occo1BJCxU0Pz-QILk5L67nzw2thw,6803
-airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py,sha256=lgFqJ8DP-cRizmvFKRd4Oy_ebgoT_AceMKIpuqoFm3c,112097
+airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py,sha256=lmSh2Yp-lgRTbbSw3m6UH8L2nTRjt0w3aiISWHRG6IM,109739
 airbyte_cdk/sources/declarative/partition_routers/__init__.py,sha256=HJ-Syp3p7RpyR_OK0X_a2kSyISfu3W-PKrRI16iY0a8,957
 airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py,sha256=n82J15S8bjeMZ5uROu--P3hnbQoxkY5v7RPHYx7g7ro,2929
 airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py,sha256=c5cuVFM6NFkuQqG8Z5IwkBuwDrvXZN1CunUOM_L0ezg,6892
@@ -163,7 +162,7 @@ airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py,sha256=Aio
 airbyte_cdk/sources/declarative/retrievers/__init__.py,sha256=ix9m1dkR69DcXCXUKC5RK_ZZM7ojTLBQ4IkWQTfmfCk,456
 airbyte_cdk/sources/declarative/retrievers/async_retriever.py,sha256=kX9ltelK2xLIBWDJBK2ucrvVe5tc5xmhdbVbgsjvlxY,3696
 airbyte_cdk/sources/declarative/retrievers/retriever.py,sha256=XPLs593Xv8c5cKMc37XzUAYmzlXd1a7eSsspM-CMuWA,1696
-airbyte_cdk/sources/declarative/retrievers/simple_retriever.py,sha256=kgnhVQxRlFqJs2-rDu2-QH-p-GzQU3nKmSp6_aq8u0s,24550
+airbyte_cdk/sources/declarative/retrievers/simple_retriever.py,sha256=jxQ_9xcVD07r9PKhofitAqMkdX1k8ZNyy50qz5NwkFs,24540
 airbyte_cdk/sources/declarative/schema/__init__.py,sha256=HztgVVaZdil5UfgUZcv_Hyy84r89_EKRwyO2hoewNVg,749
 airbyte_cdk/sources/declarative/schema/default_schema_loader.py,sha256=KTACrIE23a83wsm3Rd9Eb4K6-20lrGqYxTHNp9yxsso,1820
 airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py,sha256=H6A3NQ6kPPM-cUNPmdvDPc9xNzR1rQNrK95GbgCW334,8822
@@ -257,7 +256,7 @@ airbyte_cdk/sources/streams/concurrent/abstract_stream.py,sha256=3OB5VsvOkJmCxIM
 airbyte_cdk/sources/streams/concurrent/abstract_stream_facade.py,sha256=QTry1QCBUwJDw1QSCEvz23s7zIEx_7QMxkPq9j-oPIQ,1358
 airbyte_cdk/sources/streams/concurrent/adapters.py,sha256=QP_64kQo-b3sRNHZA5aqrgCJqAhIVegRM3vJ8jGyuSY,15213
 airbyte_cdk/sources/streams/concurrent/availability_strategy.py,sha256=4La5v2UffSjGnhmF4kwNIKt_g3RXk2ux1mSHA1ejgYM,2898
-airbyte_cdk/sources/streams/concurrent/cursor.py,sha256=suObbNi24so8Wcj0Wm32OkJAcuvODAOwp373YBmUPp0,21213
+airbyte_cdk/sources/streams/concurrent/cursor.py,sha256=Hke6CpD8Sq1FS4g1Xuht39UN7hKkGy1mvOxvQrm1lLM,20810
 airbyte_cdk/sources/streams/concurrent/default_stream.py,sha256=K3rLMpYhS7nnmvwQ52lqBy7DQdFMJpvvT7sgBg_ckA8,3207
 airbyte_cdk/sources/streams/concurrent/exceptions.py,sha256=JOZ446MCLpmF26r9KfS6OO_6rGjcjgJNZdcw6jccjEI,468
 airbyte_cdk/sources/streams/concurrent/helpers.py,sha256=S6AW8TgIASCZ2UuUcQLE8OzgYUHWt2-KPOvNPwnQf-Q,1596
@@ -343,8 +342,8 @@ airbyte_cdk/utils/slice_hasher.py,sha256=-pHexlNYoWYPnXNH-M7HEbjmeJe9Zk7SJijdQ7d
 airbyte_cdk/utils/spec_schema_transformations.py,sha256=-5HTuNsnDBAhj-oLeQXwpTGA0HdcjFOf2zTEMUTTg_Y,816
 airbyte_cdk/utils/stream_status_utils.py,sha256=ZmBoiy5HVbUEHAMrUONxZvxnvfV9CesmQJLDTAIWnWw,1171
 airbyte_cdk/utils/traced_exception.py,sha256=C8uIBuCL_E4WnBAOPSxBicD06JAldoN9fGsQDp463OY,6292
-airbyte_cdk-6.17.1.dev1.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
-airbyte_cdk-6.17.1.dev1.dist-info/METADATA,sha256=8TVLQbLq6-v0qkRHb8X4P9x2sYTe9EUjwdvMb2NVOpA,6005
-airbyte_cdk-6.17.1.dev1.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
-airbyte_cdk-6.17.1.dev1.dist-info/entry_points.txt,sha256=fj-e3PAQvsxsQzyyq8UkG1k8spunWnD4BAH2AwlR6NM,95
-airbyte_cdk-6.17.1.dev1.dist-info/RECORD,,
+airbyte_cdk-6.18.0.dev1.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
+airbyte_cdk-6.18.0.dev1.dist-info/METADATA,sha256=ALXOgvI3pTcF2tNmvbQ9S8fG424n229th_tx1u2uSCo,6005
+airbyte_cdk-6.18.0.dev1.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
+airbyte_cdk-6.18.0.dev1.dist-info/entry_points.txt,sha256=fj-e3PAQvsxsQzyyq8UkG1k8spunWnD4BAH2AwlR6NM,95
+airbyte_cdk-6.18.0.dev1.dist-info/RECORD,,
@@ -1,340 +0,0 @@
-import copy
-import logging
-
-#
-# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
-#
-import threading
-from collections import OrderedDict
-from copy import deepcopy
-from datetime import timedelta
-from typing import Any, Callable, Iterable, Mapping, MutableMapping, Optional
-
-from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
-from airbyte_cdk.sources.declarative.incremental.declarative_cursor import DeclarativeCursor
-from airbyte_cdk.sources.declarative.incremental.global_substream_cursor import (
-    Timer,
-    iterate_with_last_flag_and_state,
-)
-from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter
-from airbyte_cdk.sources.message import MessageRepository
-from airbyte_cdk.sources.streams.checkpoint.per_partition_key_serializer import (
-    PerPartitionKeySerializer,
-)
-from airbyte_cdk.sources.streams.concurrent.cursor import Cursor, CursorField
-from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
-from airbyte_cdk.sources.types import Record, StreamSlice, StreamState
-
-logger = logging.getLogger("airbyte")
-
-
-class ConcurrentCursorFactory:
-    def __init__(self, create_function: Callable[..., Cursor]):
-        self._create_function = create_function
-
-    def create(self, stream_state: Mapping[str, Any], runtime_lookback_window: Any) -> Cursor:
-        return self._create_function(
-            stream_state=stream_state, runtime_lookback_window=runtime_lookback_window
-        )
-
-
-class ConcurrentPerPartitionCursor(Cursor):
-    """
-    Manages state per partition when a stream has many partitions, to prevent data loss or duplication.
-
-    **Partition Limitation and Limit Reached Logic**
-
-    - **DEFAULT_MAX_PARTITIONS_NUMBER**: The maximum number of partitions to keep in memory (default is 10,000).
-    - **_cursor_per_partition**: An ordered dictionary that stores cursors for each partition.
-    - **_over_limit**: A counter that increments each time an oldest partition is removed when the limit is exceeded.
-
-    The class ensures that the number of partitions tracked does not exceed the `DEFAULT_MAX_PARTITIONS_NUMBER` to prevent excessive memory usage.
-
-    - When the number of partitions exceeds the limit, the oldest partitions are removed from `_cursor_per_partition`, and `_over_limit` is incremented accordingly.
-    - The `limit_reached` method returns `True` when `_over_limit` exceeds `DEFAULT_MAX_PARTITIONS_NUMBER`, indicating that the global cursor should be used instead of per-partition cursors.
-
-    This approach avoids unnecessary switching to a global cursor due to temporary spikes in partition counts, ensuring that switching is only done when a sustained high number of partitions is observed.
-    """
-
-    DEFAULT_MAX_PARTITIONS_NUMBER = 10000
-    _NO_STATE: Mapping[str, Any] = {}
-    _NO_CURSOR_STATE: Mapping[str, Any] = {}
-    _KEY = 0
-    _VALUE = 1
-
-    def __init__(
-        self,
-        cursor_factory: ConcurrentCursorFactory,
-        partition_router: PartitionRouter,
-        stream_name: str,
-        stream_namespace: Optional[str],
-        stream_state: Any,
-        message_repository: MessageRepository,
-        connector_state_manager: ConnectorStateManager,
-        cursor_field: CursorField,
-    ) -> None:
-        self._global_cursor: Mapping[str, Any] = {}
-        self._stream_name = stream_name
-        self._stream_namespace = stream_namespace
-        self._message_repository = message_repository
-        self._connector_state_manager = connector_state_manager
-        self._cursor_field = cursor_field
-
-        self._cursor_factory = cursor_factory
-        self._partition_router = partition_router
-
-        # The dict is ordered to ensure that once the maximum number of partitions is reached,
-        # the oldest partitions can be efficiently removed, maintaining the most recent partitions.
-        self._cursor_per_partition: OrderedDict[str, Cursor] = OrderedDict()
-        self._state = {"states": []}
-        self._semaphore_per_partition = OrderedDict()
-        self._finished_partitions = set()
-        self._lock = threading.Lock()
-        self._timer = Timer()
-        self._new_global_cursor = None
-        self._lookback_window = 0
-        self._parent_state = None
-        self._over_limit = 0
-        self._partition_serializer = PerPartitionKeySerializer()
-
-        self._set_initial_state(stream_state)
-
-    @property
-    def cursor_field(self) -> CursorField:
-        return self._cursor_field
-
-    @property
-    def state(self) -> MutableMapping[str, Any]:
-        states = []
-        for partition_tuple, cursor in self._cursor_per_partition.items():
-            if cursor.state:
-                states.append(
-                    {
-                        "partition": self._to_dict(partition_tuple),
-                        "cursor": copy.deepcopy(cursor.state),
-                    }
-                )
-        state: dict[str, Any] = {"states": states}
-
-        if self._global_cursor:
-            state["state"] = self._global_cursor
-        if self._lookback_window is not None:
-            state["lookback_window"] = self._lookback_window
-        if self._parent_state is not None:
-            state["parent_state"] = self._parent_state
-        return state
-
-    def close_partition(self, partition: Partition) -> None:
-        self._cursor_per_partition[
-            self._to_partition_key(partition._stream_slice.partition)
-        ].close_partition(partition=partition)
-        with self._lock:
-            self._semaphore_per_partition[
-                self._to_partition_key(partition._stream_slice.partition)
-            ].acquire()
-            cursor = self._cursor_per_partition[
-                self._to_partition_key(partition._stream_slice.partition)
-            ]
-            if (
-                self._to_partition_key(partition._stream_slice.partition)
-                in self._finished_partitions
-                and self._semaphore_per_partition[
-                    self._to_partition_key(partition._stream_slice.partition)
-                ]._value
-                == 0
-            ):
-                if (
-                    self._new_global_cursor is None
-                    or self._new_global_cursor[self.cursor_field.cursor_field_key]
-                    < cursor.state[self.cursor_field.cursor_field_key]
-                ):
-                    self._new_global_cursor = copy.deepcopy(cursor.state)
-
-    def ensure_at_least_one_state_emitted(self) -> None:
-        """
-        The platform expect to have at least one state message on successful syncs. Hence, whatever happens, we expect this method to be
-        called.
-        """
-        if not any(
-            semaphore_item[1]._value for semaphore_item in self._semaphore_per_partition.items()
-        ):
-            self._global_cursor = self._new_global_cursor
-            self._lookback_window = self._timer.finish()
-            self._parent_state = self._partition_router.get_stream_state()
-        self._emit_state_message()
-
-    def _emit_state_message(self) -> None:
-        self._connector_state_manager.update_state_for_stream(
-            self._stream_name,
-            self._stream_namespace,
-            self.state,
-        )
-        state_message = self._connector_state_manager.create_state_message(
-            self._stream_name, self._stream_namespace
-        )
-        self._message_repository.emit_message(state_message)
-
-    def stream_slices(self) -> Iterable[StreamSlice]:
-        slices = self._partition_router.stream_slices()
-        self._timer.start()
-        for partition in slices:
-            yield from self.generate_slices_from_partition(partition)
-
-    def generate_slices_from_partition(self, partition: StreamSlice) -> Iterable[StreamSlice]:
-        # Ensure the maximum number of partitions is not exceeded
-        self._ensure_partition_limit()
-
-        cursor = self._cursor_per_partition.get(self._to_partition_key(partition.partition))
-        if not cursor:
-            partition_state = self._global_cursor if self._global_cursor else self._NO_CURSOR_STATE
-            cursor = self._create_cursor(partition_state)
-            self._cursor_per_partition[self._to_partition_key(partition.partition)] = cursor
-            self._semaphore_per_partition[self._to_partition_key(partition.partition)] = (
-                threading.Semaphore(0)
-            )
-
-        for cursor_slice, is_last_slice, _ in iterate_with_last_flag_and_state(
-            cursor.stream_slices(),
-            lambda: None,
-        ):
-            self._semaphore_per_partition[self._to_partition_key(partition.partition)].release()
-            if is_last_slice:
-                self._finished_partitions.add(self._to_partition_key(partition.partition))
-            yield StreamSlice(
-                partition=partition, cursor_slice=cursor_slice, extra_fields=partition.extra_fields
-            )
-
-    def _ensure_partition_limit(self) -> None:
-        """
-        Ensure the maximum number of partitions is not exceeded. If so, the oldest added partition will be dropped.
-        """
-        while len(self._cursor_per_partition) > self.DEFAULT_MAX_PARTITIONS_NUMBER - 1:
-            self._over_limit += 1
-            oldest_partition = self._cursor_per_partition.popitem(last=False)[
-                0
-            ]  # Remove the oldest partition
-            logger.warning(
-                f"The maximum number of partitions has been reached. Dropping the oldest partition: {oldest_partition}. Over limit: {self._over_limit}."
-            )
-
-    def limit_reached(self) -> bool:
-        return self._over_limit > self.DEFAULT_MAX_PARTITIONS_NUMBER
-
-    def _set_initial_state(self, stream_state: StreamState) -> None:
-        """
-        Set the initial state for the cursors.
-
-        This method initializes the state for each partition cursor using the provided stream state.
-        If a partition state is provided in the stream state, it will update the corresponding partition cursor with this state.
-
-        Additionally, it sets the parent state for partition routers that are based on parent streams. If a partition router
-        does not have parent streams, this step will be skipped due to the default PartitionRouter implementation.
-
-        Args:
-            stream_state (StreamState): The state of the streams to be set. The format of the stream state should be:
-                {
-                    "states": [
-                        {
-                            "partition": {
-                                "partition_key": "value"
-                            },
-                            "cursor": {
-                                "last_updated": "2023-05-27T00:00:00Z"
-                            }
-                        }
-                    ],
-                    "parent_state": {
-                        "parent_stream_name": {
-                            "last_updated": "2023-05-27T00:00:00Z"
-                        }
-                    }
-                }
-        """
-        if not stream_state:
-            return
-
-        if "states" not in stream_state:
-            # We assume that `stream_state` is in a global format that can be applied to all partitions.
-            # Example: {"global_state_format_key": "global_state_format_value"}
-            self._global_cursor = deepcopy(stream_state)
-            self._new_global_cursor = deepcopy(stream_state)
-
-        else:
-            self._lookback_window = stream_state.get("lookback_window")
-
-            for state in stream_state["states"]:
-                self._cursor_per_partition[self._to_partition_key(state["partition"])] = (
-                    self._create_cursor(
-                        state["cursor"], runtime_lookback_window=self._lookback_window
-                    )
-                )
-                self._semaphore_per_partition[self._to_partition_key(state["partition"])] = (
-                    threading.Semaphore(0)
-                )
-
-            # set default state for missing partitions if it is per partition with fallback to global
-            if "state" in stream_state:
-                self._global_cursor = deepcopy(stream_state["state"])
-                self._new_global_cursor = deepcopy(stream_state["state"])
-
-        # Set parent state for partition routers based on parent streams
-        self._partition_router.set_initial_state(stream_state)
-
-    def observe(self, record: Record) -> None:
-        self._cursor_per_partition[
-            self._to_partition_key(record.associated_slice.partition)
-        ].observe(record)
-
-    def _to_partition_key(self, partition: Mapping[str, Any]) -> str:
-        return self._partition_serializer.to_partition_key(partition)
-
-    def _to_dict(self, partition_key: str) -> Mapping[str, Any]:
-        return self._partition_serializer.to_partition(partition_key)
-
-    def _create_cursor(self, cursor_state: Any, runtime_lookback_window: Any = None) -> Cursor:
-        if runtime_lookback_window:
-            runtime_lookback_window = timedelta(seconds=runtime_lookback_window)
-        cursor = self._cursor_factory.create(
-            stream_state=deepcopy(cursor_state), runtime_lookback_window=runtime_lookback_window
-        )
-        return cursor
-
-    def should_be_synced(self, record: Record) -> bool:
-        return self._get_cursor(record).should_be_synced(record)
-
-    def is_greater_than_or_equal(self, first: Record, second: Record) -> bool:
-        if not first.associated_slice or not second.associated_slice:
-            raise ValueError(
-                f"Both records should have an associated slice but got {first.associated_slice} and {second.associated_slice}"
-            )
-        if first.associated_slice.partition != second.associated_slice.partition:
-            raise ValueError(
-                f"To compare records, partition should be the same but got {first.associated_slice.partition} and {second.associated_slice.partition}"
-            )
-
-        return self._get_cursor(first).is_greater_than_or_equal(
-            self._convert_record_to_cursor_record(first),
-            self._convert_record_to_cursor_record(second),
-        )
-
-    @staticmethod
-    def _convert_record_to_cursor_record(record: Record) -> Record:
-        return Record(
-            record.data,
-            StreamSlice(partition={}, cursor_slice=record.associated_slice.cursor_slice)
-            if record.associated_slice
-            else None,
-        )
-
-    def _get_cursor(self, record: Record) -> Cursor:
-        if not record.associated_slice:
-            raise ValueError(
-                "Invalid state as stream slices that are emitted should refer to an existing cursor"
-            )
-        partition_key = self._to_partition_key(record.associated_slice.partition)
-        if partition_key not in self._cursor_per_partition:
-            raise ValueError(
-                "Invalid state as stream slices that are emitted should refer to an existing cursor"
-            )
-        cursor = self._cursor_per_partition[partition_key]
-        return cursor