airbyte-cdk 6.17.1.dev1__py3-none-any.whl → 6.18.0.dev1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registries.
--- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py
+++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py
@@ -20,9 +20,6 @@ from airbyte_cdk.sources.declarative.extractors.record_filter import (
     ClientSideIncrementalRecordFilterDecorator,
 )
 from airbyte_cdk.sources.declarative.incremental.datetime_based_cursor import DatetimeBasedCursor
-from airbyte_cdk.sources.declarative.incremental.per_partition_with_global import (
-    PerPartitionWithGlobalCursor,
-)
 from airbyte_cdk.sources.declarative.interpolation import InterpolatedString
 from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
@@ -307,72 +304,6 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                         cursor=final_state_cursor,
                     )
                 )
-            elif (
-                incremental_sync_component_definition
-                and incremental_sync_component_definition.get("type", "")
-                == DatetimeBasedCursorModel.__name__
-                and self._stream_supports_concurrent_partition_processing(
-                    declarative_stream=declarative_stream
-                )
-                and hasattr(declarative_stream.retriever, "stream_slicer")
-                and isinstance(
-                    declarative_stream.retriever.stream_slicer, PerPartitionWithGlobalCursor
-                )
-            ):
-                stream_state = state_manager.get_stream_state(
-                    stream_name=declarative_stream.name, namespace=declarative_stream.namespace
-                )
-                partition_router = declarative_stream.retriever.stream_slicer._partition_router
-
-                cursor = self._constructor.create_concurrent_cursor_from_perpartition_cursor(
-                    state_manager=state_manager,
-                    model_type=DatetimeBasedCursorModel,
-                    component_definition=incremental_sync_component_definition,
-                    stream_name=declarative_stream.name,
-                    stream_namespace=declarative_stream.namespace,
-                    config=config or {},
-                    stream_state=stream_state,
-                    partition_router=partition_router,
-                )
-
-                retriever = declarative_stream.retriever
-
-                # This is an optimization so that we don't invoke any cursor or state management flows within the
-                # low-code framework because state management is handled through the ConcurrentCursor.
-                if declarative_stream and isinstance(retriever, SimpleRetriever):
-                    # Also a temporary hack. In the legacy Stream implementation, as part of the read,
-                    # set_initial_state() is called to instantiate incoming state on the cursor. Although we no
-                    # longer rely on the legacy low-code cursor for concurrent checkpointing, low-code components
-                    # like StopConditionPaginationStrategyDecorator and ClientSideIncrementalRecordFilterDecorator
-                    # still rely on a DatetimeBasedCursor that is properly initialized with state.
-                    if retriever.cursor:
-                        retriever.cursor.set_initial_state(stream_state=stream_state)
-                    # We zero it out here, but since this is a cursor reference, the state is still properly
-                    # instantiated for the other components that reference it
-                    retriever.cursor = None
-
-                partition_generator = StreamSlicerPartitionGenerator(
-                    DeclarativePartitionFactory(
-                        declarative_stream.name,
-                        declarative_stream.get_json_schema(),
-                        retriever,
-                        self.message_repository,
-                    ),
-                    cursor,
-                )
-
-                concurrent_streams.append(
-                    DefaultStream(
-                        partition_generator=partition_generator,
-                        name=declarative_stream.name,
-                        json_schema=declarative_stream.get_json_schema(),
-                        availability_strategy=AlwaysAvailableAvailabilityStrategy(),
-                        primary_key=get_primary_key_from_stream(declarative_stream.primary_key),
-                        cursor_field=cursor.cursor_field.cursor_field_key,
-                        logger=self.logger,
-                        cursor=cursor,
-                    )
-                )
             else:
                 synchronous_streams.append(declarative_stream)
         else:
--- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml
+++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml
@@ -678,7 +678,7 @@ definitions:
     properties:
       type:
        type: string
-        enum: [ CustomSchemaNormalization ]
+        enum: [CustomSchemaNormalization]
      class_name:
        title: Class Name
        description: Fully-qualified name of the class that will be implementing the custom normalization. The format is `source_<name>.<package>.<class_name>`.
@@ -2014,6 +2014,20 @@ definitions:
      $parameters:
        type: object
        additionalProperties: true
+  JsonParser:
+    title: JsonParser
+    description: Parser used for parsing str, bytes, or bytearray data and returning data in a dictionary format.
+    type: object
+    additionalProperties: true
+    required:
+      - type
+    properties:
+      type:
+        type: string
+        enum: [JsonParser]
+      encoding:
+        type: string
+        default: utf-8
   ListPartitionRouter:
     title: List Partition Router
     description: A Partition router that specifies a list of attributes where each attribute describes a portion of the complete data set for a stream. During a sync, each value is iterated over and can be used as input to outbound API requests.
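For reference, once a manifest is parsed, a component definition matching the new JsonParser entry above is just a small mapping. A hypothetical sketch in Python (field names come from the schema definition above; the surrounding manifest structure is omitted):

```python
# Illustrative only: a parsed-manifest fragment for the new JsonParser schema
# entry. "encoding" is optional and defaults to utf-8 per the definition above.
json_parser_definition = {
    "type": "JsonParser",
    "encoding": "utf-8",
}
```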
--- a/airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py
+++ b/airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py
@@ -7,9 +7,12 @@ from dataclasses import dataclass
 from io import BufferedIOBase, TextIOWrapper
 from typing import Any, Generator, MutableMapping, Optional
 
+import orjson
 import requests
 
+from airbyte_cdk.models import FailureType
 from airbyte_cdk.sources.declarative.decoders.decoder import Decoder
+from airbyte_cdk.utils import AirbyteTracedException
 
 logger = logging.getLogger("airbyte")
 
@@ -42,6 +45,31 @@ class GzipParser(Parser):
         yield from self.inner_parser.parse(gzipobj)
 
 
+@dataclass
+class JsonParser(Parser):
+    encoding: str = "utf-8"
+
+    def parse(self, data: BufferedIOBase) -> Generator[MutableMapping[str, Any], None, None]:
+        raw_data = data.read()
+        try:
+            body_json = orjson.loads(raw_data.decode(self.encoding))
+        except Exception:
+            try:
+                body_json = json.loads(raw_data.decode(self.encoding))
+            except Exception as exc:
+                raise AirbyteTracedException(
+                    message="Response JSON data failed to be parsed. See logs for more information.",
+                    internal_message=f"Response JSON data failed to be parsed: {exc=}, {raw_data=}",
+                    failure_type=FailureType.system_error,
+                    exception=exc,
+                )
+
+        if isinstance(body_json, list):
+            yield from body_json
+        else:
+            yield from [body_json]
+
+
 @dataclass
 class JsonLineParser(Parser):
     encoding: Optional[str] = "utf-8"
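A minimal usage sketch of the new parser, assuming this dev build is installed; the import path follows this diff, and the sample payloads are made up:

```python
from io import BytesIO

from airbyte_cdk.sources.declarative.decoders.composite_raw_decoder import JsonParser

parser = JsonParser(encoding="utf-8")

# A JSON object is yielded as a single mapping...
assert list(parser.parse(BytesIO(b'{"id": 1}'))) == [{"id": 1}]

# ...while a JSON array is yielded element by element.
assert list(parser.parse(BytesIO(b'[{"id": 1}, {"id": 2}]'))) == [{"id": 1}, {"id": 2}]
```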
--- a/airbyte_cdk/sources/declarative/extractors/record_filter.py
+++ b/airbyte_cdk/sources/declarative/extractors/record_filter.py
@@ -59,11 +59,13 @@ class ClientSideIncrementalRecordFilterDecorator(RecordFilter):
 
     def __init__(
         self,
-        cursor: Union[DatetimeBasedCursor, PerPartitionWithGlobalCursor, GlobalSubstreamCursor],
+        date_time_based_cursor: DatetimeBasedCursor,
+        substream_cursor: Optional[Union[PerPartitionWithGlobalCursor, GlobalSubstreamCursor]],
         **kwargs: Any,
     ):
         super().__init__(**kwargs)
-        self._cursor = cursor
+        self._date_time_based_cursor = date_time_based_cursor
+        self._substream_cursor = substream_cursor
 
     def filter_records(
         self,
@@ -75,7 +77,7 @@ class ClientSideIncrementalRecordFilterDecorator(RecordFilter):
         records = (
             record
             for record in records
-            if self._cursor.should_be_synced(
+            if (self._substream_cursor or self._date_time_based_cursor).should_be_synced(
                 # Record is created on the fly to align with cursors interface; stream name is ignored as we don't need it here
                 # Record stream name is empty because it is not used during the filtering
                 Record(data=record, associated_slice=stream_slice, stream_name="")
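The filter change above reduces to Python's `or` fallback: the substream cursor wins whenever one is configured, otherwise the plain datetime cursor is used. A self-contained sketch with stub classes (not the real CDK types):

```python
class StubCursor:
    """Stand-in for DatetimeBasedCursor / PerPartitionWithGlobalCursor."""

    def __init__(self, name: str) -> None:
        self.name = name

    def should_be_synced(self, record: dict) -> bool:
        return True  # placeholder for real cursor-comparison logic


date_time_based_cursor = StubCursor("datetime")
substream_cursor = None  # set to a StubCursor to see the precedence flip

active = substream_cursor or date_time_based_cursor
print(active.name)  # -> "datetime" when no substream cursor is configured
```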
--- a/airbyte_cdk/sources/declarative/incremental/__init__.py
+++ b/airbyte_cdk/sources/declarative/incremental/__init__.py
@@ -2,10 +2,6 @@
 # Copyright (c) 2022 Airbyte, Inc., all rights reserved.
 #
 
-from airbyte_cdk.sources.declarative.incremental.concurrent_partition_cursor import (
-    ConcurrentCursorFactory,
-    ConcurrentPerPartitionCursor,
-)
 from airbyte_cdk.sources.declarative.incremental.datetime_based_cursor import DatetimeBasedCursor
 from airbyte_cdk.sources.declarative.incremental.declarative_cursor import DeclarativeCursor
 from airbyte_cdk.sources.declarative.incremental.global_substream_cursor import (
@@ -25,8 +21,6 @@ from airbyte_cdk.sources.declarative.incremental.resumable_full_refresh_cursor i
 
 __all__ = [
     "CursorFactory",
-    "ConcurrentCursorFactory",
-    "ConcurrentPerPartitionCursor",
     "DatetimeBasedCursor",
     "DeclarativeCursor",
     "GlobalSubstreamCursor",
--- a/airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py
+++ b/airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py
@@ -303,21 +303,6 @@ class PerPartitionCursor(DeclarativeCursor):
         raise ValueError("A partition needs to be provided in order to get request body json")
 
     def should_be_synced(self, record: Record) -> bool:
-        if (
-            record.associated_slice
-            and self._to_partition_key(record.associated_slice.partition)
-            not in self._cursor_per_partition
-        ):
-            partition_state = (
-                self._state_to_migrate_from
-                if self._state_to_migrate_from
-                else self._NO_CURSOR_STATE
-            )
-            cursor = self._create_cursor(partition_state)
-
-            self._cursor_per_partition[
-                self._to_partition_key(record.associated_slice.partition)
-            ] = cursor
         return self._get_cursor(record).should_be_synced(
             self._convert_record_to_cursor_record(record)
         )
--- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py
+++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py
@@ -737,33 +737,43 @@ class KeysToSnakeCase(BaseModel):
     parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
 
 
+class FlattenFields(BaseModel):
+    type: Literal["FlattenFields"]
+    flatten_lists: Optional[bool] = Field(
+        True,
+        description="Whether to flatten lists or leave it as is. Default is True.",
+        title="Flatten Lists",
+    )
+    parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
+
+
 class KeysReplace(BaseModel):
     type: Literal["KeysReplace"]
     old: str = Field(
         ...,
         description="Old value to replace.",
-        examples=[" ", "{{ record.id }}", "{{ config['id'] }}", "{{ stream_slice['id'] }}"],
+        examples=[
+            " ",
+            "{{ record.id }}",
+            "{{ config['id'] }}",
+            "{{ stream_slice['id'] }}",
+        ],
         title="Old value",
     )
     new: str = Field(
         ...,
         description="New value to set.",
-        examples=["_", "{{ record.id }}", "{{ config['id'] }}", "{{ stream_slice['id'] }}"],
+        examples=[
+            "_",
+            "{{ record.id }}",
+            "{{ config['id'] }}",
+            "{{ stream_slice['id'] }}",
+        ],
         title="New value",
     )
     parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
 
 
-class FlattenFields(BaseModel):
-    type: Literal["FlattenFields"]
-    flatten_lists: Optional[bool] = Field(
-        True,
-        description="Whether to flatten lists or leave it as is. Default is True.",
-        title="Flatten Lists",
-    )
-    parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
-
-
 class IterableDecoder(BaseModel):
     type: Literal["IterableDecoder"]
 
@@ -795,6 +805,14 @@ class GzipJsonDecoder(BaseModel):
     parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
 
 
+class JsonParser(BaseModel):
+    class Config:
+        extra = Extra.allow
+
+    type: Literal["JsonParser"]
+    encoding: Optional[str] = "utf-8"
+
+
 class MinMaxDatetime(BaseModel):
     type: Literal["MinMaxDatetime"]
     datetime: str = Field(
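Assuming this dev build is installed, the generated model behaves as the schema suggests: unknown fields are tolerated via `Config.extra`, and `encoding` defaults to utf-8 when omitted:

```python
from airbyte_cdk.sources.declarative.models.declarative_component_schema import JsonParser

model = JsonParser(type="JsonParser")  # no encoding supplied
print(model.encoding)  # -> "utf-8"
```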
--- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py
+++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py
@@ -72,6 +72,7 @@ from airbyte_cdk.sources.declarative.decoders.composite_raw_decoder import (
     CsvParser,
     GzipParser,
     JsonLineParser,
+    JsonParser,
 )
 from airbyte_cdk.sources.declarative.extractors import (
     DpathExtractor,
@@ -84,8 +85,6 @@ from airbyte_cdk.sources.declarative.extractors.record_filter import (
 )
 from airbyte_cdk.sources.declarative.incremental import (
     ChildPartitionResumableFullRefreshCursor,
-    ConcurrentCursorFactory,
-    ConcurrentPerPartitionCursor,
     CursorFactory,
     DatetimeBasedCursor,
     DeclarativeCursor,
@@ -249,6 +248,9 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     JsonLineParser as JsonLineParserModel,
 )
+from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
+    JsonParser as JsonParserModel,
+)
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     JwtAuthenticator as JwtAuthenticatorModel,
 )
@@ -440,7 +442,6 @@ from airbyte_cdk.sources.message import (
     InMemoryMessageRepository,
     LogAppenderMessageRepositoryDecorator,
     MessageRepository,
-    NoopMessageRepository,
 )
 from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, CursorField
 from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import (
@@ -525,6 +526,7 @@ class ModelToComponentFactory:
             JsonDecoderModel: self.create_json_decoder,
             JsonlDecoderModel: self.create_jsonl_decoder,
             JsonLineParserModel: self.create_json_line_parser,
+            JsonParserModel: self.create_json_parser,
             GzipJsonDecoderModel: self.create_gzipjson_decoder,
             GzipParserModel: self.create_gzip_parser,
             KeysToLowerModel: self.create_keys_to_lower_transformation,
@@ -874,8 +876,6 @@ class ModelToComponentFactory:
         stream_namespace: Optional[str],
         config: Config,
         stream_state: MutableMapping[str, Any],
-        message_repository: Optional[MessageRepository] = None,
-        runtime_lookback_window: Optional[int] = None,
         **kwargs: Any,
     ) -> ConcurrentCursor:
         component_type = component_definition.get("type")
@@ -933,11 +933,6 @@ class ModelToComponentFactory:
         if evaluated_lookback_window:
             lookback_window = parse_duration(evaluated_lookback_window)
 
-        if runtime_lookback_window and lookback_window:
-            lookback_window = max(lookback_window, runtime_lookback_window)
-        elif runtime_lookback_window:
-            lookback_window = runtime_lookback_window
-
         connector_state_converter: DateTimeStreamStateConverter
         connector_state_converter = CustomFormatConcurrentStreamStateConverter(
             datetime_format=datetime_format,
@@ -1016,7 +1011,7 @@ class ModelToComponentFactory:
             stream_name=stream_name,
             stream_namespace=stream_namespace,
             stream_state=stream_state,
-            message_repository=message_repository or self._message_repository,
+            message_repository=self._message_repository,
             connector_state_manager=state_manager,
             connector_state_converter=connector_state_converter,
             cursor_field=cursor_field,
@@ -1028,63 +1023,6 @@ class ModelToComponentFactory:
             cursor_granularity=cursor_granularity,
         )
 
-    def create_concurrent_cursor_from_perpartition_cursor(
-        self,
-        state_manager: ConnectorStateManager,
-        model_type: Type[BaseModel],
-        component_definition: ComponentDefinition,
-        stream_name: str,
-        stream_namespace: Optional[str],
-        config: Config,
-        stream_state: MutableMapping[str, Any],
-        partition_router,
-        **kwargs: Any,
-    ) -> ConcurrentPerPartitionCursor:
-        component_type = component_definition.get("type")
-        if component_definition.get("type") != model_type.__name__:
-            raise ValueError(
-                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
-            )
-
-        datetime_based_cursor_model = model_type.parse_obj(component_definition)
-
-        if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel):
-            raise ValueError(
-                f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}"
-            )
-
-        interpolated_cursor_field = InterpolatedString.create(
-            datetime_based_cursor_model.cursor_field,
-            parameters=datetime_based_cursor_model.parameters or {},
-        )
-        cursor_field = CursorField(interpolated_cursor_field.eval(config=config))
-
-        # Create the cursor factory
-        cursor_factory = ConcurrentCursorFactory(
-            partial(
-                self.create_concurrent_cursor_from_datetime_based_cursor,
-                state_manager=state_manager,
-                model_type=model_type,
-                component_definition=component_definition,
-                stream_name=stream_name,
-                stream_namespace=stream_namespace,
-                config=config,
-                message_repository=NoopMessageRepository(),
-            )
-        )
-
-        # Return the concurrent cursor and state converter
-        return ConcurrentPerPartitionCursor(
-            cursor_factory=cursor_factory,
-            partition_router=partition_router,
-            stream_name=stream_name,
-            stream_namespace=stream_namespace,
-            stream_state=stream_state,
-            message_repository=self._message_repository,  # type: ignore
-            connector_state_manager=state_manager,
-            cursor_field=cursor_field,
-        )
-
     @staticmethod
     def create_constant_backoff_strategy(
         model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any
@@ -1367,15 +1305,18 @@ class ModelToComponentFactory:
                 raise ValueError(
                     "Unsupported Slicer is used. PerPartitionWithGlobalCursor should be used here instead"
                 )
-            cursor = (
-                combined_slicers
-                if isinstance(
-                    combined_slicers, (PerPartitionWithGlobalCursor, GlobalSubstreamCursor)
-                )
-                else self._create_component_from_model(model=model.incremental_sync, config=config)
-            )
-
-            client_side_incremental_sync = {"cursor": cursor}
+            client_side_incremental_sync = {
+                "date_time_based_cursor": self._create_component_from_model(
+                    model=model.incremental_sync, config=config
+                ),
+                "substream_cursor": (
+                    combined_slicers
+                    if isinstance(
+                        combined_slicers, (PerPartitionWithGlobalCursor, GlobalSubstreamCursor)
+                    )
+                    else None
+                ),
+            }
 
         if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel):
             cursor_model = model.incremental_sync
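The kwargs assembled here are later passed into ClientSideIncrementalRecordFilterDecorator, lining up with its new constructor shown earlier in this diff. A hypothetical shape sketch (placeholder values, not factory code):

```python
datetime_cursor = object()  # placeholder for a real DatetimeBasedCursor
client_side_incremental_sync = {
    "date_time_based_cursor": datetime_cursor,
    "substream_cursor": None,  # or a PerPartitionWithGlobalCursor / GlobalSubstreamCursor
}
# ClientSideIncrementalRecordFilterDecorator(**client_side_incremental_sync, ...)
```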
@@ -1812,6 +1753,11 @@ class ModelToComponentFactory:
     def create_json_decoder(model: JsonDecoderModel, config: Config, **kwargs: Any) -> JsonDecoder:
         return JsonDecoder(parameters={})
 
+    @staticmethod
+    def create_json_parser(model: JsonParserModel, config: Config, **kwargs: Any) -> JsonParser:
+        encoding = model.encoding or "utf-8"
+        return JsonParser(encoding=encoding)
+
     @staticmethod
     def create_jsonl_decoder(
         model: JsonlDecoderModel, config: Config, **kwargs: Any
2191
2137
  if (
2192
2138
  not isinstance(stream_slicer, DatetimeBasedCursor)
2193
2139
  or type(stream_slicer) is not DatetimeBasedCursor
2194
- ) and not isinstance(stream_slicer, PerPartitionWithGlobalCursor):
2140
+ ):
2195
2141
  # Many of the custom component implementations of DatetimeBasedCursor override get_request_params() (or other methods).
2196
2142
  # Because we're decoupling RequestOptionsProvider from the Cursor, custom components will eventually need to reimplement
2197
2143
  # their own RequestOptionsProvider. However, right now the existing StreamSlicer/Cursor still can act as the SimpleRetriever's
--- a/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py
+++ b/airbyte_cdk/sources/declarative/retrievers/simple_retriever.py
@@ -160,7 +160,7 @@ class SimpleRetriever(Retriever):
             stream_slice,
             next_page_token,
             self._paginator.get_request_headers,
-            self.request_option_provider.get_request_headers,
+            self.stream_slicer.get_request_headers,
         )
         if isinstance(headers, str):
             raise ValueError("Request headers cannot be a string")
--- a/airbyte_cdk/sources/streams/concurrent/cursor.py
+++ b/airbyte_cdk/sources/streams/concurrent/cursor.py
@@ -196,9 +196,7 @@ class ConcurrentCursor(Cursor):
 
     @property
     def state(self) -> MutableMapping[str, Any]:
-        return self._connector_state_converter.convert_to_state_message(
-            self.cursor_field, self._concurrent_state
-        )
+        return self._concurrent_state
 
     @property
     def cursor_field(self) -> CursorField:
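Reading of this hunk (an interpretation of the diff, not package code): `ConcurrentCursor.state` now exposes the internal concurrent-format state directly, and conversion to the connector state-message format is deferred to `_emit_state_message` (see the `@@ -302,7 +300,9 @@` hunk below):

```python
# Before: cursor.state -> converter-formatted state-message payload.
# After:  cursor.state -> internal concurrent format, roughly
#         {"slices": [{START_KEY: ..., END_KEY: ...}, ...]},
# with convert_to_state_message applied only when the state message is emitted.
```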
@@ -243,10 +241,10 @@ class ConcurrentCursor(Cursor):
         return self._connector_state_converter.parse_value(self._cursor_field.extract_value(record))
 
     def close_partition(self, partition: Partition) -> None:
-        slice_count_before = len(self._concurrent_state.get("slices", []))
+        slice_count_before = len(self.state.get("slices", []))
         self._add_slice_to_state(partition)
         if slice_count_before < len(
-            self._concurrent_state["slices"]
+            self.state["slices"]
         ):  # only emit if at least one slice has been processed
             self._merge_partitions()
             self._emit_state_message()
@@ -258,11 +256,11 @@ class ConcurrentCursor(Cursor):
         )
 
         if self._slice_boundary_fields:
-            if "slices" not in self._concurrent_state:
+            if "slices" not in self.state:
                 raise RuntimeError(
                     f"The state for stream {self._stream_name} should have at least one slice to delineate the sync start time, but no slices are present. This is unexpected. Please contact Support."
                 )
-            self._concurrent_state["slices"].append(
+            self.state["slices"].append(
                 {
                     self._connector_state_converter.START_KEY: self._extract_from_slice(
                         partition, self._slice_boundary_fields[self._START_BOUNDARY]
@@ -290,7 +288,7 @@ class ConcurrentCursor(Cursor):
                 "expected. Please contact the Airbyte team."
             )
 
-        self._concurrent_state["slices"].append(
+        self.state["slices"].append(
             {
                 self._connector_state_converter.START_KEY: self.start,
                 self._connector_state_converter.END_KEY: most_recent_cursor_value,
@@ -302,7 +300,9 @@ class ConcurrentCursor(Cursor):
         self._connector_state_manager.update_state_for_stream(
             self._stream_name,
             self._stream_namespace,
-            self.state,
+            self._connector_state_converter.convert_to_state_message(
+                self._cursor_field, self.state
+            ),
         )
         state_message = self._connector_state_manager.create_state_message(
             self._stream_name, self._stream_namespace
@@ -310,9 +310,7 @@ class ConcurrentCursor(Cursor):
         self._message_repository.emit_message(state_message)
 
     def _merge_partitions(self) -> None:
-        self._concurrent_state["slices"] = self._connector_state_converter.merge_intervals(
-            self._concurrent_state["slices"]
-        )
+        self.state["slices"] = self._connector_state_converter.merge_intervals(self.state["slices"])
 
     def _extract_from_slice(self, partition: Partition, key: str) -> CursorValueType:
         try:
@@ -349,42 +347,36 @@ class ConcurrentCursor(Cursor):
         if self._start is not None and self._is_start_before_first_slice():
             yield from self._split_per_slice_range(
                 self._start,
-                self._concurrent_state["slices"][0][self._connector_state_converter.START_KEY],
+                self.state["slices"][0][self._connector_state_converter.START_KEY],
                 False,
             )
 
-        if len(self._concurrent_state["slices"]) == 1:
+        if len(self.state["slices"]) == 1:
             yield from self._split_per_slice_range(
                 self._calculate_lower_boundary_of_last_slice(
-                    self._concurrent_state["slices"][0][self._connector_state_converter.END_KEY]
+                    self.state["slices"][0][self._connector_state_converter.END_KEY]
                 ),
                 self._end_provider(),
                 True,
             )
-        elif len(self._concurrent_state["slices"]) > 1:
-            for i in range(len(self._concurrent_state["slices"]) - 1):
+        elif len(self.state["slices"]) > 1:
+            for i in range(len(self.state["slices"]) - 1):
                 if self._cursor_granularity:
                     yield from self._split_per_slice_range(
-                        self._concurrent_state["slices"][i][self._connector_state_converter.END_KEY]
+                        self.state["slices"][i][self._connector_state_converter.END_KEY]
                         + self._cursor_granularity,
-                        self._concurrent_state["slices"][i + 1][
-                            self._connector_state_converter.START_KEY
-                        ],
+                        self.state["slices"][i + 1][self._connector_state_converter.START_KEY],
                         False,
                     )
                 else:
                     yield from self._split_per_slice_range(
-                        self._concurrent_state["slices"][i][
-                            self._connector_state_converter.END_KEY
-                        ],
-                        self._concurrent_state["slices"][i + 1][
-                            self._connector_state_converter.START_KEY
-                        ],
+                        self.state["slices"][i][self._connector_state_converter.END_KEY],
+                        self.state["slices"][i + 1][self._connector_state_converter.START_KEY],
                         False,
                     )
             yield from self._split_per_slice_range(
                 self._calculate_lower_boundary_of_last_slice(
-                    self._concurrent_state["slices"][-1][self._connector_state_converter.END_KEY]
+                    self.state["slices"][-1][self._connector_state_converter.END_KEY]
                 ),
                 self._end_provider(),
                 True,
@@ -395,8 +387,7 @@ class ConcurrentCursor(Cursor):
     def _is_start_before_first_slice(self) -> bool:
         return (
             self._start is not None
-            and self._start
-            < self._concurrent_state["slices"][0][self._connector_state_converter.START_KEY]
+            and self._start < self.state["slices"][0][self._connector_state_converter.START_KEY]
         )
 
     def _calculate_lower_boundary_of_last_slice(
--- a/airbyte_cdk-6.17.1.dev1.dist-info/METADATA
+++ b/airbyte_cdk-6.18.0.dev1.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: airbyte-cdk
-Version: 6.17.1.dev1
+Version: 6.18.0.dev1
 Summary: A framework for writing Airbyte Connectors.
 License: MIT
 Keywords: airbyte,connector-development-kit,cdk
--- a/airbyte_cdk-6.17.1.dev1.dist-info/RECORD
+++ b/airbyte_cdk-6.18.0.dev1.dist-info/RECORD
@@ -62,15 +62,15 @@ airbyte_cdk/sources/declarative/checks/check_stream.py,sha256=dAA-UhmMj0WLXCkRQr
 airbyte_cdk/sources/declarative/checks/connection_checker.py,sha256=MBRJo6WJlZQHpIfOGaNOkkHUmgUl_4wDM6VPo41z5Ss,1383
 airbyte_cdk/sources/declarative/concurrency_level/__init__.py,sha256=5XUqrmlstYlMM0j6crktlKQwALek0uiz2D3WdM46MyA,191
 airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py,sha256=YIwCTCpOr_QSNW4ltQK0yUGWInI8PKNY216HOOegYLk,2101
-airbyte_cdk/sources/declarative/concurrent_declarative_source.py,sha256=VfDvff6ionjGScMbEpMGlZ0TfOyIQpMUZiuV6pkI9Os,26557
+airbyte_cdk/sources/declarative/concurrent_declarative_source.py,sha256=tSTCSmyMCu1qoGsne1Ooz3c1da-8EDZk6Suiy2gIq9Q,22475
 airbyte_cdk/sources/declarative/datetime/__init__.py,sha256=l9LG7Qm6e5r_qgqfVKnx3mXYtg1I9MmMjomVIPfU4XA,177
 airbyte_cdk/sources/declarative/datetime/datetime_parser.py,sha256=SX9JjdesN1edN2WVUVMzU_ptqp2QB1OnsnjZ4mwcX7w,2579
 airbyte_cdk/sources/declarative/datetime/min_max_datetime.py,sha256=0BHBtDNQZfvwM45-tY5pNlTcKAFSGGNxemoi0Jic-0E,5785
-airbyte_cdk/sources/declarative/declarative_component_schema.yaml,sha256=GfZlk9EvYQiWDx3AipNLf1us1e986q2mgqcbHbeZU0k,133172
+airbyte_cdk/sources/declarative/declarative_component_schema.yaml,sha256=QDqDyKmkYDDW3fXA8ImE61p4v_sBNQnqnV-uX_qNHNM,133531
 airbyte_cdk/sources/declarative/declarative_source.py,sha256=nF7wBqFd3AQmEKAm4CnIo29CJoQL562cJGSCeL8U8bA,1531
 airbyte_cdk/sources/declarative/declarative_stream.py,sha256=JRyNeOIpsFu4ztVZsN6sncqUEIqIE-bUkD2TPgbMgk0,10375
 airbyte_cdk/sources/declarative/decoders/__init__.py,sha256=edGj4fGxznBk4xzRQyCA1rGfbpqe7z-RE0K3kQQWbgA,858
-airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py,sha256=-aO3ujXX9YTP2ZDvI2BP-x0VOKdAq2TlHo4zG8DCTlY,2748
+airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py,sha256=rSvqdGsVgBT3ZfY_bthjZl_OmxY84iKz8g9GQIWyq8k,3766
 airbyte_cdk/sources/declarative/decoders/decoder.py,sha256=sl-Gt8lXi7yD2Q-sD8je5QS2PbgrgsYjxRLWsay7DMc,826
 airbyte_cdk/sources/declarative/decoders/json_decoder.py,sha256=qdbjeR6RffKaah_iWvMsOcDolYuxJY5DaI3b9AMTZXg,3327
 airbyte_cdk/sources/declarative/decoders/noop_decoder.py,sha256=iZh0yKY_JzgBnJWiubEusf5c0o6Khd-8EWFWT-8EgFo,542
@@ -81,16 +81,15 @@ airbyte_cdk/sources/declarative/extractors/__init__.py,sha256=RmV-IkO1YLj0PSOrrq
 airbyte_cdk/sources/declarative/extractors/dpath_extractor.py,sha256=wR4Ol4MG2lt5UlqXF5EU_k7qa5cN4_-luu3PJ1PlO3A,3131
 airbyte_cdk/sources/declarative/extractors/http_selector.py,sha256=2zWZ4ewTqQC8VwkjS0xD_u350Km3SiYP7hpOOgiLg5o,1169
 airbyte_cdk/sources/declarative/extractors/record_extractor.py,sha256=XJELMjahAsaomlvQgN2zrNO0DJX0G0fr9r682gUz7Pg,691
-airbyte_cdk/sources/declarative/extractors/record_filter.py,sha256=yTdEkyDUSW2KbFkEwJJMlS963C955LgCCOVfTmmScpQ,3367
+airbyte_cdk/sources/declarative/extractors/record_filter.py,sha256=OJ9xmhNWNwwzxYOeIrDy1GINb1zH9MBy6suC5tm2LSk,3545
 airbyte_cdk/sources/declarative/extractors/record_selector.py,sha256=tjNwcURmlyD-TGCScXvW95ThNKyPGcx2SiWbG1-H-sc,6552
 airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py,sha256=LhqGDfX06_dDYLKsIVnwQ_nAWCln-v8PV7Wgt_QVeTI,6533
 airbyte_cdk/sources/declarative/extractors/type_transformer.py,sha256=d6Y2Rfg8pMVEEnHllfVksWZdNVOU55yk34O03dP9muY,1626
-airbyte_cdk/sources/declarative/incremental/__init__.py,sha256=U1oZKtBaEC6IACmvziY9Wzg7Z8EgF4ZuR7NwvjlB_Sk,1255
-airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py,sha256=vU6bcVgjDFou7szl5UKxv2-theKSsV78oSME84-C78A,15043
+airbyte_cdk/sources/declarative/incremental/__init__.py,sha256=huRz3KQJSUFmJCg5GPE9TckEBsB5TMsCa_THhJAhPVI,1037
 airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py,sha256=_UzUnSIUsDbRgbFTXgSyZEFb4ws-KdhdQPWO8mFbV7U,22028
 airbyte_cdk/sources/declarative/incremental/declarative_cursor.py,sha256=5Bhw9VRPyIuCaD0wmmq_L3DZsa-rJgtKSEUzSd8YYD0,536
 airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py,sha256=3_EEZop94bMitZaJd2PF5Q2Xt9v94tYg7p7YJz8tAFc,15869
-airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py,sha256=_FSJjAwL4Zu-i2CngnhTtx8j-NPVSBKj5LwDSPta3Cg,16305
+airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py,sha256=hElcYijbOHjdLKOMA7W7aizEbf22r7OSApXALP875uI,15749
 airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py,sha256=2YBOA2NnwAeIKlIhSwUB_W-FaGnPcmrG_liY7b4mV2Y,8365
 airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py,sha256=10LFv1QPM-agVKl6eaANmEBOfd7gZgBrkoTcMggsieQ,4809
 airbyte_cdk/sources/declarative/interpolation/__init__.py,sha256=tjUJkn3B-iZ-p7RP2c3dVZejrGiQeooGmS5ibWTuUL4,437
@@ -107,12 +106,12 @@ airbyte_cdk/sources/declarative/migrations/__init__.py,sha256=47DEQpj8HBSa-_TImW
 airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py,sha256=iemy3fKLczcU0-Aor7tx5jcT6DRedKMqyK7kCOp01hg,3924
 airbyte_cdk/sources/declarative/migrations/state_migration.py,sha256=KWPjealMLKSMtajXgkdGgKg7EmTLR-CqqD7UIh0-eDU,794
 airbyte_cdk/sources/declarative/models/__init__.py,sha256=nUFxNCiKeYRVXuZEKA7GD-lTHxsiKcQ8FitZjKhPIvE,100
-airbyte_cdk/sources/declarative/models/declarative_component_schema.py,sha256=IZFT1m4d-zp5hQ0ayU06Vdxm6r3MEq-X2sOCo9SuG-k,93270
+airbyte_cdk/sources/declarative/models/declarative_component_schema.py,sha256=SpMwCe-6NZTxICSFIXzwlAnAwNLlC8xS12ncEC1NcbA,93536
 airbyte_cdk/sources/declarative/parsers/__init__.py,sha256=ZnqYNxHsKCgO38IwB34RQyRMXTs4GTvlRi3ImKnIioo,61
 airbyte_cdk/sources/declarative/parsers/custom_exceptions.py,sha256=Rir9_z3Kcd5Es0-LChrzk-0qubAsiK_RSEnLmK2OXm8,553
 airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py,sha256=CXwTfD3wSQq3okcqwigpprbHhSURUokh4GK2OmOyKC8,9132
 airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py,sha256=IWUOdF03o-aQn0Occo1BJCxU0Pz-QILk5L67nzw2thw,6803
-airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py,sha256=lgFqJ8DP-cRizmvFKRd4Oy_ebgoT_AceMKIpuqoFm3c,112097
+airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py,sha256=lmSh2Yp-lgRTbbSw3m6UH8L2nTRjt0w3aiISWHRG6IM,109739
 airbyte_cdk/sources/declarative/partition_routers/__init__.py,sha256=HJ-Syp3p7RpyR_OK0X_a2kSyISfu3W-PKrRI16iY0a8,957
 airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py,sha256=n82J15S8bjeMZ5uROu--P3hnbQoxkY5v7RPHYx7g7ro,2929
 airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py,sha256=c5cuVFM6NFkuQqG8Z5IwkBuwDrvXZN1CunUOM_L0ezg,6892
@@ -163,7 +162,7 @@ airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py,sha256=Aio
 airbyte_cdk/sources/declarative/retrievers/__init__.py,sha256=ix9m1dkR69DcXCXUKC5RK_ZZM7ojTLBQ4IkWQTfmfCk,456
 airbyte_cdk/sources/declarative/retrievers/async_retriever.py,sha256=kX9ltelK2xLIBWDJBK2ucrvVe5tc5xmhdbVbgsjvlxY,3696
 airbyte_cdk/sources/declarative/retrievers/retriever.py,sha256=XPLs593Xv8c5cKMc37XzUAYmzlXd1a7eSsspM-CMuWA,1696
-airbyte_cdk/sources/declarative/retrievers/simple_retriever.py,sha256=kgnhVQxRlFqJs2-rDu2-QH-p-GzQU3nKmSp6_aq8u0s,24550
+airbyte_cdk/sources/declarative/retrievers/simple_retriever.py,sha256=jxQ_9xcVD07r9PKhofitAqMkdX1k8ZNyy50qz5NwkFs,24540
 airbyte_cdk/sources/declarative/schema/__init__.py,sha256=HztgVVaZdil5UfgUZcv_Hyy84r89_EKRwyO2hoewNVg,749
 airbyte_cdk/sources/declarative/schema/default_schema_loader.py,sha256=KTACrIE23a83wsm3Rd9Eb4K6-20lrGqYxTHNp9yxsso,1820
 airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py,sha256=H6A3NQ6kPPM-cUNPmdvDPc9xNzR1rQNrK95GbgCW334,8822
@@ -257,7 +256,7 @@ airbyte_cdk/sources/streams/concurrent/abstract_stream.py,sha256=3OB5VsvOkJmCxIM
 airbyte_cdk/sources/streams/concurrent/abstract_stream_facade.py,sha256=QTry1QCBUwJDw1QSCEvz23s7zIEx_7QMxkPq9j-oPIQ,1358
 airbyte_cdk/sources/streams/concurrent/adapters.py,sha256=QP_64kQo-b3sRNHZA5aqrgCJqAhIVegRM3vJ8jGyuSY,15213
 airbyte_cdk/sources/streams/concurrent/availability_strategy.py,sha256=4La5v2UffSjGnhmF4kwNIKt_g3RXk2ux1mSHA1ejgYM,2898
-airbyte_cdk/sources/streams/concurrent/cursor.py,sha256=suObbNi24so8Wcj0Wm32OkJAcuvODAOwp373YBmUPp0,21213
+airbyte_cdk/sources/streams/concurrent/cursor.py,sha256=Hke6CpD8Sq1FS4g1Xuht39UN7hKkGy1mvOxvQrm1lLM,20810
 airbyte_cdk/sources/streams/concurrent/default_stream.py,sha256=K3rLMpYhS7nnmvwQ52lqBy7DQdFMJpvvT7sgBg_ckA8,3207
 airbyte_cdk/sources/streams/concurrent/exceptions.py,sha256=JOZ446MCLpmF26r9KfS6OO_6rGjcjgJNZdcw6jccjEI,468
 airbyte_cdk/sources/streams/concurrent/helpers.py,sha256=S6AW8TgIASCZ2UuUcQLE8OzgYUHWt2-KPOvNPwnQf-Q,1596
@@ -343,8 +342,8 @@ airbyte_cdk/utils/slice_hasher.py,sha256=-pHexlNYoWYPnXNH-M7HEbjmeJe9Zk7SJijdQ7d
 airbyte_cdk/utils/spec_schema_transformations.py,sha256=-5HTuNsnDBAhj-oLeQXwpTGA0HdcjFOf2zTEMUTTg_Y,816
 airbyte_cdk/utils/stream_status_utils.py,sha256=ZmBoiy5HVbUEHAMrUONxZvxnvfV9CesmQJLDTAIWnWw,1171
 airbyte_cdk/utils/traced_exception.py,sha256=C8uIBuCL_E4WnBAOPSxBicD06JAldoN9fGsQDp463OY,6292
-airbyte_cdk-6.17.1.dev1.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
-airbyte_cdk-6.17.1.dev1.dist-info/METADATA,sha256=8TVLQbLq6-v0qkRHb8X4P9x2sYTe9EUjwdvMb2NVOpA,6005
-airbyte_cdk-6.17.1.dev1.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
-airbyte_cdk-6.17.1.dev1.dist-info/entry_points.txt,sha256=fj-e3PAQvsxsQzyyq8UkG1k8spunWnD4BAH2AwlR6NM,95
-airbyte_cdk-6.17.1.dev1.dist-info/RECORD,,
+airbyte_cdk-6.18.0.dev1.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
+airbyte_cdk-6.18.0.dev1.dist-info/METADATA,sha256=ALXOgvI3pTcF2tNmvbQ9S8fG424n229th_tx1u2uSCo,6005
+airbyte_cdk-6.18.0.dev1.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
+airbyte_cdk-6.18.0.dev1.dist-info/entry_points.txt,sha256=fj-e3PAQvsxsQzyyq8UkG1k8spunWnD4BAH2AwlR6NM,95
+airbyte_cdk-6.18.0.dev1.dist-info/RECORD,,
--- a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py
+++ /dev/null
@@ -1,340 +0,0 @@
-import copy
-import logging
-
-#
-# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
-#
-import threading
-from collections import OrderedDict
-from copy import deepcopy
-from datetime import timedelta
-from typing import Any, Callable, Iterable, Mapping, MutableMapping, Optional
-
-from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
-from airbyte_cdk.sources.declarative.incremental.declarative_cursor import DeclarativeCursor
-from airbyte_cdk.sources.declarative.incremental.global_substream_cursor import (
-    Timer,
-    iterate_with_last_flag_and_state,
-)
-from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter
-from airbyte_cdk.sources.message import MessageRepository
-from airbyte_cdk.sources.streams.checkpoint.per_partition_key_serializer import (
-    PerPartitionKeySerializer,
-)
-from airbyte_cdk.sources.streams.concurrent.cursor import Cursor, CursorField
-from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
-from airbyte_cdk.sources.types import Record, StreamSlice, StreamState
-
-logger = logging.getLogger("airbyte")
-
-
-class ConcurrentCursorFactory:
-    def __init__(self, create_function: Callable[..., Cursor]):
-        self._create_function = create_function
-
-    def create(self, stream_state: Mapping[str, Any], runtime_lookback_window: Any) -> Cursor:
-        return self._create_function(
-            stream_state=stream_state, runtime_lookback_window=runtime_lookback_window
-        )
-
-
-class ConcurrentPerPartitionCursor(Cursor):
-    """
-    Manages state per partition when a stream has many partitions, to prevent data loss or duplication.
-
-    **Partition Limitation and Limit Reached Logic**
-
-    - **DEFAULT_MAX_PARTITIONS_NUMBER**: The maximum number of partitions to keep in memory (default is 10,000).
-    - **_cursor_per_partition**: An ordered dictionary that stores cursors for each partition.
-    - **_over_limit**: A counter that increments each time an oldest partition is removed when the limit is exceeded.
-
-    The class ensures that the number of partitions tracked does not exceed the `DEFAULT_MAX_PARTITIONS_NUMBER` to prevent excessive memory usage.
-
-    - When the number of partitions exceeds the limit, the oldest partitions are removed from `_cursor_per_partition`, and `_over_limit` is incremented accordingly.
-    - The `limit_reached` method returns `True` when `_over_limit` exceeds `DEFAULT_MAX_PARTITIONS_NUMBER`, indicating that the global cursor should be used instead of per-partition cursors.
-
-    This approach avoids unnecessary switching to a global cursor due to temporary spikes in partition counts, ensuring that switching is only done when a sustained high number of partitions is observed.
-    """
-
-    DEFAULT_MAX_PARTITIONS_NUMBER = 10000
-    _NO_STATE: Mapping[str, Any] = {}
-    _NO_CURSOR_STATE: Mapping[str, Any] = {}
-    _KEY = 0
-    _VALUE = 1
-
-    def __init__(
-        self,
-        cursor_factory: ConcurrentCursorFactory,
-        partition_router: PartitionRouter,
-        stream_name: str,
-        stream_namespace: Optional[str],
-        stream_state: Any,
-        message_repository: MessageRepository,
-        connector_state_manager: ConnectorStateManager,
-        cursor_field: CursorField,
-    ) -> None:
-        self._global_cursor: Mapping[str, Any] = {}
-        self._stream_name = stream_name
-        self._stream_namespace = stream_namespace
-        self._message_repository = message_repository
-        self._connector_state_manager = connector_state_manager
-        self._cursor_field = cursor_field
-
-        self._cursor_factory = cursor_factory
-        self._partition_router = partition_router
-
-        # The dict is ordered to ensure that once the maximum number of partitions is reached,
-        # the oldest partitions can be efficiently removed, maintaining the most recent partitions.
-        self._cursor_per_partition: OrderedDict[str, Cursor] = OrderedDict()
-        self._state = {"states": []}
-        self._semaphore_per_partition = OrderedDict()
-        self._finished_partitions = set()
-        self._lock = threading.Lock()
-        self._timer = Timer()
-        self._new_global_cursor = None
-        self._lookback_window = 0
-        self._parent_state = None
-        self._over_limit = 0
-        self._partition_serializer = PerPartitionKeySerializer()
-
-        self._set_initial_state(stream_state)
-
-    @property
-    def cursor_field(self) -> CursorField:
-        return self._cursor_field
-
-    @property
-    def state(self) -> MutableMapping[str, Any]:
-        states = []
-        for partition_tuple, cursor in self._cursor_per_partition.items():
-            if cursor.state:
-                states.append(
-                    {
-                        "partition": self._to_dict(partition_tuple),
-                        "cursor": copy.deepcopy(cursor.state),
-                    }
-                )
-        state: dict[str, Any] = {"states": states}
-
-        if self._global_cursor:
-            state["state"] = self._global_cursor
-        if self._lookback_window is not None:
-            state["lookback_window"] = self._lookback_window
-        if self._parent_state is not None:
-            state["parent_state"] = self._parent_state
-        return state
-
-    def close_partition(self, partition: Partition) -> None:
-        self._cursor_per_partition[
-            self._to_partition_key(partition._stream_slice.partition)
-        ].close_partition(partition=partition)
-        with self._lock:
-            self._semaphore_per_partition[
-                self._to_partition_key(partition._stream_slice.partition)
-            ].acquire()
-            cursor = self._cursor_per_partition[
-                self._to_partition_key(partition._stream_slice.partition)
-            ]
-            if (
-                self._to_partition_key(partition._stream_slice.partition)
-                in self._finished_partitions
-                and self._semaphore_per_partition[
-                    self._to_partition_key(partition._stream_slice.partition)
-                ]._value
-                == 0
-            ):
-                if (
-                    self._new_global_cursor is None
-                    or self._new_global_cursor[self.cursor_field.cursor_field_key]
-                    < cursor.state[self.cursor_field.cursor_field_key]
-                ):
-                    self._new_global_cursor = copy.deepcopy(cursor.state)
-
-    def ensure_at_least_one_state_emitted(self) -> None:
-        """
-        The platform expect to have at least one state message on successful syncs. Hence, whatever happens, we expect this method to be
-        called.
-        """
-        if not any(
-            semaphore_item[1]._value for semaphore_item in self._semaphore_per_partition.items()
-        ):
-            self._global_cursor = self._new_global_cursor
-            self._lookback_window = self._timer.finish()
-            self._parent_state = self._partition_router.get_stream_state()
-        self._emit_state_message()
-
-    def _emit_state_message(self) -> None:
-        self._connector_state_manager.update_state_for_stream(
-            self._stream_name,
-            self._stream_namespace,
-            self.state,
-        )
-        state_message = self._connector_state_manager.create_state_message(
-            self._stream_name, self._stream_namespace
-        )
-        self._message_repository.emit_message(state_message)
-
-    def stream_slices(self) -> Iterable[StreamSlice]:
-        slices = self._partition_router.stream_slices()
-        self._timer.start()
-        for partition in slices:
-            yield from self.generate_slices_from_partition(partition)
-
-    def generate_slices_from_partition(self, partition: StreamSlice) -> Iterable[StreamSlice]:
-        # Ensure the maximum number of partitions is not exceeded
-        self._ensure_partition_limit()
-
-        cursor = self._cursor_per_partition.get(self._to_partition_key(partition.partition))
-        if not cursor:
-            partition_state = self._global_cursor if self._global_cursor else self._NO_CURSOR_STATE
-            cursor = self._create_cursor(partition_state)
-            self._cursor_per_partition[self._to_partition_key(partition.partition)] = cursor
-            self._semaphore_per_partition[self._to_partition_key(partition.partition)] = (
-                threading.Semaphore(0)
-            )
-
-        for cursor_slice, is_last_slice, _ in iterate_with_last_flag_and_state(
-            cursor.stream_slices(),
-            lambda: None,
-        ):
-            self._semaphore_per_partition[self._to_partition_key(partition.partition)].release()
-            if is_last_slice:
-                self._finished_partitions.add(self._to_partition_key(partition.partition))
-            yield StreamSlice(
-                partition=partition, cursor_slice=cursor_slice, extra_fields=partition.extra_fields
-            )
-
-    def _ensure_partition_limit(self) -> None:
-        """
-        Ensure the maximum number of partitions is not exceeded. If so, the oldest added partition will be dropped.
-        """
-        while len(self._cursor_per_partition) > self.DEFAULT_MAX_PARTITIONS_NUMBER - 1:
-            self._over_limit += 1
-            oldest_partition = self._cursor_per_partition.popitem(last=False)[
-                0
-            ]  # Remove the oldest partition
-            logger.warning(
-                f"The maximum number of partitions has been reached. Dropping the oldest partition: {oldest_partition}. Over limit: {self._over_limit}."
-            )
-
-    def limit_reached(self) -> bool:
-        return self._over_limit > self.DEFAULT_MAX_PARTITIONS_NUMBER
-
-    def _set_initial_state(self, stream_state: StreamState) -> None:
-        """
-        Set the initial state for the cursors.
-
-        This method initializes the state for each partition cursor using the provided stream state.
-        If a partition state is provided in the stream state, it will update the corresponding partition cursor with this state.
-
-        Additionally, it sets the parent state for partition routers that are based on parent streams. If a partition router
-        does not have parent streams, this step will be skipped due to the default PartitionRouter implementation.
-
-        Args:
-            stream_state (StreamState): The state of the streams to be set. The format of the stream state should be:
-                {
-                    "states": [
-                        {
-                            "partition": {
-                                "partition_key": "value"
-                            },
-                            "cursor": {
-                                "last_updated": "2023-05-27T00:00:00Z"
-                            }
-                        }
-                    ],
-                    "parent_state": {
-                        "parent_stream_name": {
-                            "last_updated": "2023-05-27T00:00:00Z"
-                        }
-                    }
-                }
-        """
-        if not stream_state:
-            return
-
-        if "states" not in stream_state:
-            # We assume that `stream_state` is in a global format that can be applied to all partitions.
-            # Example: {"global_state_format_key": "global_state_format_value"}
-            self._global_cursor = deepcopy(stream_state)
-            self._new_global_cursor = deepcopy(stream_state)
-
-        else:
-            self._lookback_window = stream_state.get("lookback_window")
-
-            for state in stream_state["states"]:
-                self._cursor_per_partition[self._to_partition_key(state["partition"])] = (
-                    self._create_cursor(
-                        state["cursor"], runtime_lookback_window=self._lookback_window
-                    )
-                )
-                self._semaphore_per_partition[self._to_partition_key(state["partition"])] = (
-                    threading.Semaphore(0)
-                )
-
-            # set default state for missing partitions if it is per partition with fallback to global
-            if "state" in stream_state:
-                self._global_cursor = deepcopy(stream_state["state"])
-                self._new_global_cursor = deepcopy(stream_state["state"])
-
-        # Set parent state for partition routers based on parent streams
-        self._partition_router.set_initial_state(stream_state)
-
-    def observe(self, record: Record) -> None:
-        self._cursor_per_partition[
-            self._to_partition_key(record.associated_slice.partition)
-        ].observe(record)
-
-    def _to_partition_key(self, partition: Mapping[str, Any]) -> str:
-        return self._partition_serializer.to_partition_key(partition)
-
-    def _to_dict(self, partition_key: str) -> Mapping[str, Any]:
-        return self._partition_serializer.to_partition(partition_key)
-
-    def _create_cursor(self, cursor_state: Any, runtime_lookback_window: Any = None) -> Cursor:
-        if runtime_lookback_window:
-            runtime_lookback_window = timedelta(seconds=runtime_lookback_window)
-        cursor = self._cursor_factory.create(
-            stream_state=deepcopy(cursor_state), runtime_lookback_window=runtime_lookback_window
-        )
-        return cursor
-
-    def should_be_synced(self, record: Record) -> bool:
-        return self._get_cursor(record).should_be_synced(record)
-
-    def is_greater_than_or_equal(self, first: Record, second: Record) -> bool:
-        if not first.associated_slice or not second.associated_slice:
-            raise ValueError(
-                f"Both records should have an associated slice but got {first.associated_slice} and {second.associated_slice}"
-            )
-        if first.associated_slice.partition != second.associated_slice.partition:
-            raise ValueError(
-                f"To compare records, partition should be the same but got {first.associated_slice.partition} and {second.associated_slice.partition}"
-            )
-
-        return self._get_cursor(first).is_greater_than_or_equal(
-            self._convert_record_to_cursor_record(first),
-            self._convert_record_to_cursor_record(second),
-        )
-
-    @staticmethod
-    def _convert_record_to_cursor_record(record: Record) -> Record:
-        return Record(
-            record.data,
-            StreamSlice(partition={}, cursor_slice=record.associated_slice.cursor_slice)
-            if record.associated_slice
-            else None,
-        )
-
-    def _get_cursor(self, record: Record) -> Cursor:
-        if not record.associated_slice:
-            raise ValueError(
-                "Invalid state as stream slices that are emitted should refer to an existing cursor"
-            )
-        partition_key = self._to_partition_key(record.associated_slice.partition)
-        if partition_key not in self._cursor_per_partition:
-            raise ValueError(
-                "Invalid state as stream slices that are emitted should refer to an existing cursor"
-            )
-        cursor = self._cursor_per_partition[partition_key]
-        return cursor