airbyte-cdk 6.8.1rc10__py3-none-any.whl → 6.8.2.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (22)
  1. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +87 -25
  2. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +2 -100
  3. airbyte_cdk/sources/declarative/extractors/record_filter.py +3 -5
  4. airbyte_cdk/sources/declarative/incremental/__init__.py +3 -0
  5. airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +270 -0
  6. airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +9 -0
  7. airbyte_cdk/sources/declarative/manifest_declarative_source.py +2 -53
  8. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +2 -95
  9. airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +0 -6
  10. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +87 -106
  11. airbyte_cdk/sources/declarative/partition_routers/__init__.py +1 -2
  12. airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +1 -1
  13. airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +0 -1
  14. airbyte_cdk/sources/streams/concurrent/cursor.py +9 -0
  15. {airbyte_cdk-6.8.1rc10.dist-info → airbyte_cdk-6.8.2.dev1.dist-info}/METADATA +1 -1
  16. {airbyte_cdk-6.8.1rc10.dist-info → airbyte_cdk-6.8.2.dev1.dist-info}/RECORD +19 -21
  17. airbyte_cdk/sources/declarative/resolvers/__init__.py +0 -13
  18. airbyte_cdk/sources/declarative/resolvers/components_resolver.py +0 -55
  19. airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +0 -106
  20. {airbyte_cdk-6.8.1rc10.dist-info → airbyte_cdk-6.8.2.dev1.dist-info}/LICENSE.txt +0 -0
  21. {airbyte_cdk-6.8.1rc10.dist-info → airbyte_cdk-6.8.2.dev1.dist-info}/WHEEL +0 -0
  22. {airbyte_cdk-6.8.1rc10.dist-info → airbyte_cdk-6.8.2.dev1.dist-info}/entry_points.txt +0 -0
@@ -20,6 +20,9 @@ from airbyte_cdk.sources.declarative.extractors.record_filter import (
     ClientSideIncrementalRecordFilterDecorator,
 )
 from airbyte_cdk.sources.declarative.incremental.datetime_based_cursor import DatetimeBasedCursor
+from airbyte_cdk.sources.declarative.incremental.per_partition_with_global import (
+    PerPartitionWithGlobalCursor,
+)
 from airbyte_cdk.sources.declarative.interpolation import InterpolatedString
 from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
@@ -86,10 +89,23 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
             component_factory=component_factory,
         )
 
-        # todo: We could remove state from initialization. Now that streams are grouped during the read(), a source
-        # no longer needs to store the original incoming state. But maybe there's an edge case?
         self._state = state
 
+        self._concurrent_streams: Optional[List[AbstractStream]]
+        self._synchronous_streams: Optional[List[Stream]]
+
+        # If the connector command was SPEC, there is no incoming config, and we cannot instantiate streams because
+        # they might depend on it. Ideally we want to have a static method on this class to get the spec without
+        # any other arguments, but the existing entrypoint.py isn't designed to support this. Just noting this
+        # for our future improvements to the CDK.
+        if config:
+            self._concurrent_streams, self._synchronous_streams = self._group_streams(
+                config=config or {}
+            )
+        else:
+            self._concurrent_streams = None
+            self._synchronous_streams = None
+
         concurrency_level_from_manifest = self._source_config.get("concurrency_level")
         if concurrency_level_from_manifest:
             concurrency_level_component = self._constructor.create_component(
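
For illustration only (not part of the package diff): the constructor change above defers stream grouping until a config is available, since a SPEC invocation carries no config. A minimal, self-contained sketch of that gating pattern, with hypothetical names standing in for the CDK internals:

from typing import Any, List, Mapping, Optional, Tuple


class HypotheticalSource:
    """Sketch only: mirrors the config-gating pattern, not the real ConcurrentDeclarativeSource."""

    def __init__(self, config: Optional[Mapping[str, Any]] = None) -> None:
        self._concurrent_streams: Optional[List[str]]
        self._synchronous_streams: Optional[List[str]]
        if config:
            # A config is present (CHECK/DISCOVER/READ), so streams can be built eagerly.
            self._concurrent_streams, self._synchronous_streams = self._group_streams(config)
        else:
            # SPEC has no config; stream construction is deferred.
            self._concurrent_streams = None
            self._synchronous_streams = None

    def _group_streams(self, config: Mapping[str, Any]) -> Tuple[List[str], List[str]]:
        # Placeholder grouping: the real logic inspects cursors and retrievers.
        names: List[str] = list(config.get("streams", []))
        return names[: len(names) // 2], names[len(names) // 2 :]
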
@@ -123,20 +139,17 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
         logger: logging.Logger,
         config: Mapping[str, Any],
         catalog: ConfiguredAirbyteCatalog,
-        state: Optional[List[AirbyteStateMessage]] = None,
+        state: Optional[Union[List[AirbyteStateMessage]]] = None,
     ) -> Iterator[AirbyteMessage]:
-        concurrent_streams, _ = self._group_streams(config=config)
-
-        # ConcurrentReadProcessor pops streams that are finished being read so before syncing, the names of
-        # the concurrent streams must be saved so that they can be removed from the catalog before starting
-        # synchronous streams
-        if len(concurrent_streams) > 0:
+        # ConcurrentReadProcessor pops streams that are finished being read so before syncing, the names of the concurrent
+        # streams must be saved so that they can be removed from the catalog before starting synchronous streams
+        if self._concurrent_streams:
            concurrent_stream_names = set(
-                [concurrent_stream.name for concurrent_stream in concurrent_streams]
+                [concurrent_stream.name for concurrent_stream in self._concurrent_streams]
            )
 
            selected_concurrent_streams = self._select_streams(
-                streams=concurrent_streams, configured_catalog=catalog
+                streams=self._concurrent_streams, configured_catalog=catalog
            )
            # It would appear that passing in an empty set of streams causes an infinite loop in ConcurrentReadProcessor.
            # This is also evident in concurrent_source_adapter.py so I'll leave this out of scope to fix for now
@@ -155,7 +168,8 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
         yield from super().read(logger, config, filtered_catalog, state)
 
     def discover(self, logger: logging.Logger, config: Mapping[str, Any]) -> AirbyteCatalog:
-        concurrent_streams, synchronous_streams = self._group_streams(config=config)
+        concurrent_streams = self._concurrent_streams or []
+        synchronous_streams = self._synchronous_streams or []
         return AirbyteCatalog(
             streams=[
                 stream.as_airbyte_stream() for stream in concurrent_streams + synchronous_streams
@@ -181,13 +195,9 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
 
         state_manager = ConnectorStateManager(state=self._state)  # type: ignore # state is always in the form of List[AirbyteStateMessage]. The ConnectorStateManager should use generics, but this can be done later
 
-        # Combine streams and dynamic_streams. Note: both cannot be empty at the same time,
-        # and this is validated during the initialization of the source.
-        streams = self._stream_configs(self._source_config) + self._dynamic_stream_configs(
-            self._source_config, config
-        )
-
-        name_to_stream_mapping = {stream["name"]: stream for stream in streams}
+        name_to_stream_mapping = {
+            stream["name"]: stream for stream in self.resolved_manifest["streams"]
+        }
 
         for declarative_stream in self.streams(config=config):
             # Some low-code sources use a combination of DeclarativeStream and regular Python streams. We can't inspect
@@ -195,7 +205,7 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
             # so we need to treat them as synchronous
             if (
                 isinstance(declarative_stream, DeclarativeStream)
-                and name_to_stream_mapping[declarative_stream.name]["retriever"]["type"]
+                and name_to_stream_mapping[declarative_stream.name].get("retriever")["type"]
                 == "SimpleRetriever"
             ):
                 incremental_sync_component_definition = name_to_stream_mapping[
@@ -204,7 +214,7 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
 
                 partition_router_component_definition = (
                     name_to_stream_mapping[declarative_stream.name]
-                    .get("retriever", {})
+                    .get("retriever")
                     .get("partition_router")
                 )
                 is_without_partition_router_or_cursor = not bool(
@@ -226,7 +236,7 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                     cursor = self._constructor.create_concurrent_cursor_from_datetime_based_cursor(
                         state_manager=state_manager,
                         model_type=DatetimeBasedCursorModel,
-                        component_definition=incremental_sync_component_definition,  # type: ignore # Not None because of the if condition above
+                        component_definition=incremental_sync_component_definition,
                         stream_name=declarative_stream.name,
                         stream_namespace=declarative_stream.namespace,
                         config=config or {},
@@ -299,6 +309,59 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                             cursor=final_state_cursor,
                         )
                     )
+                elif (
+                    incremental_sync_component_definition
+                    and incremental_sync_component_definition.get("type", "")
+                    == DatetimeBasedCursorModel.__name__
+                    and self._stream_supports_concurrent_partition_processing(
+                        declarative_stream=declarative_stream
+                    )
+                    and hasattr(declarative_stream.retriever, "stream_slicer")
+                    and isinstance(declarative_stream.retriever.stream_slicer, PerPartitionWithGlobalCursor)
+                ):
+                    stream_state = state_manager.get_stream_state(
+                        stream_name=declarative_stream.name, namespace=declarative_stream.namespace
+                    )
+                    partition_router = declarative_stream.retriever.stream_slicer._partition_router
+
+                    cursor = self._constructor.create_concurrent_cursor_from_perpartition_cursor(
+                        state_manager=state_manager,
+                        model_type=DatetimeBasedCursorModel,
+                        component_definition=incremental_sync_component_definition,
+                        stream_name=declarative_stream.name,
+                        stream_namespace=declarative_stream.namespace,
+                        config=config or {},
+                        stream_state=stream_state,
+                        partition_router=partition_router,
+                    )
+
+
+                    partition_generator = StreamSlicerPartitionGenerator(
+                        DeclarativePartitionFactory(
+                            declarative_stream.name,
+                            declarative_stream.get_json_schema(),
+                            self._retriever_factory(
+                                name_to_stream_mapping[declarative_stream.name],
+                                config,
+                                stream_state,
+                            ),
+                            self.message_repository,
+                        ),
+                        cursor,
+                    )
+
+                    concurrent_streams.append(
+                        DefaultStream(
+                            partition_generator=partition_generator,
+                            name=declarative_stream.name,
+                            json_schema=declarative_stream.get_json_schema(),
+                            availability_strategy=AlwaysAvailableAvailabilityStrategy(),
+                            primary_key=get_primary_key_from_stream(declarative_stream.primary_key),
+                            cursor_field=cursor.cursor_field.cursor_field_key,
+                            logger=self.logger,
+                            cursor=cursor,
+                        )
+                    )
                 else:
                     synchronous_streams.append(declarative_stream)
             else:
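
For illustration only (not part of the package diff): the new elif branch above routes datetime-incremental streams whose slicer is a PerPartitionWithGlobalCursor to the concurrent path instead of the synchronous fallback. A simplified, self-contained sketch of that routing decision (the dataclass and its field names are placeholders, not CDK types, and the real check also covers full-refresh and router-less streams):

from dataclasses import dataclass


@dataclass
class StreamTraits:
    has_datetime_cursor: bool
    supports_concurrent_partition_processing: bool
    uses_per_partition_with_global_cursor: bool


def pick_execution_path(traits: StreamTraits) -> str:
    if not (traits.has_datetime_cursor and traits.supports_concurrent_partition_processing):
        return "synchronous"
    if traits.uses_per_partition_with_global_cursor:
        # New in this version: per-partition incremental streams get a concurrent per-partition cursor.
        return "concurrent (per-partition cursor)"
    return "concurrent (datetime cursor)"


print(pick_execution_path(StreamTraits(True, True, True)))  # concurrent (per-partition cursor)
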
@@ -309,11 +372,10 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
     def _is_datetime_incremental_without_partition_routing(
         self,
         declarative_stream: DeclarativeStream,
-        incremental_sync_component_definition: Mapping[str, Any] | None,
+        incremental_sync_component_definition: Mapping[str, Any],
     ) -> bool:
         return (
-            incremental_sync_component_definition is not None
-            and bool(incremental_sync_component_definition)
+            bool(incremental_sync_component_definition)
            and incremental_sync_component_definition.get("type", "")
            == DatetimeBasedCursorModel.__name__
            and self._stream_supports_concurrent_partition_processing(
@@ -7,12 +7,8 @@ version: 1.0.0
 required:
   - type
   - check
+  - streams
   - version
-anyOf:
-  - required:
-      - streams
-  - required:
-      - dynamic_streams
 properties:
   type:
     type: string
@@ -23,10 +19,6 @@ properties:
     type: array
     items:
      "$ref": "#/definitions/DeclarativeStream"
-  dynamic_streams:
-    type: array
-    items:
-      "$ref": "#/definitions/DynamicDeclarativeStream"
   version:
     type: string
     description: The version of the Airbyte CDK used to build and test the source.
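
For illustration only (not part of the package diff): with the two schema hunks above, a manifest must again declare streams at the top level (the anyOf alternative with dynamic_streams is gone), alongside type, check, and version. A hypothetical manifest skeleton, written as the Python mapping a declarative source consumes; the stream name and the elided component bodies are placeholders:

manifest = {
    "type": "DeclarativeSource",
    "version": "0.1.0",
    "check": {"type": "CheckStream", "stream_names": ["example_stream"]},
    "streams": [
        {
            "type": "DeclarativeStream",
            "name": "example_stream",
            # retriever, schema loader, etc. omitted; this only illustrates the required top-level keys
        }
    ],
}
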
@@ -1329,7 +1321,7 @@ definitions:
        type: array
        items:
          - type: string
-        interpolation_context:
+        interpolation_content:
          - config
        examples:
          - ["data"]
@@ -2903,96 +2895,6 @@ definitions:
     $parameters:
       type: object
       additionalProperties: true
-  ComponentMappingDefinition:
-    title: Component Mapping Definition
-    description: (This component is experimental. Use at your own risk.) Specifies a mapping definition to update or add fields in a record or configuration. This allows dynamic mapping of data by interpolating values into the template based on provided contexts.
-    type: object
-    required:
-      - type
-      - field_path
-      - value
-    properties:
-      type:
-        type: string
-        enum: [ComponentMappingDefinition]
-      field_path:
-        title: Field Path
-        description: A list of potentially nested fields indicating the full path where value will be added or updated.
-        type: array
-        items:
-          - type: string
-        interpolation_context:
-          - config
-          - components_values
-          - stream_template_config
-        examples:
-          - ["data"]
-          - ["data", "records"]
-          - ["data", "{{ parameters.name }}"]
-          - ["data", "*", "record"]
-      value:
-        title: Value
-        description: The dynamic or static value to assign to the key. Interpolated values can be used to dynamically determine the value during runtime.
-        type: string
-        interpolation_context:
-          - config
-          - stream_template_config
-          - components_values
-        examples:
-          - "{{ components_values['updates'] }}"
-          - "{{ components_values['MetaData']['LastUpdatedTime'] }}"
-          - "{{ config['segment_id'] }}"
-      value_type:
-        title: Value Type
-        description: The expected data type of the value. If omitted, the type will be inferred from the value provided.
-        "$ref": "#/definitions/ValueType"
-      $parameters:
-        type: object
-        additionalProperties: true
-  HttpComponentsResolver:
-    type: object
-    description: (This component is experimental. Use at your own risk.) Component resolve and populates stream templates with components fetched via an HTTP retriever.
-    properties:
-      type:
-        type: string
-        enum: [HttpComponentsResolver]
-      retriever:
-        title: Retriever
-        description: Component used to coordinate how records are extracted across stream slices and request pages.
-        anyOf:
-          - "$ref": "#/definitions/AsyncRetriever"
-          - "$ref": "#/definitions/CustomRetriever"
-          - "$ref": "#/definitions/SimpleRetriever"
-      components_mapping:
-        type: array
-        items:
-          "$ref": "#/definitions/ComponentMappingDefinition"
-      $parameters:
-        type: object
-        additionalProperties: true
-    required:
-      - type
-      - retriever
-      - components_mapping
-  DynamicDeclarativeStream:
-    type: object
-    description: (This component is experimental. Use at your own risk.) A component that described how will be created declarative streams based on stream template.
-    properties:
-      type:
-        type: string
-        enum: [DynamicDeclarativeStream]
-      stream_template:
-        title: Stream Template
-        description: Reference to the stream template.
-        "$ref": "#/definitions/DeclarativeStream"
-      components_resolver:
-        title: Components Resolver
-        description: Component resolve and populates stream templates with components values.
-        "$ref": "#/definitions/HttpComponentsResolver"
-    required:
-      - type
-      - stream_template
-      - components_resolver
 interpolation:
   variables:
     - title: config
@@ -59,13 +59,11 @@ class ClientSideIncrementalRecordFilterDecorator(RecordFilter):
 
     def __init__(
         self,
-        date_time_based_cursor: DatetimeBasedCursor,
-        substream_cursor: Optional[Union[PerPartitionWithGlobalCursor, GlobalSubstreamCursor]],
+        cursor: Union[DatetimeBasedCursor, PerPartitionWithGlobalCursor, GlobalSubstreamCursor],
         **kwargs: Any,
     ):
         super().__init__(**kwargs)
-        self._date_time_based_cursor = date_time_based_cursor
-        self._substream_cursor = substream_cursor
+        self._cursor = cursor
 
     def filter_records(
         self,
@@ -77,7 +75,7 @@ class ClientSideIncrementalRecordFilterDecorator(RecordFilter):
         records = (
             record
             for record in records
-            if (self._substream_cursor or self._date_time_based_cursor).should_be_synced(
+            if self._cursor.should_be_synced(
                 # Record is created on the fly to align with cursors interface; stream name is ignored as we don't need it here
                 # Record stream name is empty cause it is not used durig the filtering
                 Record(data=record, associated_slice=stream_slice, stream_name="")
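
For illustration only (not part of the package diff): the two hunks above collapse the old pair of cursors (date_time_based_cursor plus optional substream_cursor) into a single cursor attribute that the filter delegates to. A minimal sketch of that unified delegation, using a hypothetical protocol and class name rather than the CDK's own types:

from typing import Any, Iterable, Mapping, Protocol


class SupportsShouldBeSynced(Protocol):
    def should_be_synced(self, record: Mapping[str, Any]) -> bool: ...


class UnifiedClientSideFilter:
    """Sketch only: stands in for ClientSideIncrementalRecordFilterDecorator."""

    def __init__(self, cursor: SupportsShouldBeSynced) -> None:
        # One cursor object replaces the old (substream_cursor or date_time_based_cursor) choice.
        self._cursor = cursor

    def filter_records(self, records: Iterable[Mapping[str, Any]]) -> Iterable[Mapping[str, Any]]:
        return (record for record in records if self._cursor.should_be_synced(record))
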
@@ -2,6 +2,7 @@
 # Copyright (c) 2022 Airbyte, Inc., all rights reserved.
 #
 
+from airbyte_cdk.sources.declarative.incremental.concurrent_partition_cursor import ConcurrentCursorFactory, ConcurrentPerPartitionCursor
 from airbyte_cdk.sources.declarative.incremental.datetime_based_cursor import DatetimeBasedCursor
 from airbyte_cdk.sources.declarative.incremental.declarative_cursor import DeclarativeCursor
 from airbyte_cdk.sources.declarative.incremental.global_substream_cursor import GlobalSubstreamCursor
@@ -14,6 +15,8 @@ from airbyte_cdk.sources.declarative.incremental.resumable_full_refresh_cursor i
 
 __all__ = [
     "CursorFactory",
+    "ConcurrentCursorFactory"
+    "ConcurrentPerPartitionCursor",
     "DatetimeBasedCursor",
     "DeclarativeCursor",
     "GlobalSubstreamCursor",
@@ -0,0 +1,270 @@
+import copy
+
+#
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+#
+import logging
+from collections import OrderedDict
+from typing import Any, Callable, Iterable, Mapping, MutableMapping, Optional
+
+from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
+from airbyte_cdk.sources.declarative.incremental.declarative_cursor import DeclarativeCursor
+from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter
+from airbyte_cdk.sources.message import MessageRepository
+from airbyte_cdk.sources.streams.checkpoint.per_partition_key_serializer import (
+    PerPartitionKeySerializer,
+)
+from airbyte_cdk.sources.streams.concurrent.cursor import Cursor, CursorField
+from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
+from airbyte_cdk.sources.types import Record, StreamSlice, StreamState
+
+logger = logging.getLogger("airbyte")
+
+
+class ConcurrentCursorFactory:
+    def __init__(self, create_function: Callable[..., Cursor]):
+        self._create_function = create_function
+
+    def create(self, stream_state: Mapping[str, Any]) -> Cursor:
+        return self._create_function(stream_state=stream_state)
+
+
+class ConcurrentPerPartitionCursor(Cursor):
+    """
+    Manages state per partition when a stream has many partitions, to prevent data loss or duplication.
+
+    **Partition Limitation and Limit Reached Logic**
+
+    - **DEFAULT_MAX_PARTITIONS_NUMBER**: The maximum number of partitions to keep in memory (default is 10,000).
+    - **_cursor_per_partition**: An ordered dictionary that stores cursors for each partition.
+    - **_over_limit**: A counter that increments each time an oldest partition is removed when the limit is exceeded.
+
+    The class ensures that the number of partitions tracked does not exceed the `DEFAULT_MAX_PARTITIONS_NUMBER` to prevent excessive memory usage.
+
+    - When the number of partitions exceeds the limit, the oldest partitions are removed from `_cursor_per_partition`, and `_over_limit` is incremented accordingly.
+    - The `limit_reached` method returns `True` when `_over_limit` exceeds `DEFAULT_MAX_PARTITIONS_NUMBER`, indicating that the global cursor should be used instead of per-partition cursors.
+
+    This approach avoids unnecessary switching to a global cursor due to temporary spikes in partition counts, ensuring that switching is only done when a sustained high number of partitions is observed.
+    """
+
+    DEFAULT_MAX_PARTITIONS_NUMBER = 10000
+    _NO_STATE: Mapping[str, Any] = {}
+    _NO_CURSOR_STATE: Mapping[str, Any] = {}
+    _KEY = 0
+    _VALUE = 1
+    _state_to_migrate_from: Mapping[str, Any] = {}
+
+    def __init__(
+        self,
+        cursor_factory: ConcurrentCursorFactory,
+        partition_router: PartitionRouter,
+        stream_name: str,
+        stream_namespace: Optional[str],
+        stream_state: Any,
+        message_repository: MessageRepository,
+        connector_state_manager: ConnectorStateManager,
+        cursor_field: CursorField,
+    ) -> None:
+        self._stream_name = stream_name
+        self._stream_namespace = stream_namespace
+        self._message_repository = message_repository
+        self._connector_state_manager = connector_state_manager
+        self._cursor_field = cursor_field
+
+        self._cursor_factory = cursor_factory
+        self._partition_router = partition_router
+
+        # The dict is ordered to ensure that once the maximum number of partitions is reached,
+        # the oldest partitions can be efficiently removed, maintaining the most recent partitions.
+        self._cursor_per_partition: OrderedDict[str, Cursor] = OrderedDict()
+        self._over_limit = 0
+        self._partition_serializer = PerPartitionKeySerializer()
+
+        self._set_initial_state(stream_state)
+
+    @property
+    def cursor_field(self) -> CursorField:
+        return self._cursor_field
+
+    @property
+    def state(self) -> MutableMapping[str, Any]:
+        states = []
+        for partition_tuple, cursor in self._cursor_per_partition.items():
+            cursor_state = cursor._connector_state_converter.convert_to_state_message(
+                cursor._cursor_field, cursor.state
+            )
+            if cursor_state:
+                states.append(
+                    {
+                        "partition": self._to_dict(partition_tuple),
+                        "cursor": copy.deepcopy(cursor_state),
+                    }
+                )
+        state: dict[str, Any] = {"states": states}
+        return state
+
+    def close_partition(self, partition: Partition) -> None:
+        self._cursor_per_partition[self._to_partition_key(partition._stream_slice.partition)].close_partition_without_emit(partition=partition)
+
+    def ensure_at_least_one_state_emitted(self) -> None:
+        """
+        The platform expect to have at least one state message on successful syncs. Hence, whatever happens, we expect this method to be
+        called.
+        """
+        self._emit_state_message()
+
+    def _emit_state_message(self) -> None:
+        self._connector_state_manager.update_state_for_stream(
+            self._stream_name,
+            self._stream_namespace,
+            self.state,
+        )
+        state_message = self._connector_state_manager.create_state_message(
+            self._stream_name, self._stream_namespace
+        )
+        self._message_repository.emit_message(state_message)
+
+
+    def stream_slices(self) -> Iterable[StreamSlice]:
+        slices = self._partition_router.stream_slices()
+        for partition in slices:
+            yield from self.generate_slices_from_partition(partition)
+
+    def generate_slices_from_partition(self, partition: StreamSlice) -> Iterable[StreamSlice]:
+        # Ensure the maximum number of partitions is not exceeded
+        self._ensure_partition_limit()
+
+        cursor = self._cursor_per_partition.get(self._to_partition_key(partition.partition))
+        if not cursor:
+            partition_state = (
+                self._state_to_migrate_from
+                if self._state_to_migrate_from
+                else self._NO_CURSOR_STATE
+            )
+            cursor = self._create_cursor(partition_state)
+            self._cursor_per_partition[self._to_partition_key(partition.partition)] = cursor
+
+        for cursor_slice in cursor.stream_slices():
+            yield StreamSlice(
+                partition=partition, cursor_slice=cursor_slice, extra_fields=partition.extra_fields
+            )
+
+    def _ensure_partition_limit(self) -> None:
+        """
+        Ensure the maximum number of partitions is not exceeded. If so, the oldest added partition will be dropped.
+        """
+        while len(self._cursor_per_partition) > self.DEFAULT_MAX_PARTITIONS_NUMBER - 1:
+            self._over_limit += 1
+            oldest_partition = self._cursor_per_partition.popitem(last=False)[
+                0
+            ]  # Remove the oldest partition
+            logger.warning(
+                f"The maximum number of partitions has been reached. Dropping the oldest partition: {oldest_partition}. Over limit: {self._over_limit}."
+            )
+
+    def limit_reached(self) -> bool:
+        return self._over_limit > self.DEFAULT_MAX_PARTITIONS_NUMBER
+
+    def _set_initial_state(self, stream_state: StreamState) -> None:
+        """
+        Set the initial state for the cursors.
+
+        This method initializes the state for each partition cursor using the provided stream state.
+        If a partition state is provided in the stream state, it will update the corresponding partition cursor with this state.
+
+        Additionally, it sets the parent state for partition routers that are based on parent streams. If a partition router
+        does not have parent streams, this step will be skipped due to the default PartitionRouter implementation.
+
+        Args:
+            stream_state (StreamState): The state of the streams to be set. The format of the stream state should be:
+                {
+                    "states": [
+                        {
+                            "partition": {
+                                "partition_key": "value"
+                            },
+                            "cursor": {
+                                "last_updated": "2023-05-27T00:00:00Z"
+                            }
+                        }
+                    ],
+                    "parent_state": {
+                        "parent_stream_name": {
+                            "last_updated": "2023-05-27T00:00:00Z"
+                        }
+                    }
+                }
+        """
+        if not stream_state:
+            return
+
+        if "states" not in stream_state:
+            # We assume that `stream_state` is in a global format that can be applied to all partitions.
+            # Example: {"global_state_format_key": "global_state_format_value"}
+            self._state_to_migrate_from = stream_state
+
+        else:
+            for state in stream_state["states"]:
+                self._cursor_per_partition[self._to_partition_key(state["partition"])] = (
+                    self._create_cursor(state["cursor"])
+                )
+
+            # set default state for missing partitions if it is per partition with fallback to global
+            if "state" in stream_state:
+                self._state_to_migrate_from = stream_state["state"]
+
+        # Set parent state for partition routers based on parent streams
+        self._partition_router.set_initial_state(stream_state)
+
+    def observe(self, record: Record) -> None:
+        self._cursor_per_partition[self._to_partition_key(record.associated_slice.partition)].observe(record)
+
+    def _to_partition_key(self, partition: Mapping[str, Any]) -> str:
+        return self._partition_serializer.to_partition_key(partition)
+
+    def _to_dict(self, partition_key: str) -> Mapping[str, Any]:
+        return self._partition_serializer.to_partition(partition_key)
+
+    def _create_cursor(self, cursor_state: Any) -> DeclarativeCursor:
+        cursor = self._cursor_factory.create(stream_state=cursor_state)
+        return cursor
+
+    def should_be_synced(self, record: Record) -> bool:
+        return self._get_cursor(record).should_be_synced(record)
+
+    def is_greater_than_or_equal(self, first: Record, second: Record) -> bool:
+        if not first.associated_slice or not second.associated_slice:
+            raise ValueError(
+                f"Both records should have an associated slice but got {first.associated_slice} and {second.associated_slice}"
+            )
+        if first.associated_slice.partition != second.associated_slice.partition:
+            raise ValueError(
+                f"To compare records, partition should be the same but got {first.associated_slice.partition} and {second.associated_slice.partition}"
+            )
+
+        return self._get_cursor(first).is_greater_than_or_equal(
+            self._convert_record_to_cursor_record(first),
+            self._convert_record_to_cursor_record(second),
+        )
+
+    @staticmethod
+    def _convert_record_to_cursor_record(record: Record) -> Record:
+        return Record(
+            record.data,
+            StreamSlice(partition={}, cursor_slice=record.associated_slice.cursor_slice)
+            if record.associated_slice
+            else None,
+        )
+
+    def _get_cursor(self, record: Record) -> Cursor:
+        if not record.associated_slice:
+            raise ValueError(
+                "Invalid state as stream slices that are emitted should refer to an existing cursor"
+            )
+        partition_key = self._to_partition_key(record.associated_slice.partition)
+        if partition_key not in self._cursor_per_partition:
+            raise ValueError(
+                "Invalid state as stream slices that are emitted should refer to an existing cursor"
+            )
+        cursor = self._cursor_per_partition[partition_key]
+        return cursor
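
For illustration only (not part of the package diff): the docstring of ConcurrentPerPartitionCursor above describes evicting the oldest partition cursors once DEFAULT_MAX_PARTITIONS_NUMBER is reached, and only falling back to a global cursor after a sustained overflow. A stripped-down, runnable sketch of that eviction and limit_reached logic, with the limit shrunk so the behaviour is visible:

from collections import OrderedDict

MAX_PARTITIONS = 3  # the class default above is 10,000; shrunk here for demonstration
cursors: "OrderedDict[str, dict]" = OrderedDict()
over_limit = 0


def ensure_partition_limit() -> None:
    global over_limit
    while len(cursors) > MAX_PARTITIONS - 1:
        over_limit += 1
        oldest_key, _ = cursors.popitem(last=False)  # evict the least recently added partition
        print(f"dropping {oldest_key}, over_limit={over_limit}")


def limit_reached() -> bool:
    # Only a sustained overflow (not a brief spike) should trigger the switch to a global cursor.
    return over_limit > MAX_PARTITIONS


for key in ["p1", "p2", "p3", "p4", "p5", "p6", "p7"]:
    ensure_partition_limit()
    cursors[key] = {}

print(limit_reached())  # True once more than MAX_PARTITIONS evictions have happened
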
@@ -303,6 +303,15 @@ class PerPartitionCursor(DeclarativeCursor):
         raise ValueError("A partition needs to be provided in order to get request body json")
 
     def should_be_synced(self, record: Record) -> bool:
+        if self._to_partition_key(record.associated_slice.partition) not in self._cursor_per_partition:
+            partition_state = (
+                self._state_to_migrate_from
+                if self._state_to_migrate_from
+                else self._NO_CURSOR_STATE
+            )
+            cursor = self._create_cursor(partition_state)
+
+            self._cursor_per_partition[self._to_partition_key(record.associated_slice.partition)] = cursor
         return self._get_cursor(record).should_be_synced(
             self._convert_record_to_cursor_record(record)
         )
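
For illustration only (not part of the package diff): the final hunk makes PerPartitionCursor.should_be_synced create a cursor on the fly for a partition it has not seen yet, seeding it from the migrated global state when one is available. A small self-contained sketch of that lazy-initialization pattern, using plain dictionaries and hypothetical names in place of the CDK classes:

from typing import Any, Dict, Mapping

_cursor_per_partition: Dict[str, Dict[str, Any]] = {}
_state_to_migrate_from: Mapping[str, Any] = {"last_updated": "2023-05-27T00:00:00Z"}


def get_or_create_partition_cursor(partition_key: str) -> Dict[str, Any]:
    if partition_key not in _cursor_per_partition:
        # Unseen partition: seed its cursor from the migrated global state when one exists.
        seed = dict(_state_to_migrate_from) if _state_to_migrate_from else {}
        _cursor_per_partition[partition_key] = {"state": seed}
    return _cursor_per_partition[partition_key]


print(get_or_create_partition_cursor("account_1"))
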