airbyte-cdk 6.17.1.dev0__py3-none-any.whl → 6.18.0__py3-none-any.whl

@@ -20,9 +20,6 @@ from airbyte_cdk.sources.declarative.extractors.record_filter import (
20
20
  ClientSideIncrementalRecordFilterDecorator,
21
21
  )
22
22
  from airbyte_cdk.sources.declarative.incremental.datetime_based_cursor import DatetimeBasedCursor
23
- from airbyte_cdk.sources.declarative.incremental.per_partition_with_global import (
24
- PerPartitionWithGlobalCursor,
25
- )
26
23
  from airbyte_cdk.sources.declarative.interpolation import InterpolatedString
27
24
  from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource
28
25
  from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
@@ -307,72 +304,6 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
307
304
  cursor=final_state_cursor,
308
305
  )
309
306
  )
310
- elif (
311
- incremental_sync_component_definition
312
- and incremental_sync_component_definition.get("type", "")
313
- == DatetimeBasedCursorModel.__name__
314
- and self._stream_supports_concurrent_partition_processing(
315
- declarative_stream=declarative_stream
316
- )
317
- and hasattr(declarative_stream.retriever, "stream_slicer")
318
- and isinstance(
319
- declarative_stream.retriever.stream_slicer, PerPartitionWithGlobalCursor
320
- )
321
- ):
322
- stream_state = state_manager.get_stream_state(
323
- stream_name=declarative_stream.name, namespace=declarative_stream.namespace
324
- )
325
- partition_router = declarative_stream.retriever.stream_slicer._partition_router
326
-
327
- cursor = self._constructor.create_concurrent_cursor_from_perpartition_cursor(
328
- state_manager=state_manager,
329
- model_type=DatetimeBasedCursorModel,
330
- component_definition=incremental_sync_component_definition,
331
- stream_name=declarative_stream.name,
332
- stream_namespace=declarative_stream.namespace,
333
- config=config or {},
334
- stream_state=stream_state,
335
- partition_router=partition_router,
336
- )
337
-
338
- retriever = declarative_stream.retriever
339
-
340
- # This is an optimization so that we don't invoke any cursor or state management flows within the
341
- # low-code framework because state management is handled through the ConcurrentCursor.
342
- if declarative_stream and isinstance(retriever, SimpleRetriever):
343
- # Also a temporary hack. In the legacy Stream implementation, as part of the read,
344
- # set_initial_state() is called to instantiate incoming state on the cursor. Although we no
345
- # longer rely on the legacy low-code cursor for concurrent checkpointing, low-code components
346
- # like StopConditionPaginationStrategyDecorator and ClientSideIncrementalRecordFilterDecorator
347
- # still rely on a DatetimeBasedCursor that is properly initialized with state.
348
- if retriever.cursor:
349
- retriever.cursor.set_initial_state(stream_state=stream_state)
350
- # We zero it out here, but since this is a cursor reference, the state is still properly
351
- # instantiated for the other components that reference it
352
- retriever.cursor = None
353
-
354
- partition_generator = StreamSlicerPartitionGenerator(
355
- DeclarativePartitionFactory(
356
- declarative_stream.name,
357
- declarative_stream.get_json_schema(),
358
- retriever,
359
- self.message_repository,
360
- ),
361
- cursor,
362
- )
363
-
364
- concurrent_streams.append(
365
- DefaultStream(
366
- partition_generator=partition_generator,
367
- name=declarative_stream.name,
368
- json_schema=declarative_stream.get_json_schema(),
369
- availability_strategy=AlwaysAvailableAvailabilityStrategy(),
370
- primary_key=get_primary_key_from_stream(declarative_stream.primary_key),
371
- cursor_field=cursor.cursor_field.cursor_field_key,
372
- logger=self.logger,
373
- cursor=cursor,
374
- )
375
- )
376
307
  else:
377
308
  synchronous_streams.append(declarative_stream)
378
309
  else:
@@ -2977,6 +2977,11 @@ definitions:
2977
2977
  anyOf:
2978
2978
  - "$ref": "#/definitions/CustomRequester"
2979
2979
  - "$ref": "#/definitions/HttpRequester"
2980
+ url_requester:
2981
+ description: Requester component that describes how to prepare HTTP requests to send to the source API to extract the download URL from the polling response of a completed async job.
2982
+ anyOf:
2983
+ - "$ref": "#/definitions/CustomRequester"
2984
+ - "$ref": "#/definitions/HttpRequester"
2980
2985
  download_requester:
2981
2986
  description: Requester component that describes how to prepare HTTP requests to send to the source API to download the data provided by the completed async job.
2982
2987
  anyOf:
@@ -59,11 +59,13 @@ class ClientSideIncrementalRecordFilterDecorator(RecordFilter):
59
59
 
60
60
  def __init__(
61
61
  self,
62
- cursor: Union[DatetimeBasedCursor, PerPartitionWithGlobalCursor, GlobalSubstreamCursor],
62
+ date_time_based_cursor: DatetimeBasedCursor,
63
+ substream_cursor: Optional[Union[PerPartitionWithGlobalCursor, GlobalSubstreamCursor]],
63
64
  **kwargs: Any,
64
65
  ):
65
66
  super().__init__(**kwargs)
66
- self._cursor = cursor
67
+ self._date_time_based_cursor = date_time_based_cursor
68
+ self._substream_cursor = substream_cursor
67
69
 
68
70
  def filter_records(
69
71
  self,
@@ -75,7 +77,7 @@ class ClientSideIncrementalRecordFilterDecorator(RecordFilter):
75
77
  records = (
76
78
  record
77
79
  for record in records
78
- if self._cursor.should_be_synced(
80
+ if (self._substream_cursor or self._date_time_based_cursor).should_be_synced(
79
81
  # Record is created on the fly to align with cursors interface; stream name is ignored as we don't need it here
80
82
  # Record stream name is empty because it is not used during the filtering
81
83
  Record(data=record, associated_slice=stream_slice, stream_name="")
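To make the new constructor contract concrete, here is a minimal, self-contained sketch of the selection logic the decorator now applies: when a substream cursor is supplied it performs the client-side filtering, otherwise the datetime-based cursor does. The stub cursor below is a hypothetical stand-in, not a real CDK class.

```python
# Minimal sketch of the cursor-selection fallback used by
# ClientSideIncrementalRecordFilterDecorator; StubCursor is a hypothetical
# stand-in, not an airbyte_cdk class.
from typing import Any, Iterable, Mapping, Optional


class StubCursor:
    """Toy cursor that accepts or rejects every record."""

    def __init__(self, name: str, accept: bool) -> None:
        self.name = name
        self._accept = accept

    def should_be_synced(self, record: Mapping[str, Any]) -> bool:
        return self._accept


def filter_records(
    records: Iterable[Mapping[str, Any]],
    date_time_based_cursor: StubCursor,
    substream_cursor: Optional[StubCursor],
) -> list[Mapping[str, Any]]:
    # Same precedence as the decorator: the substream cursor wins when present.
    active_cursor = substream_cursor or date_time_based_cursor
    return [record for record in records if active_cursor.should_be_synced(record)]


records = [{"id": 1, "updated_at": "2024-01-01"}]
dt_cursor = StubCursor("datetime", accept=True)
sub_cursor = StubCursor("substream", accept=False)

print(filter_records(records, dt_cursor, sub_cursor))  # [] - the substream cursor decided
print(filter_records(records, dt_cursor, None))        # [{'id': 1, ...}] - the datetime cursor decided
```

This mirrors the `self._substream_cursor or self._date_time_based_cursor` fallback in `filter_records` above and the `client_side_incremental_sync` kwargs the factory builds later in this diff.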
@@ -2,10 +2,6 @@
2
2
  # Copyright (c) 2022 Airbyte, Inc., all rights reserved.
3
3
  #
4
4
 
5
- from airbyte_cdk.sources.declarative.incremental.concurrent_partition_cursor import (
6
- ConcurrentCursorFactory,
7
- ConcurrentPerPartitionCursor,
8
- )
9
5
  from airbyte_cdk.sources.declarative.incremental.datetime_based_cursor import DatetimeBasedCursor
10
6
  from airbyte_cdk.sources.declarative.incremental.declarative_cursor import DeclarativeCursor
11
7
  from airbyte_cdk.sources.declarative.incremental.global_substream_cursor import (
@@ -25,8 +21,6 @@ from airbyte_cdk.sources.declarative.incremental.resumable_full_refresh_cursor i
25
21
 
26
22
  __all__ = [
27
23
  "CursorFactory",
28
- "ConcurrentCursorFactory",
29
- "ConcurrentPerPartitionCursor",
30
24
  "DatetimeBasedCursor",
31
25
  "DeclarativeCursor",
32
26
  "GlobalSubstreamCursor",
@@ -303,20 +303,6 @@ class PerPartitionCursor(DeclarativeCursor):
303
303
  raise ValueError("A partition needs to be provided in order to get request body json")
304
304
 
305
305
  def should_be_synced(self, record: Record) -> bool:
306
- if (
307
- self._to_partition_key(record.associated_slice.partition)
308
- not in self._cursor_per_partition
309
- ):
310
- partition_state = (
311
- self._state_to_migrate_from
312
- if self._state_to_migrate_from
313
- else self._NO_CURSOR_STATE
314
- )
315
- cursor = self._create_cursor(partition_state)
316
-
317
- self._cursor_per_partition[
318
- self._to_partition_key(record.associated_slice.partition)
319
- ] = cursor
320
306
  return self._get_cursor(record).should_be_synced(
321
307
  self._convert_record_to_cursor_record(record)
322
308
  )
@@ -737,33 +737,43 @@ class KeysToSnakeCase(BaseModel):
737
737
  parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
738
738
 
739
739
 
740
+ class FlattenFields(BaseModel):
741
+ type: Literal["FlattenFields"]
742
+ flatten_lists: Optional[bool] = Field(
743
+ True,
744
+ description="Whether to flatten lists or leave it as is. Default is True.",
745
+ title="Flatten Lists",
746
+ )
747
+ parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
748
+
749
+
740
750
  class KeysReplace(BaseModel):
741
751
  type: Literal["KeysReplace"]
742
752
  old: str = Field(
743
753
  ...,
744
754
  description="Old value to replace.",
745
- examples=[" ", "{{ record.id }}", "{{ config['id'] }}", "{{ stream_slice['id'] }}"],
755
+ examples=[
756
+ " ",
757
+ "{{ record.id }}",
758
+ "{{ config['id'] }}",
759
+ "{{ stream_slice['id'] }}",
760
+ ],
746
761
  title="Old value",
747
762
  )
748
763
  new: str = Field(
749
764
  ...,
750
765
  description="New value to set.",
751
- examples=["_", "{{ record.id }}", "{{ config['id'] }}", "{{ stream_slice['id'] }}"],
766
+ examples=[
767
+ "_",
768
+ "{{ record.id }}",
769
+ "{{ config['id'] }}",
770
+ "{{ stream_slice['id'] }}",
771
+ ],
752
772
  title="New value",
753
773
  )
754
774
  parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
755
775
 
756
776
 
757
- class FlattenFields(BaseModel):
758
- type: Literal["FlattenFields"]
759
- flatten_lists: Optional[bool] = Field(
760
- True,
761
- description="Whether to flatten lists or leave it as is. Default is True.",
762
- title="Flatten Lists",
763
- )
764
- parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
765
-
766
-
767
777
  class IterableDecoder(BaseModel):
768
778
  type: Literal["IterableDecoder"]
769
779
 
@@ -2040,6 +2050,10 @@ class AsyncRetriever(BaseModel):
2040
2050
  ...,
2041
2051
  description="Requester component that describes how to prepare HTTP requests to send to the source API to fetch the status of the running async job.",
2042
2052
  )
2053
+ url_requester: Optional[Union[CustomRequester, HttpRequester]] = Field(
2054
+ None,
2055
+ description="Requester component that describes how to prepare HTTP requests to send to the source API to extract the download URL from the polling response of a completed async job.",
2056
+ )
2043
2057
  download_requester: Union[CustomRequester, HttpRequester] = Field(
2044
2058
  ...,
2045
2059
  description="Requester component that describes how to prepare HTTP requests to send to the source API to download the data provided by the completed async job.",
@@ -84,8 +84,6 @@ from airbyte_cdk.sources.declarative.extractors.record_filter import (
84
84
  )
85
85
  from airbyte_cdk.sources.declarative.incremental import (
86
86
  ChildPartitionResumableFullRefreshCursor,
87
- ConcurrentCursorFactory,
88
- ConcurrentPerPartitionCursor,
89
87
  CursorFactory,
90
88
  DatetimeBasedCursor,
91
89
  DeclarativeCursor,
@@ -440,7 +438,6 @@ from airbyte_cdk.sources.message import (
440
438
  InMemoryMessageRepository,
441
439
  LogAppenderMessageRepositoryDecorator,
442
440
  MessageRepository,
443
- NoopMessageRepository,
444
441
  )
445
442
  from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, CursorField
446
443
  from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import (
@@ -874,8 +871,6 @@ class ModelToComponentFactory:
874
871
  stream_namespace: Optional[str],
875
872
  config: Config,
876
873
  stream_state: MutableMapping[str, Any],
877
- message_repository: Optional[MessageRepository] = None,
878
- runtime_lookback_window: Optional[int] = None,
879
874
  **kwargs: Any,
880
875
  ) -> ConcurrentCursor:
881
876
  component_type = component_definition.get("type")
@@ -933,11 +928,6 @@ class ModelToComponentFactory:
933
928
  if evaluated_lookback_window:
934
929
  lookback_window = parse_duration(evaluated_lookback_window)
935
930
 
936
- if runtime_lookback_window and lookback_window:
937
- lookback_window = max(lookback_window, runtime_lookback_window)
938
- elif runtime_lookback_window:
939
- lookback_window = runtime_lookback_window
940
-
941
931
  connector_state_converter: DateTimeStreamStateConverter
942
932
  connector_state_converter = CustomFormatConcurrentStreamStateConverter(
943
933
  datetime_format=datetime_format,
@@ -1016,7 +1006,7 @@ class ModelToComponentFactory:
1016
1006
  stream_name=stream_name,
1017
1007
  stream_namespace=stream_namespace,
1018
1008
  stream_state=stream_state,
1019
- message_repository=message_repository or self._message_repository,
1009
+ message_repository=self._message_repository,
1020
1010
  connector_state_manager=state_manager,
1021
1011
  connector_state_converter=connector_state_converter,
1022
1012
  cursor_field=cursor_field,
@@ -1028,63 +1018,6 @@ class ModelToComponentFactory:
1028
1018
  cursor_granularity=cursor_granularity,
1029
1019
  )
1030
1020
 
1031
- def create_concurrent_cursor_from_perpartition_cursor(
1032
- self,
1033
- state_manager: ConnectorStateManager,
1034
- model_type: Type[BaseModel],
1035
- component_definition: ComponentDefinition,
1036
- stream_name: str,
1037
- stream_namespace: Optional[str],
1038
- config: Config,
1039
- stream_state: MutableMapping[str, Any],
1040
- partition_router,
1041
- **kwargs: Any,
1042
- ) -> ConcurrentPerPartitionCursor:
1043
- component_type = component_definition.get("type")
1044
- if component_definition.get("type") != model_type.__name__:
1045
- raise ValueError(
1046
- f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
1047
- )
1048
-
1049
- datetime_based_cursor_model = model_type.parse_obj(component_definition)
1050
-
1051
- if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel):
1052
- raise ValueError(
1053
- f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}"
1054
- )
1055
-
1056
- interpolated_cursor_field = InterpolatedString.create(
1057
- datetime_based_cursor_model.cursor_field,
1058
- parameters=datetime_based_cursor_model.parameters or {},
1059
- )
1060
- cursor_field = CursorField(interpolated_cursor_field.eval(config=config))
1061
-
1062
- # Create the cursor factory
1063
- cursor_factory = ConcurrentCursorFactory(
1064
- partial(
1065
- self.create_concurrent_cursor_from_datetime_based_cursor,
1066
- state_manager=state_manager,
1067
- model_type=model_type,
1068
- component_definition=component_definition,
1069
- stream_name=stream_name,
1070
- stream_namespace=stream_namespace,
1071
- config=config,
1072
- message_repository=NoopMessageRepository(),
1073
- )
1074
- )
1075
-
1076
- # Return the concurrent cursor and state converter
1077
- return ConcurrentPerPartitionCursor(
1078
- cursor_factory=cursor_factory,
1079
- partition_router=partition_router,
1080
- stream_name=stream_name,
1081
- stream_namespace=stream_namespace,
1082
- stream_state=stream_state,
1083
- message_repository=self._message_repository, # type: ignore
1084
- connector_state_manager=state_manager,
1085
- cursor_field=cursor_field,
1086
- )
1087
-
1088
1021
  @staticmethod
1089
1022
  def create_constant_backoff_strategy(
1090
1023
  model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any
@@ -1367,15 +1300,18 @@ class ModelToComponentFactory:
1367
1300
  raise ValueError(
1368
1301
  "Unsupported Slicer is used. PerPartitionWithGlobalCursor should be used here instead"
1369
1302
  )
1370
- cursor = (
1371
- combined_slicers
1372
- if isinstance(
1373
- combined_slicers, (PerPartitionWithGlobalCursor, GlobalSubstreamCursor)
1374
- )
1375
- else self._create_component_from_model(model=model.incremental_sync, config=config)
1376
- )
1377
-
1378
- client_side_incremental_sync = {"cursor": cursor}
1303
+ client_side_incremental_sync = {
1304
+ "date_time_based_cursor": self._create_component_from_model(
1305
+ model=model.incremental_sync, config=config
1306
+ ),
1307
+ "substream_cursor": (
1308
+ combined_slicers
1309
+ if isinstance(
1310
+ combined_slicers, (PerPartitionWithGlobalCursor, GlobalSubstreamCursor)
1311
+ )
1312
+ else None
1313
+ ),
1314
+ }
1379
1315
 
1380
1316
  if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel):
1381
1317
  cursor_model = model.incremental_sync
@@ -2191,7 +2127,7 @@ class ModelToComponentFactory:
2191
2127
  if (
2192
2128
  not isinstance(stream_slicer, DatetimeBasedCursor)
2193
2129
  or type(stream_slicer) is not DatetimeBasedCursor
2194
- ) and not isinstance(stream_slicer, PerPartitionWithGlobalCursor):
2130
+ ):
2195
2131
  # Many of the custom component implementations of DatetimeBasedCursor override get_request_params() (or other methods).
2196
2132
  # Because we're decoupling RequestOptionsProvider from the Cursor, custom components will eventually need to reimplement
2197
2133
  # their own RequestOptionsProvider. However, right now the existing StreamSlicer/Cursor still can act as the SimpleRetriever's
@@ -2351,7 +2287,7 @@ class ModelToComponentFactory:
2351
2287
  extractor=download_extractor,
2352
2288
  name=name,
2353
2289
  record_filter=None,
2354
- transformations=[],
2290
+ transformations=transformations,
2355
2291
  schema_normalization=TypeTransformer(TransformConfig.NoTransform),
2356
2292
  config=config,
2357
2293
  parameters={},
@@ -2388,6 +2324,16 @@ class ModelToComponentFactory:
2388
2324
  if model.delete_requester
2389
2325
  else None
2390
2326
  )
2327
+ url_requester = (
2328
+ self._create_component_from_model(
2329
+ model=model.url_requester,
2330
+ decoder=decoder,
2331
+ config=config,
2332
+ name=f"job extract_url - {name}",
2333
+ )
2334
+ if model.url_requester
2335
+ else None
2336
+ )
2391
2337
  status_extractor = self._create_component_from_model(
2392
2338
  model=model.status_extractor, decoder=decoder, config=config, name=name
2393
2339
  )
@@ -2398,6 +2344,7 @@ class ModelToComponentFactory:
2398
2344
  creation_requester=creation_requester,
2399
2345
  polling_requester=polling_requester,
2400
2346
  download_retriever=download_retriever,
2347
+ url_requester=url_requester,
2401
2348
  abort_requester=abort_requester,
2402
2349
  delete_requester=delete_requester,
2403
2350
  status_extractor=status_extractor,
@@ -0,0 +1,57 @@
1
+ # AsyncHttpJobRepository sequence diagram
2
+
3
+ - Components marked as optional are not required and can be ignored.
4
+ - If `url_requester` is not provided, `urls_extractor` will get the URLs from the `polling_job_response`.
5
+ - The interpolation context, e.g. `create_job_response` or `polling_job_response`, can be obtained from the `stream_slice`.
6
+
7
+
8
+ ```mermaid
9
+ ---
10
+ title: AsyncHttpJobRepository Sequence Diagram
11
+ ---
12
+ sequenceDiagram
13
+ participant AsyncHttpJobRepository as AsyncOrchestrator
14
+ participant CreationRequester as creation_requester
15
+ participant PollingRequester as polling_requester
16
+ participant UrlRequester as url_requester (Optional)
17
+ participant DownloadRetriever as download_retriever
18
+ participant AbortRequester as abort_requester (Optional)
19
+ participant DeleteRequester as delete_requester (Optional)
20
+ participant Reporting Server as Async Reporting Server
21
+
22
+ AsyncHttpJobRepository ->> CreationRequester: Initiate job creation
23
+ CreationRequester ->> Reporting Server: Create job request
24
+ Reporting Server -->> CreationRequester: Job ID response
25
+ CreationRequester -->> AsyncHttpJobRepository: Job ID
26
+
27
+ loop Poll for job status
28
+ AsyncHttpJobRepository ->> PollingRequester: Check job status
29
+ PollingRequester ->> Reporting Server: Status request (interpolation_context: `create_job_response`)
30
+ Reporting Server -->> PollingRequester: Status response
31
+ PollingRequester -->> AsyncHttpJobRepository: Job status
32
+ end
33
+
34
+ alt Status: Ready
35
+ AsyncHttpJobRepository ->> UrlRequester: Request download URLs (if applicable)
36
+ UrlRequester ->> Reporting Server: URL request (interpolation_context: `polling_job_response`)
37
+ Reporting Server -->> UrlRequester: Download URLs
38
+ UrlRequester -->> AsyncHttpJobRepository: Download URLs
39
+
40
+ AsyncHttpJobRepository ->> DownloadRetriever: Download reports
41
+ DownloadRetriever ->> Reporting Server: Retrieve report data (interpolation_context: `url`)
42
+ Reporting Server -->> DownloadRetriever: Report data
43
+ DownloadRetriever -->> AsyncHttpJobRepository: Report data
44
+ else Status: Failed
45
+ AsyncHttpJobRepository ->> AbortRequester: Send abort request
46
+ AbortRequester ->> Reporting Server: Abort job
47
+ Reporting Server -->> AbortRequester: Abort confirmation
48
+ AbortRequester -->> AsyncHttpJobRepository: Confirmation
49
+ end
50
+
51
+ AsyncHttpJobRepository ->> DeleteRequester: Send delete job request
52
+ DeleteRequester ->> Reporting Server: Delete job
53
+ Reporting Server -->> DeleteRequester: Deletion confirmation
54
+ DeleteRequester -->> AsyncHttpJobRepository: Confirmation
55
+
56
+
57
+ ```
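As a rough illustration of where the new optional `url_requester` slots in, the sketch below assembles a partial `AsyncRetriever` component definition as a Python dict. Every endpoint, path, and interpolation expression is invented for this sketch; only `url_requester` is what this release adds, and the other requester fields already exist in the AsyncRetriever schema.

```python
# Hypothetical, partial AsyncRetriever component definition illustrating the new
# optional url_requester. All URLs and interpolation expressions are assumptions
# made up for this sketch; only the field names come from the schema.
async_retriever_definition = {
    "type": "AsyncRetriever",
    "creation_requester": {
        "type": "HttpRequester",
        "url_base": "https://api.example.com/",  # assumed API
        "path": "reports",
        "http_method": "POST",
    },
    "polling_requester": {
        "type": "HttpRequester",
        "url_base": "https://api.example.com/",
        # The create_job_response is exposed through the stream_slice interpolation context.
        "path": "reports/{{ stream_slice['create_job_response'].json()['id'] }}",
    },
    # New in this release: only needed when the polling response does not itself contain
    # the download URLs and an extra request is required to fetch them.
    "url_requester": {
        "type": "HttpRequester",
        "url_base": "https://api.example.com/",
        "path": "reports/{{ stream_slice['polling_job_response'].json()['id'] }}/files",
    },
    "download_requester": {
        "type": "HttpRequester",
        # The extracted download URL is injected into the slice's extra_fields as "url";
        # how it is referenced here is an assumption.
        "url_base": "{{ stream_slice.extra_fields['url'] }}",
        "path": "",
    },
    # abort_requester, delete_requester, status_extractor, urls_extractor, etc. omitted for brevity.
}
```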
@@ -31,6 +31,10 @@ LOGGER = logging.getLogger("airbyte")
31
31
 
32
32
  @dataclass
33
33
  class AsyncHttpJobRepository(AsyncJobRepository):
34
+ """
35
+ See Readme file for more details about flow.
36
+ """
37
+
34
38
  creation_requester: Requester
35
39
  polling_requester: Requester
36
40
  download_retriever: SimpleRetriever
@@ -44,6 +48,9 @@ class AsyncHttpJobRepository(AsyncJobRepository):
44
48
  record_extractor: RecordExtractor = field(
45
49
  init=False, repr=False, default_factory=lambda: ResponseToFileExtractor({})
46
50
  )
51
+ url_requester: Optional[Requester] = (
52
+ None  # used when polling_requester only provides some <id> and an extra request is needed to obtain the list of URLs to download from
53
+ )
47
54
 
48
55
  def __post_init__(self) -> None:
49
56
  self._create_job_response_by_id: Dict[str, Response] = {}
@@ -186,10 +193,13 @@ class AsyncHttpJobRepository(AsyncJobRepository):
186
193
 
187
194
  """
188
195
 
189
- for url in self.urls_extractor.extract_records(
190
- self._polling_job_response_by_id[job.api_job_id()]
191
- ):
192
- stream_slice: StreamSlice = StreamSlice(partition={"url": url}, cursor_slice={})
196
+ for url in self._get_download_url(job):
197
+ job_slice = job.job_parameters()
198
+ stream_slice = StreamSlice(
199
+ partition=job_slice.partition,
200
+ cursor_slice=job_slice.cursor_slice,
201
+ extra_fields={**job_slice.extra_fields, "url": url},
202
+ )
193
203
  for message in self.download_retriever.read_records({}, stream_slice):
194
204
  if isinstance(message, Record):
195
205
  yield message.data
@@ -226,3 +236,22 @@ class AsyncHttpJobRepository(AsyncJobRepository):
226
236
  cursor_slice={},
227
237
  )
228
238
  return stream_slice
239
+
240
+ def _get_download_url(self, job: AsyncJob) -> Iterable[str]:
241
+ if not self.url_requester:
242
+ url_response = self._polling_job_response_by_id[job.api_job_id()]
243
+ else:
244
+ stream_slice: StreamSlice = StreamSlice(
245
+ partition={
246
+ "polling_job_response": self._polling_job_response_by_id[job.api_job_id()]
247
+ },
248
+ cursor_slice={},
249
+ )
250
+ url_response = self.url_requester.send_request(stream_slice=stream_slice) # type: ignore # we expect url_requester to always be presented, otherwise raise an exception as we cannot proceed with the report
251
+ if not url_response:
252
+ raise AirbyteTracedException(
253
+ internal_message="Always expect a response or an exception from url_requester",
254
+ failure_type=FailureType.system_error,
255
+ )
256
+
257
+ yield from self.urls_extractor.extract_records(url_response) # type: ignore # we expect urls_extractor to always return list of strings
@@ -160,7 +160,7 @@ class SimpleRetriever(Retriever):
160
160
  stream_slice,
161
161
  next_page_token,
162
162
  self._paginator.get_request_headers,
163
- self.request_option_provider.get_request_headers,
163
+ self.stream_slicer.get_request_headers,
164
164
  )
165
165
  if isinstance(headers, str):
166
166
  raise ValueError("Request headers cannot be a string")
@@ -152,3 +152,6 @@ class StreamSlice(Mapping[str, Any]):
152
152
 
153
153
  def __hash__(self) -> int:
154
154
  return hash(orjson.dumps(self._stream_slice, option=orjson.OPT_SORT_KEYS))
155
+
156
+ def __bool__(self) -> bool:
157
+ return bool(self._stream_slice) or bool(self._extra_fields)
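A small sketch of what the new `__bool__` means in practice, assuming the `StreamSlice` constructor used elsewhere in this diff (`partition`, `cursor_slice`, and an optional `extra_fields` keyword argument): a slice is now falsy only when both its mapping and its extra fields are empty.

```python
# Illustration of the new StreamSlice.__bool__; assumes the constructor signature
# (partition, cursor_slice, extra_fields) used elsewhere in this diff.
from airbyte_cdk.sources.types import StreamSlice

empty_slice = StreamSlice(partition={}, cursor_slice={})
url_only_slice = StreamSlice(
    partition={},
    cursor_slice={},
    extra_fields={"url": "https://example.com/report.csv"},
)

assert not empty_slice      # nothing in the mapping, nothing in extra_fields -> falsy
assert url_only_slice       # extra_fields alone is now enough to make the slice truthy
assert StreamSlice(partition={"account_id": "123"}, cursor_slice={})  # non-empty mapping -> truthy
```

The practical effect is that a slice carrying only `extra_fields` (for example a download URL) no longer evaluates to `False` for callers that treat an empty slice as "no slice".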
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: airbyte-cdk
3
- Version: 6.17.1.dev0
3
+ Version: 6.18.0
4
4
  Summary: A framework for writing Airbyte Connectors.
5
5
  License: MIT
6
6
  Keywords: airbyte,connector-development-kit,cdk
@@ -62,11 +62,11 @@ airbyte_cdk/sources/declarative/checks/check_stream.py,sha256=dAA-UhmMj0WLXCkRQr
62
62
  airbyte_cdk/sources/declarative/checks/connection_checker.py,sha256=MBRJo6WJlZQHpIfOGaNOkkHUmgUl_4wDM6VPo41z5Ss,1383
63
63
  airbyte_cdk/sources/declarative/concurrency_level/__init__.py,sha256=5XUqrmlstYlMM0j6crktlKQwALek0uiz2D3WdM46MyA,191
64
64
  airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py,sha256=YIwCTCpOr_QSNW4ltQK0yUGWInI8PKNY216HOOegYLk,2101
65
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py,sha256=VfDvff6ionjGScMbEpMGlZ0TfOyIQpMUZiuV6pkI9Os,26557
65
+ airbyte_cdk/sources/declarative/concurrent_declarative_source.py,sha256=tSTCSmyMCu1qoGsne1Ooz3c1da-8EDZk6Suiy2gIq9Q,22475
66
66
  airbyte_cdk/sources/declarative/datetime/__init__.py,sha256=l9LG7Qm6e5r_qgqfVKnx3mXYtg1I9MmMjomVIPfU4XA,177
67
67
  airbyte_cdk/sources/declarative/datetime/datetime_parser.py,sha256=SX9JjdesN1edN2WVUVMzU_ptqp2QB1OnsnjZ4mwcX7w,2579
68
68
  airbyte_cdk/sources/declarative/datetime/min_max_datetime.py,sha256=0BHBtDNQZfvwM45-tY5pNlTcKAFSGGNxemoi0Jic-0E,5785
69
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml,sha256=GfZlk9EvYQiWDx3AipNLf1us1e986q2mgqcbHbeZU0k,133172
69
+ airbyte_cdk/sources/declarative/declarative_component_schema.yaml,sha256=2t3_QVXWOImPcH-apR_Xd8qNl6K_URFwBbQ47YHcjXg,133490
70
70
  airbyte_cdk/sources/declarative/declarative_source.py,sha256=nF7wBqFd3AQmEKAm4CnIo29CJoQL562cJGSCeL8U8bA,1531
71
71
  airbyte_cdk/sources/declarative/declarative_stream.py,sha256=JRyNeOIpsFu4ztVZsN6sncqUEIqIE-bUkD2TPgbMgk0,10375
72
72
  airbyte_cdk/sources/declarative/decoders/__init__.py,sha256=edGj4fGxznBk4xzRQyCA1rGfbpqe7z-RE0K3kQQWbgA,858
@@ -81,16 +81,15 @@ airbyte_cdk/sources/declarative/extractors/__init__.py,sha256=RmV-IkO1YLj0PSOrrq
81
81
  airbyte_cdk/sources/declarative/extractors/dpath_extractor.py,sha256=wR4Ol4MG2lt5UlqXF5EU_k7qa5cN4_-luu3PJ1PlO3A,3131
82
82
  airbyte_cdk/sources/declarative/extractors/http_selector.py,sha256=2zWZ4ewTqQC8VwkjS0xD_u350Km3SiYP7hpOOgiLg5o,1169
83
83
  airbyte_cdk/sources/declarative/extractors/record_extractor.py,sha256=XJELMjahAsaomlvQgN2zrNO0DJX0G0fr9r682gUz7Pg,691
84
- airbyte_cdk/sources/declarative/extractors/record_filter.py,sha256=yTdEkyDUSW2KbFkEwJJMlS963C955LgCCOVfTmmScpQ,3367
84
+ airbyte_cdk/sources/declarative/extractors/record_filter.py,sha256=OJ9xmhNWNwwzxYOeIrDy1GINb1zH9MBy6suC5tm2LSk,3545
85
85
  airbyte_cdk/sources/declarative/extractors/record_selector.py,sha256=tjNwcURmlyD-TGCScXvW95ThNKyPGcx2SiWbG1-H-sc,6552
86
86
  airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py,sha256=LhqGDfX06_dDYLKsIVnwQ_nAWCln-v8PV7Wgt_QVeTI,6533
87
87
  airbyte_cdk/sources/declarative/extractors/type_transformer.py,sha256=d6Y2Rfg8pMVEEnHllfVksWZdNVOU55yk34O03dP9muY,1626
88
- airbyte_cdk/sources/declarative/incremental/__init__.py,sha256=U1oZKtBaEC6IACmvziY9Wzg7Z8EgF4ZuR7NwvjlB_Sk,1255
89
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py,sha256=1qloF1gfV5nsOqkOxDfviwyckPUq1ur6sglvhIt6AeQ,15344
88
+ airbyte_cdk/sources/declarative/incremental/__init__.py,sha256=huRz3KQJSUFmJCg5GPE9TckEBsB5TMsCa_THhJAhPVI,1037
90
89
  airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py,sha256=_UzUnSIUsDbRgbFTXgSyZEFb4ws-KdhdQPWO8mFbV7U,22028
91
90
  airbyte_cdk/sources/declarative/incremental/declarative_cursor.py,sha256=5Bhw9VRPyIuCaD0wmmq_L3DZsa-rJgtKSEUzSd8YYD0,536
92
91
  airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py,sha256=3_EEZop94bMitZaJd2PF5Q2Xt9v94tYg7p7YJz8tAFc,15869
93
- airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py,sha256=LpBLehdJ0ffinyFadt2ZwhwYQ4Pu3yqyQrIAlOTNbvg,16265
92
+ airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py,sha256=hElcYijbOHjdLKOMA7W7aizEbf22r7OSApXALP875uI,15749
94
93
  airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py,sha256=2YBOA2NnwAeIKlIhSwUB_W-FaGnPcmrG_liY7b4mV2Y,8365
95
94
  airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py,sha256=10LFv1QPM-agVKl6eaANmEBOfd7gZgBrkoTcMggsieQ,4809
96
95
  airbyte_cdk/sources/declarative/interpolation/__init__.py,sha256=tjUJkn3B-iZ-p7RP2c3dVZejrGiQeooGmS5ibWTuUL4,437
@@ -107,12 +106,12 @@ airbyte_cdk/sources/declarative/migrations/__init__.py,sha256=47DEQpj8HBSa-_TImW
107
106
  airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py,sha256=iemy3fKLczcU0-Aor7tx5jcT6DRedKMqyK7kCOp01hg,3924
108
107
  airbyte_cdk/sources/declarative/migrations/state_migration.py,sha256=KWPjealMLKSMtajXgkdGgKg7EmTLR-CqqD7UIh0-eDU,794
109
108
  airbyte_cdk/sources/declarative/models/__init__.py,sha256=nUFxNCiKeYRVXuZEKA7GD-lTHxsiKcQ8FitZjKhPIvE,100
110
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py,sha256=IZFT1m4d-zp5hQ0ayU06Vdxm6r3MEq-X2sOCo9SuG-k,93270
109
+ airbyte_cdk/sources/declarative/models/declarative_component_schema.py,sha256=3xWpeDNDGOw_I2pQ1LDiUhNBEWEvNAtd-HCi_1aklSQ,93666
111
110
  airbyte_cdk/sources/declarative/parsers/__init__.py,sha256=ZnqYNxHsKCgO38IwB34RQyRMXTs4GTvlRi3ImKnIioo,61
112
111
  airbyte_cdk/sources/declarative/parsers/custom_exceptions.py,sha256=Rir9_z3Kcd5Es0-LChrzk-0qubAsiK_RSEnLmK2OXm8,553
113
112
  airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py,sha256=CXwTfD3wSQq3okcqwigpprbHhSURUokh4GK2OmOyKC8,9132
114
113
  airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py,sha256=IWUOdF03o-aQn0Occo1BJCxU0Pz-QILk5L67nzw2thw,6803
115
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py,sha256=lgFqJ8DP-cRizmvFKRd4Oy_ebgoT_AceMKIpuqoFm3c,112097
114
+ airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py,sha256=dpRWxZaPghPcE5vGkI4swKDaXyLWLMAbvDoazuNSobU,109709
116
115
  airbyte_cdk/sources/declarative/partition_routers/__init__.py,sha256=HJ-Syp3p7RpyR_OK0X_a2kSyISfu3W-PKrRI16iY0a8,957
117
116
  airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py,sha256=n82J15S8bjeMZ5uROu--P3hnbQoxkY5v7RPHYx7g7ro,2929
118
117
  airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py,sha256=c5cuVFM6NFkuQqG8Z5IwkBuwDrvXZN1CunUOM_L0ezg,6892
@@ -120,6 +119,7 @@ airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py,sha25
120
119
  airbyte_cdk/sources/declarative/partition_routers/partition_router.py,sha256=YyEIzdmLd1FjbVP3QbQ2VFCLW_P-OGbVh6VpZShp54k,2218
121
120
  airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py,sha256=SKzKjSyfccq4dxGIh-J6ejrgkCHzaiTIazmbmeQiRD4,1942
122
121
  airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py,sha256=5bgXoJfBg_6i53krQMptAGb50XB5XoVfqQxKQhlLtBA,15383
122
+ airbyte_cdk/sources/declarative/requesters/README.md,sha256=WabtHlwHg_J34aL1Kwm8vboYqBaSgsFjq10qR-P2sx8,2658
123
123
  airbyte_cdk/sources/declarative/requesters/__init__.py,sha256=d7a3OoHbqaJDyyPli3nqqJ2yAW_SLX6XDaBAKOwvpxw,364
124
124
  airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py,sha256=SkEDcJxlT1683rNx93K9whoS0OyUukkuOfToGtgpF58,776
125
125
  airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py,sha256=1WZdpFmWL6W_Dko0qjflTaKIWeqt8jHT-D6HcujIp3s,884
@@ -134,7 +134,7 @@ airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.
134
134
  airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py,sha256=q0YkeYUUWO6iErUy0vjqiOkhg8_9d5YcCmtlpXAJJ9E,1314
135
135
  airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py,sha256=Tan66odx8VHzfdyyXMQkXz2pJYksllGqvxmpoajgcK4,669
136
136
  airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py,sha256=vhWsEKNTYEzZ4gerhHqnDNKu4wGIP485NAzpSQ5DRZg,7941
137
- airbyte_cdk/sources/declarative/requesters/http_job_repository.py,sha256=o0520AmHMb7SAoeokVNwoOzuZzIAT6ryx9uFYGSOrs0,8664
137
+ airbyte_cdk/sources/declarative/requesters/http_job_repository.py,sha256=3GtOefPH08evlSUxaILkiKLTHbIspFY4qd5B3ZqNE60,10063
138
138
  airbyte_cdk/sources/declarative/requesters/http_requester.py,sha256=RqYPkgJFAWfcZBTc-JBcGHPm4JL1ZQOhs9GKU4MP2eE,14723
139
139
  airbyte_cdk/sources/declarative/requesters/paginators/__init__.py,sha256=uArbKs9JKNCt7t9tZoeWwjDpyI1HoPp29FNW0JzvaEM,644
140
140
  airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py,sha256=FnSl3qPvv5wD6ieAI2Ic5c4dqBk-3fRe4tCaWzq3YwM,11840
@@ -163,7 +163,7 @@ airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py,sha256=Aio
163
163
  airbyte_cdk/sources/declarative/retrievers/__init__.py,sha256=ix9m1dkR69DcXCXUKC5RK_ZZM7ojTLBQ4IkWQTfmfCk,456
164
164
  airbyte_cdk/sources/declarative/retrievers/async_retriever.py,sha256=kX9ltelK2xLIBWDJBK2ucrvVe5tc5xmhdbVbgsjvlxY,3696
165
165
  airbyte_cdk/sources/declarative/retrievers/retriever.py,sha256=XPLs593Xv8c5cKMc37XzUAYmzlXd1a7eSsspM-CMuWA,1696
166
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py,sha256=kgnhVQxRlFqJs2-rDu2-QH-p-GzQU3nKmSp6_aq8u0s,24550
166
+ airbyte_cdk/sources/declarative/retrievers/simple_retriever.py,sha256=jxQ_9xcVD07r9PKhofitAqMkdX1k8ZNyy50qz5NwkFs,24540
167
167
  airbyte_cdk/sources/declarative/schema/__init__.py,sha256=HztgVVaZdil5UfgUZcv_Hyy84r89_EKRwyO2hoewNVg,749
168
168
  airbyte_cdk/sources/declarative/schema/default_schema_loader.py,sha256=KTACrIE23a83wsm3Rd9Eb4K6-20lrGqYxTHNp9yxsso,1820
169
169
  airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py,sha256=H6A3NQ6kPPM-cUNPmdvDPc9xNzR1rQNrK95GbgCW334,8822
@@ -293,7 +293,7 @@ airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py,sha256=Y
293
293
  airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py,sha256=ka-bBRWvIv09LmZNYl49p2lK9nd_Tvi2g0lIp3OkU40,14872
294
294
  airbyte_cdk/sources/streams/http/requests_native_auth/token.py,sha256=h5PTzcdH-RQLeCg7xZ45w_484OPUDSwNWl_iMJQmZoI,2526
295
295
  airbyte_cdk/sources/streams/utils/__init__.py,sha256=4Hw-PX1-VgESLF16cDdvuYCzGJtHntThLF4qIiULWeo,61
296
- airbyte_cdk/sources/types.py,sha256=WWVigI7ZSoQU2TBCzDsHJtoX4Ima9v--lcLyYwUG_cE,4904
296
+ airbyte_cdk/sources/types.py,sha256=nLPkTpyfGV4E6e99qcBWX4r8C3fE4I8Fvgx2EjvT9ic,5005
297
297
  airbyte_cdk/sources/utils/__init__.py,sha256=TTN6VUxVy6Is8BhYQZR5pxJGQh8yH4duXh4O1TiMiEY,118
298
298
  airbyte_cdk/sources/utils/casing.py,sha256=QC-gV1O4e8DR4-bhdXieUPKm_JamzslVyfABLYYRSXA,256
299
299
  airbyte_cdk/sources/utils/record_helper.py,sha256=jeB0mucudzna7Zvj-pCBbwFrbLJ36SlAWZTh5O4Fb9Y,2168
@@ -343,8 +343,8 @@ airbyte_cdk/utils/slice_hasher.py,sha256=-pHexlNYoWYPnXNH-M7HEbjmeJe9Zk7SJijdQ7d
343
343
  airbyte_cdk/utils/spec_schema_transformations.py,sha256=-5HTuNsnDBAhj-oLeQXwpTGA0HdcjFOf2zTEMUTTg_Y,816
344
344
  airbyte_cdk/utils/stream_status_utils.py,sha256=ZmBoiy5HVbUEHAMrUONxZvxnvfV9CesmQJLDTAIWnWw,1171
345
345
  airbyte_cdk/utils/traced_exception.py,sha256=C8uIBuCL_E4WnBAOPSxBicD06JAldoN9fGsQDp463OY,6292
346
- airbyte_cdk-6.17.1.dev0.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
347
- airbyte_cdk-6.17.1.dev0.dist-info/METADATA,sha256=S54tbJ6Fg_dn3aXb91ITIaAxfs9cWd6ksAuuglUhA_w,6005
348
- airbyte_cdk-6.17.1.dev0.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
349
- airbyte_cdk-6.17.1.dev0.dist-info/entry_points.txt,sha256=fj-e3PAQvsxsQzyyq8UkG1k8spunWnD4BAH2AwlR6NM,95
350
- airbyte_cdk-6.17.1.dev0.dist-info/RECORD,,
346
+ airbyte_cdk-6.18.0.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
347
+ airbyte_cdk-6.18.0.dist-info/METADATA,sha256=RvVkgbg-LBbS5eGTntO-mp34yRIDMuPYZ26VRmSkhCA,6000
348
+ airbyte_cdk-6.18.0.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
349
+ airbyte_cdk-6.18.0.dist-info/entry_points.txt,sha256=fj-e3PAQvsxsQzyyq8UkG1k8spunWnD4BAH2AwlR6NM,95
350
+ airbyte_cdk-6.18.0.dist-info/RECORD,,
@@ -1,346 +0,0 @@
1
- import copy
2
- import logging
3
-
4
- #
5
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
6
- #
7
- import threading
8
- from collections import OrderedDict
9
- from copy import deepcopy
10
- from datetime import timedelta
11
- from typing import Any, Callable, Iterable, Mapping, MutableMapping, Optional
12
-
13
- from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
14
- from airbyte_cdk.sources.declarative.incremental.declarative_cursor import DeclarativeCursor
15
- from airbyte_cdk.sources.declarative.incremental.global_substream_cursor import (
16
- Timer,
17
- iterate_with_last_flag_and_state,
18
- )
19
- from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter
20
- from airbyte_cdk.sources.message import MessageRepository
21
- from airbyte_cdk.sources.streams.checkpoint.per_partition_key_serializer import (
22
- PerPartitionKeySerializer,
23
- )
24
- from airbyte_cdk.sources.streams.concurrent.cursor import Cursor, CursorField
25
- from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
26
- from airbyte_cdk.sources.types import Record, StreamSlice, StreamState
27
-
28
- logger = logging.getLogger("airbyte")
29
-
30
-
31
- class ConcurrentCursorFactory:
32
- def __init__(self, create_function: Callable[..., Cursor]):
33
- self._create_function = create_function
34
-
35
- def create(self, stream_state: Mapping[str, Any], runtime_lookback_window: Any) -> Cursor:
36
- return self._create_function(
37
- stream_state=stream_state, runtime_lookback_window=runtime_lookback_window
38
- )
39
-
40
-
41
- class ConcurrentPerPartitionCursor(Cursor):
42
- """
43
- Manages state per partition when a stream has many partitions, to prevent data loss or duplication.
44
-
45
- **Partition Limitation and Limit Reached Logic**
46
-
47
- - **DEFAULT_MAX_PARTITIONS_NUMBER**: The maximum number of partitions to keep in memory (default is 10,000).
48
- - **_cursor_per_partition**: An ordered dictionary that stores cursors for each partition.
49
- - **_over_limit**: A counter that increments each time an oldest partition is removed when the limit is exceeded.
50
-
51
- The class ensures that the number of partitions tracked does not exceed the `DEFAULT_MAX_PARTITIONS_NUMBER` to prevent excessive memory usage.
52
-
53
- - When the number of partitions exceeds the limit, the oldest partitions are removed from `_cursor_per_partition`, and `_over_limit` is incremented accordingly.
54
- - The `limit_reached` method returns `True` when `_over_limit` exceeds `DEFAULT_MAX_PARTITIONS_NUMBER`, indicating that the global cursor should be used instead of per-partition cursors.
55
-
56
- This approach avoids unnecessary switching to a global cursor due to temporary spikes in partition counts, ensuring that switching is only done when a sustained high number of partitions is observed.
57
- """
58
-
59
- DEFAULT_MAX_PARTITIONS_NUMBER = 10000
60
- _NO_STATE: Mapping[str, Any] = {}
61
- _NO_CURSOR_STATE: Mapping[str, Any] = {}
62
- _KEY = 0
63
- _VALUE = 1
64
-
65
- def __init__(
66
- self,
67
- cursor_factory: ConcurrentCursorFactory,
68
- partition_router: PartitionRouter,
69
- stream_name: str,
70
- stream_namespace: Optional[str],
71
- stream_state: Any,
72
- message_repository: MessageRepository,
73
- connector_state_manager: ConnectorStateManager,
74
- cursor_field: CursorField,
75
- ) -> None:
76
- self._global_cursor: Mapping[str, Any] = {}
77
- self._stream_name = stream_name
78
- self._stream_namespace = stream_namespace
79
- self._message_repository = message_repository
80
- self._connector_state_manager = connector_state_manager
81
- self._cursor_field = cursor_field
82
-
83
- self._cursor_factory = cursor_factory
84
- self._partition_router = partition_router
85
-
86
- # The dict is ordered to ensure that once the maximum number of partitions is reached,
87
- # the oldest partitions can be efficiently removed, maintaining the most recent partitions.
88
- self._cursor_per_partition: OrderedDict[str, Cursor] = OrderedDict()
89
- self._state = {"states": []}
90
- self._semaphore_per_partition = OrderedDict()
91
- self._finished_partitions = set()
92
- self._lock = threading.Lock()
93
- self._timer = Timer()
94
- self._new_global_cursor = None
95
- self._lookback_window = 0
96
- self._parent_state = None
97
- self._over_limit = 0
98
- self._partition_serializer = PerPartitionKeySerializer()
99
-
100
- self._set_initial_state(stream_state)
101
-
102
- @property
103
- def cursor_field(self) -> CursorField:
104
- return self._cursor_field
105
-
106
- @property
107
- def state(self) -> MutableMapping[str, Any]:
108
- states = []
109
- for partition_tuple, cursor in self._cursor_per_partition.items():
110
- cursor_state = cursor._connector_state_converter.convert_to_state_message(
111
- self.cursor_field, cursor.state
112
- )
113
- if cursor_state:
114
- states.append(
115
- {
116
- "partition": self._to_dict(partition_tuple),
117
- "cursor": copy.deepcopy(cursor_state),
118
- }
119
- )
120
- state: dict[str, Any] = {"states": states}
121
-
122
- if self._global_cursor:
123
- state["state"] = self._global_cursor
124
- if self._lookback_window is not None:
125
- state["lookback_window"] = self._lookback_window
126
- if self._parent_state is not None:
127
- state["parent_state"] = self._parent_state
128
- return state
129
-
130
- def close_partition(self, partition: Partition) -> None:
131
- self._cursor_per_partition[
132
- self._to_partition_key(partition._stream_slice.partition)
133
- ].close_partition(partition=partition)
134
- with self._lock:
135
- self._semaphore_per_partition[
136
- self._to_partition_key(partition._stream_slice.partition)
137
- ].acquire()
138
- cursor = self._cursor_per_partition[
139
- self._to_partition_key(partition._stream_slice.partition)
140
- ]
141
- cursor_state = cursor._connector_state_converter.convert_to_state_message(
142
- cursor._cursor_field, cursor.state
143
- )
144
- if (
145
- self._to_partition_key(partition._stream_slice.partition)
146
- in self._finished_partitions
147
- and self._semaphore_per_partition[
148
- self._to_partition_key(partition._stream_slice.partition)
149
- ]._value
150
- == 0
151
- ):
152
- if (
153
- self._new_global_cursor is None
154
- or self._new_global_cursor[self.cursor_field.cursor_field_key]
155
- < cursor_state[self.cursor_field.cursor_field_key]
156
- ):
157
- self._new_global_cursor = copy.deepcopy(cursor_state)
158
-
159
- def ensure_at_least_one_state_emitted(self) -> None:
160
- """
161
- The platform expect to have at least one state message on successful syncs. Hence, whatever happens, we expect this method to be
162
- called.
163
- """
164
- if not any(
165
- semaphore_item[1]._value for semaphore_item in self._semaphore_per_partition.items()
166
- ):
167
- self._global_cursor = self._new_global_cursor
168
- self._lookback_window = self._timer.finish()
169
- self._parent_state = self._partition_router.get_stream_state()
170
- self._emit_state_message()
171
-
172
- def _emit_state_message(self) -> None:
173
- self._connector_state_manager.update_state_for_stream(
174
- self._stream_name,
175
- self._stream_namespace,
176
- self.state,
177
- )
178
- state_message = self._connector_state_manager.create_state_message(
179
- self._stream_name, self._stream_namespace
180
- )
181
- self._message_repository.emit_message(state_message)
182
-
183
- def stream_slices(self) -> Iterable[StreamSlice]:
184
- slices = self._partition_router.stream_slices()
185
- self._timer.start()
186
- for partition in slices:
187
- yield from self.generate_slices_from_partition(partition)
188
-
189
- def generate_slices_from_partition(self, partition: StreamSlice) -> Iterable[StreamSlice]:
190
- # Ensure the maximum number of partitions is not exceeded
191
- self._ensure_partition_limit()
192
-
193
- cursor = self._cursor_per_partition.get(self._to_partition_key(partition.partition))
194
- if not cursor:
195
- partition_state = self._global_cursor if self._global_cursor else self._NO_CURSOR_STATE
196
- cursor = self._create_cursor(partition_state)
197
- self._cursor_per_partition[self._to_partition_key(partition.partition)] = cursor
198
- self._semaphore_per_partition[self._to_partition_key(partition.partition)] = (
199
- threading.Semaphore(0)
200
- )
201
-
202
- for cursor_slice, is_last_slice, _ in iterate_with_last_flag_and_state(
203
- cursor.stream_slices(),
204
- lambda: None,
205
- ):
206
- self._semaphore_per_partition[self._to_partition_key(partition.partition)].release()
207
- if is_last_slice:
208
- self._finished_partitions.add(self._to_partition_key(partition.partition))
209
- yield StreamSlice(
210
- partition=partition, cursor_slice=cursor_slice, extra_fields=partition.extra_fields
211
- )
212
-
213
- def _ensure_partition_limit(self) -> None:
214
- """
215
- Ensure the maximum number of partitions is not exceeded. If so, the oldest added partition will be dropped.
216
- """
217
- while len(self._cursor_per_partition) > self.DEFAULT_MAX_PARTITIONS_NUMBER - 1:
218
- self._over_limit += 1
219
- oldest_partition = self._cursor_per_partition.popitem(last=False)[
220
- 0
221
- ] # Remove the oldest partition
222
- logger.warning(
223
- f"The maximum number of partitions has been reached. Dropping the oldest partition: {oldest_partition}. Over limit: {self._over_limit}."
224
- )
225
-
226
- def limit_reached(self) -> bool:
227
- return self._over_limit > self.DEFAULT_MAX_PARTITIONS_NUMBER
228
-
229
- def _set_initial_state(self, stream_state: StreamState) -> None:
230
- """
231
- Set the initial state for the cursors.
232
-
233
- This method initializes the state for each partition cursor using the provided stream state.
234
- If a partition state is provided in the stream state, it will update the corresponding partition cursor with this state.
235
-
236
- Additionally, it sets the parent state for partition routers that are based on parent streams. If a partition router
237
- does not have parent streams, this step will be skipped due to the default PartitionRouter implementation.
238
-
239
- Args:
240
- stream_state (StreamState): The state of the streams to be set. The format of the stream state should be:
241
- {
242
- "states": [
243
- {
244
- "partition": {
245
- "partition_key": "value"
246
- },
247
- "cursor": {
248
- "last_updated": "2023-05-27T00:00:00Z"
249
- }
250
- }
251
- ],
252
- "parent_state": {
253
- "parent_stream_name": {
254
- "last_updated": "2023-05-27T00:00:00Z"
255
- }
256
- }
257
- }
258
- """
259
- if not stream_state:
260
- return
261
-
262
- if "states" not in stream_state:
263
- # We assume that `stream_state` is in a global format that can be applied to all partitions.
264
- # Example: {"global_state_format_key": "global_state_format_value"}
265
- self._global_cursor = deepcopy(stream_state)
266
- self._new_global_cursor = deepcopy(stream_state)
267
-
268
- else:
269
- self._lookback_window = stream_state.get("lookback_window")
270
-
271
- for state in stream_state["states"]:
272
- self._cursor_per_partition[self._to_partition_key(state["partition"])] = (
273
- self._create_cursor(
274
- state["cursor"], runtime_lookback_window=self._lookback_window
275
- )
276
- )
277
- self._semaphore_per_partition[self._to_partition_key(state["partition"])] = (
278
- threading.Semaphore(0)
279
- )
280
-
281
- # set default state for missing partitions if it is per partition with fallback to global
282
- if "state" in stream_state:
283
- self._global_cursor = deepcopy(stream_state["state"])
284
- self._new_global_cursor = deepcopy(stream_state["state"])
285
-
286
- # Set parent state for partition routers based on parent streams
287
- self._partition_router.set_initial_state(stream_state)
288
-
289
- def observe(self, record: Record) -> None:
290
- self._cursor_per_partition[
291
- self._to_partition_key(record.associated_slice.partition)
292
- ].observe(record)
293
-
294
- def _to_partition_key(self, partition: Mapping[str, Any]) -> str:
295
- return self._partition_serializer.to_partition_key(partition)
296
-
297
- def _to_dict(self, partition_key: str) -> Mapping[str, Any]:
298
- return self._partition_serializer.to_partition(partition_key)
299
-
300
- def _create_cursor(self, cursor_state: Any, runtime_lookback_window: Any = None) -> Cursor:
301
- if runtime_lookback_window:
302
- runtime_lookback_window = timedelta(seconds=runtime_lookback_window)
303
- cursor = self._cursor_factory.create(
304
- stream_state=deepcopy(cursor_state), runtime_lookback_window=runtime_lookback_window
305
- )
306
- return cursor
307
-
308
- def should_be_synced(self, record: Record) -> bool:
309
- return self._get_cursor(record).should_be_synced(record)
310
-
311
- def is_greater_than_or_equal(self, first: Record, second: Record) -> bool:
312
- if not first.associated_slice or not second.associated_slice:
313
- raise ValueError(
314
- f"Both records should have an associated slice but got {first.associated_slice} and {second.associated_slice}"
315
- )
316
- if first.associated_slice.partition != second.associated_slice.partition:
317
- raise ValueError(
318
- f"To compare records, partition should be the same but got {first.associated_slice.partition} and {second.associated_slice.partition}"
319
- )
320
-
321
- return self._get_cursor(first).is_greater_than_or_equal(
322
- self._convert_record_to_cursor_record(first),
323
- self._convert_record_to_cursor_record(second),
324
- )
325
-
326
- @staticmethod
327
- def _convert_record_to_cursor_record(record: Record) -> Record:
328
- return Record(
329
- record.data,
330
- StreamSlice(partition={}, cursor_slice=record.associated_slice.cursor_slice)
331
- if record.associated_slice
332
- else None,
333
- )
334
-
335
- def _get_cursor(self, record: Record) -> Cursor:
336
- if not record.associated_slice:
337
- raise ValueError(
338
- "Invalid state as stream slices that are emitted should refer to an existing cursor"
339
- )
340
- partition_key = self._to_partition_key(record.associated_slice.partition)
341
- if partition_key not in self._cursor_per_partition:
342
- raise ValueError(
343
- "Invalid state as stream slices that are emitted should refer to an existing cursor"
344
- )
345
- cursor = self._cursor_per_partition[partition_key]
346
- return cursor