airbyte-cdk 6.61.3.post2.dev17299502224__py3-none-any.whl → 6.62.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24) hide show
  1. airbyte_cdk/legacy/sources/declarative/incremental/per_partition_cursor.py +4 -2
  2. airbyte_cdk/manifest_server/README.md +17 -3
  3. airbyte_cdk/manifest_server/openapi.yaml +27 -27
  4. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +2 -2
  5. airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +57 -7
  6. airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +4 -2
  7. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +196 -269
  8. airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +4 -7
  9. airbyte_cdk/sources/declarative/partition_routers/grouping_partition_router.py +0 -5
  10. airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +0 -6
  11. airbyte_cdk/sources/declarative/partition_routers/partition_router.py +1 -23
  12. airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +0 -6
  13. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +88 -107
  14. airbyte_cdk/sources/declarative/requesters/request_options/per_partition_request_option_provider.py +95 -0
  15. airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +4 -1
  16. airbyte_cdk/sources/declarative/retrievers/retriever.py +5 -0
  17. airbyte_cdk/sources/file_based/file_types/excel_parser.py +3 -3
  18. airbyte_cdk/sources/message/repository.py +20 -0
  19. {airbyte_cdk-6.61.3.post2.dev17299502224.dist-info → airbyte_cdk-6.62.0.dev0.dist-info}/METADATA +6 -5
  20. {airbyte_cdk-6.61.3.post2.dev17299502224.dist-info → airbyte_cdk-6.62.0.dev0.dist-info}/RECORD +24 -23
  21. {airbyte_cdk-6.61.3.post2.dev17299502224.dist-info → airbyte_cdk-6.62.0.dev0.dist-info}/LICENSE.txt +0 -0
  22. {airbyte_cdk-6.61.3.post2.dev17299502224.dist-info → airbyte_cdk-6.62.0.dev0.dist-info}/LICENSE_SHORT +0 -0
  23. {airbyte_cdk-6.61.3.post2.dev17299502224.dist-info → airbyte_cdk-6.62.0.dev0.dist-info}/WHEEL +0 -0
  24. {airbyte_cdk-6.61.3.post2.dev17299502224.dist-info → airbyte_cdk-6.62.0.dev0.dist-info}/entry_points.txt +0 -0
@@ -33,7 +33,15 @@ from requests import Response
33
33
  from airbyte_cdk.connector_builder.models import (
34
34
  LogMessage as ConnectorBuilderLogMessage,
35
35
  )
36
- from airbyte_cdk.models import FailureType, Level
36
+ from airbyte_cdk.models import (
37
+ AirbyteStateBlob,
38
+ AirbyteStateMessage,
39
+ AirbyteStateType,
40
+ AirbyteStreamState,
41
+ FailureType,
42
+ Level,
43
+ StreamDescriptor,
44
+ )
37
45
  from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
38
46
  from airbyte_cdk.sources.declarative.async_job.job_orchestrator import AsyncJobOrchestrator
39
47
  from airbyte_cdk.sources.declarative.async_job.job_tracker import JobTracker
@@ -90,6 +98,7 @@ from airbyte_cdk.sources.declarative.extractors import (
90
98
  RecordSelector,
91
99
  ResponseToFileExtractor,
92
100
  )
101
+ from airbyte_cdk.sources.declarative.extractors.record_extractor import RecordExtractor
93
102
  from airbyte_cdk.sources.declarative.extractors.record_filter import (
94
103
  ClientSideIncrementalRecordFilterDecorator,
95
104
  )
@@ -98,7 +107,6 @@ from airbyte_cdk.sources.declarative.incremental import (
98
107
  ConcurrentPerPartitionCursor,
99
108
  CursorFactory,
100
109
  DatetimeBasedCursor,
101
- DeclarativeCursor,
102
110
  GlobalSubstreamCursor,
103
111
  PerPartitionWithGlobalCursor,
104
112
  )
@@ -500,8 +508,11 @@ from airbyte_cdk.sources.declarative.requesters.request_options import (
500
508
  InterpolatedRequestOptionsProvider,
501
509
  RequestOptionsProvider,
502
510
  )
511
+ from airbyte_cdk.sources.declarative.requesters.request_options.per_partition_request_option_provider import (
512
+ PerPartitionRequestOptionsProvider,
513
+ )
503
514
  from airbyte_cdk.sources.declarative.requesters.request_path import RequestPath
504
- from airbyte_cdk.sources.declarative.requesters.requester import HttpMethod
515
+ from airbyte_cdk.sources.declarative.requesters.requester import HttpMethod, Requester
505
516
  from airbyte_cdk.sources.declarative.resolvers import (
506
517
  ComponentMappingDefinition,
507
518
  ConfigComponentsResolver,
@@ -583,6 +594,7 @@ from airbyte_cdk.sources.message import (
583
594
  MessageRepository,
584
595
  NoopMessageRepository,
585
596
  )
597
+ from airbyte_cdk.sources.message.repository import StateFilteringMessageRepository
586
598
  from airbyte_cdk.sources.streams.call_rate import (
587
599
  APIBudget,
588
600
  FixedWindowCallRatePolicy,
@@ -630,6 +642,7 @@ SCHEMA_TRANSFORMER_TYPE_MAPPING = {
630
642
  SchemaNormalizationModel.None_: TransformConfig.NoTransform,
631
643
  SchemaNormalizationModel.Default: TransformConfig.DefaultSchemaNormalization,
632
644
  }
645
+ _NO_STREAM_SLICING = SinglePartitionRouter(parameters={})
633
646
 
634
647
  # Ideally this should use the value defined in ConcurrentDeclarativeSource, but
635
648
  # this would be a circular import
@@ -702,7 +715,7 @@ class ModelToComponentFactory:
702
715
  CustomValidationStrategyModel: self.create_custom_component,
703
716
  CustomConfigTransformationModel: self.create_custom_component,
704
717
  DatetimeBasedCursorModel: self.create_datetime_based_cursor,
705
- DeclarativeStreamModel: self.create_declarative_stream,
718
+ DeclarativeStreamModel: self.create_default_stream,
706
719
  DefaultErrorHandlerModel: self.create_default_error_handler,
707
720
  DefaultPaginatorModel: self.create_default_paginator,
708
721
  DpathExtractorModel: self.create_dpath_extractor,
@@ -1291,19 +1304,20 @@ class ModelToComponentFactory:
1291
1304
  f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}"
1292
1305
  )
1293
1306
 
1307
+ model_parameters = datetime_based_cursor_model.parameters or {}
1294
1308
  interpolated_cursor_field = InterpolatedString.create(
1295
1309
  datetime_based_cursor_model.cursor_field,
1296
- parameters=datetime_based_cursor_model.parameters or {},
1310
+ parameters=model_parameters,
1297
1311
  )
1298
1312
  cursor_field = CursorField(interpolated_cursor_field.eval(config=config))
1299
1313
 
1300
1314
  interpolated_partition_field_start = InterpolatedString.create(
1301
1315
  datetime_based_cursor_model.partition_field_start or "start_time",
1302
- parameters=datetime_based_cursor_model.parameters or {},
1316
+ parameters=model_parameters,
1303
1317
  )
1304
1318
  interpolated_partition_field_end = InterpolatedString.create(
1305
1319
  datetime_based_cursor_model.partition_field_end or "end_time",
1306
- parameters=datetime_based_cursor_model.parameters or {},
1320
+ parameters=model_parameters,
1307
1321
  )
1308
1322
 
1309
1323
  slice_boundary_fields = (
@@ -1323,7 +1337,7 @@ class ModelToComponentFactory:
1323
1337
  interpolated_lookback_window = (
1324
1338
  InterpolatedString.create(
1325
1339
  datetime_based_cursor_model.lookback_window,
1326
- parameters=datetime_based_cursor_model.parameters or {},
1340
+ parameters=model_parameters,
1327
1341
  )
1328
1342
  if datetime_based_cursor_model.lookback_window
1329
1343
  else None
@@ -1409,7 +1423,7 @@ class ModelToComponentFactory:
1409
1423
  interpolated_step = (
1410
1424
  InterpolatedString.create(
1411
1425
  datetime_based_cursor_model.step,
1412
- parameters=datetime_based_cursor_model.parameters or {},
1426
+ parameters=model_parameters,
1413
1427
  )
1414
1428
  if datetime_based_cursor_model.step
1415
1429
  else None
@@ -1426,7 +1440,7 @@ class ModelToComponentFactory:
1426
1440
  # object which we want to keep agnostic of being low-code
1427
1441
  target = InterpolatedString(
1428
1442
  string=datetime_based_cursor_model.clamping.target,
1429
- parameters=datetime_based_cursor_model.parameters or {},
1443
+ parameters=model_parameters,
1430
1444
  )
1431
1445
  evaluated_target = target.eval(config=config)
1432
1446
  match evaluated_target:
@@ -1603,6 +1617,10 @@ class ModelToComponentFactory:
1603
1617
 
1604
1618
  interpolated_cursor_field = InterpolatedString.create(
1605
1619
  datetime_based_cursor_model.cursor_field,
1620
+ # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases:
1621
+ # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters`
1622
+ # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters`
1623
+ # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory.
1606
1624
  parameters=datetime_based_cursor_model.parameters or {},
1607
1625
  )
1608
1626
  cursor_field = CursorField(interpolated_cursor_field.eval(config=config))
@@ -1634,7 +1652,7 @@ class ModelToComponentFactory:
1634
1652
  stream_namespace=stream_namespace,
1635
1653
  config=config,
1636
1654
  message_repository=NoopMessageRepository(),
1637
- stream_state_migrations=stream_state_migrations,
1655
+ # stream_state_migrations=stream_state_migrations, # FIXME is it expected to run migration on per partition state too?
1638
1656
  )
1639
1657
  )
1640
1658
 
@@ -1942,13 +1960,17 @@ class ModelToComponentFactory:
1942
1960
  parameters=model.parameters or {},
1943
1961
  )
1944
1962
 
1945
- def create_declarative_stream(
1963
+ def create_default_stream(
1946
1964
  self, model: DeclarativeStreamModel, config: Config, is_parent: bool = False, **kwargs: Any
1947
1965
  ) -> Union[DeclarativeStream, AbstractStream]:
1948
1966
  primary_key = model.primary_key.__root__ if model.primary_key else None
1949
1967
 
1968
+ partition_router = self._build_stream_slicer_from_partition_router(
1969
+ model.retriever, config, stream_name=model.name
1970
+ )
1971
+ concurrent_cursor = self._build_concurrent_cursor(model, partition_router, config)
1950
1972
  if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel):
1951
- cursor_model = model.incremental_sync
1973
+ cursor_model: DatetimeBasedCursorModel = model.incremental_sync
1952
1974
 
1953
1975
  end_time_option = (
1954
1976
  self._create_component_from_model(
@@ -1965,17 +1987,29 @@ class ModelToComponentFactory:
1965
1987
  else None
1966
1988
  )
1967
1989
 
1968
- request_options_provider = DatetimeBasedRequestOptionsProvider(
1990
+ datetime_request_options_provider = DatetimeBasedRequestOptionsProvider(
1969
1991
  start_time_option=start_time_option,
1970
1992
  end_time_option=end_time_option,
1971
- partition_field_start=cursor_model.partition_field_end,
1993
+ partition_field_start=cursor_model.partition_field_start,
1972
1994
  partition_field_end=cursor_model.partition_field_end,
1973
1995
  config=config,
1974
1996
  parameters=model.parameters or {},
1975
1997
  )
1998
+ request_options_provider = (
1999
+ datetime_request_options_provider
2000
+ if not isinstance(concurrent_cursor, ConcurrentPerPartitionCursor)
2001
+ else PerPartitionRequestOptionsProvider(
2002
+ partition_router, datetime_request_options_provider
2003
+ )
2004
+ )
1976
2005
  elif model.incremental_sync and isinstance(
1977
2006
  model.incremental_sync, IncrementingCountCursorModel
1978
2007
  ):
2008
+ if isinstance(concurrent_cursor, ConcurrentPerPartitionCursor):
2009
+ raise ValueError(
2010
+ "PerPartition does not support per partition states because switching to global state is time based"
2011
+ )
2012
+
1979
2013
  cursor_model: IncrementingCountCursorModel = model.incremental_sync # type: ignore
1980
2014
 
1981
2015
  start_time_option = (
@@ -2013,22 +2047,18 @@ class ModelToComponentFactory:
2013
2047
  model=model.file_uploader, config=config
2014
2048
  )
2015
2049
 
2016
- # When constructing a declarative stream, we assemble the incremental_sync component and retriever's partition_router field
2017
- # components if they exist into a single CartesianProductStreamSlicer. This is then passed back as an argument when constructing the
2018
- # Retriever. This is done in the declarative stream not the retriever to support custom retrievers. The custom create methods in
2019
- # the factory only support passing arguments to the component constructors, whereas this performs a merge of all slicers into one.
2020
- combined_slicers = self._merge_stream_slicers(model=model, config=config)
2021
- partition_router = self._build_stream_slicer_from_partition_router(
2022
- model.retriever, config, stream_name=model.name
2050
+ stream_slicer: ConcurrentStreamSlicer = (
2051
+ partition_router
2052
+ if isinstance(concurrent_cursor, FinalStateCursor)
2053
+ else concurrent_cursor
2023
2054
  )
2024
- concurrent_cursor = self._build_concurrent_cursor(model, partition_router, config)
2025
2055
  retriever = self._create_component_from_model(
2026
2056
  model=model.retriever,
2027
2057
  config=config,
2028
2058
  name=model.name,
2029
2059
  primary_key=primary_key,
2030
- stream_slicer=combined_slicers,
2031
2060
  request_options_provider=request_options_provider,
2061
+ stream_slicer=stream_slicer,
2032
2062
  stop_condition_cursor=concurrent_cursor
2033
2063
  if self._is_stop_condition_on_cursor(model)
2034
2064
  else None,
@@ -2039,6 +2069,8 @@ class ModelToComponentFactory:
2039
2069
  file_uploader=file_uploader,
2040
2070
  incremental_sync=model.incremental_sync,
2041
2071
  )
2072
+ if isinstance(retriever, AsyncRetriever):
2073
+ stream_slicer = retriever.stream_slicer
2042
2074
 
2043
2075
  schema_loader: Union[
2044
2076
  CompositeSchemaLoader,
@@ -2066,89 +2098,27 @@ class ModelToComponentFactory:
2066
2098
  options["name"] = model.name
2067
2099
  schema_loader = DefaultSchemaLoader(config=config, parameters=options)
2068
2100
 
2069
- if (
2070
- (
2071
- isinstance(combined_slicers, PartitionRouter)
2072
- or isinstance(concurrent_cursor, ConcurrentCursor)
2073
- )
2074
- and not self._emit_connector_builder_messages
2075
- and not is_parent
2076
- ):
2077
- # We are starting to migrate streams to instantiate directly the DefaultStream instead of instantiating the
2078
- # DeclarativeStream and assembling the DefaultStream from that. The plan is the following:
2079
- # * Streams without partition router nor cursors and streams with only partition router. This is the `isinstance(combined_slicers, PartitionRouter)` condition as the first kind with have a SinglePartitionRouter
2080
- # * Streams without partition router but with cursor. This is the `isinstance(concurrent_cursor, ConcurrentCursor)` condition
2081
- # * Streams with both partition router and cursor
2082
- # We specifically exclude parent streams here because SubstreamPartitionRouter has not been updated yet
2083
- # We specifically exclude Connector Builder stuff for now as Brian is working on this anyway
2084
-
2085
- stream_name = model.name or ""
2086
- stream_slicer: ConcurrentStreamSlicer = (
2087
- concurrent_cursor if concurrent_cursor else SinglePartitionRouter(parameters={})
2088
- )
2089
- cursor: Cursor = FinalStateCursor(stream_name, None, self._message_repository)
2090
- if isinstance(retriever, AsyncRetriever):
2091
- # The AsyncRetriever only ever worked with a cursor from the concurrent package. Hence, the method
2092
- # `_build_incremental_cursor` which we would usually think would return only declarative stuff has a
2093
- # special clause and return a concurrent cursor. This stream slicer is passed to AsyncRetriever when
2094
- # built because the async retriever has a specific partition router which relies on this stream slicer.
2095
- # We can't re-use `concurrent_cursor` because it is a different instance than the one passed in
2096
- # AsyncJobPartitionRouter.
2097
- stream_slicer = retriever.stream_slicer
2098
- if isinstance(combined_slicers, Cursor):
2099
- cursor = combined_slicers
2100
- elif isinstance(combined_slicers, PartitionRouter):
2101
- stream_slicer = combined_slicers
2102
- elif concurrent_cursor:
2103
- cursor = concurrent_cursor
2104
-
2105
- # FIXME to be removed once we migrate everything to DefaultStream
2106
- if isinstance(retriever, SimpleRetriever):
2107
- # We zero it out here, but since this is a cursor reference, the state is still properly
2108
- # instantiated for the other components that reference it
2109
- retriever.cursor = None
2110
-
2111
- partition_generator = StreamSlicerPartitionGenerator(
2101
+ stream_name = model.name or ""
2102
+ return DefaultStream(
2103
+ partition_generator=StreamSlicerPartitionGenerator(
2112
2104
  DeclarativePartitionFactory(
2113
2105
  stream_name,
2114
2106
  schema_loader,
2115
2107
  retriever,
2116
2108
  self._message_repository,
2117
2109
  ),
2118
- stream_slicer=stream_slicer,
2119
- )
2120
- return DefaultStream(
2121
- partition_generator=partition_generator,
2122
- name=stream_name,
2123
- json_schema=schema_loader.get_json_schema,
2124
- primary_key=get_primary_key_from_stream(primary_key),
2125
- cursor_field=cursor.cursor_field.cursor_field_key
2126
- if hasattr(cursor, "cursor_field")
2127
- else "", # FIXME we should have the cursor field has part of the interface of cursor,
2128
- logger=logging.getLogger(f"airbyte.{stream_name}"),
2129
- # FIXME this is a breaking change compared to the old implementation which used the source name instead
2130
- cursor=cursor,
2131
- supports_file_transfer=hasattr(model, "file_uploader")
2132
- and bool(model.file_uploader),
2133
- )
2134
-
2135
- cursor_field = model.incremental_sync.cursor_field if model.incremental_sync else None
2136
- if model.state_migrations:
2137
- state_transformations = [
2138
- self._create_component_from_model(state_migration, config, declarative_stream=model)
2139
- for state_migration in model.state_migrations
2140
- ]
2141
- else:
2142
- state_transformations = []
2143
- return DeclarativeStream(
2144
- name=model.name or "",
2145
- primary_key=primary_key,
2146
- retriever=retriever,
2147
- schema_loader=schema_loader,
2148
- stream_cursor_field=cursor_field or "",
2149
- state_migrations=state_transformations,
2150
- config=config,
2151
- parameters=model.parameters or {},
2110
+ stream_slicer,
2111
+ slice_limit=self._limit_slices_fetched,
2112
+ ),
2113
+ name=stream_name,
2114
+ json_schema=schema_loader.get_json_schema,
2115
+ primary_key=get_primary_key_from_stream(primary_key),
2116
+ cursor_field=concurrent_cursor.cursor_field.cursor_field_key
2117
+ if hasattr(concurrent_cursor, "cursor_field")
2118
+ else "", # FIXME we should have the cursor field has part of the interface of cursor,
2119
+ logger=logging.getLogger(f"airbyte.{stream_name}"),
2120
+ cursor=concurrent_cursor,
2121
+ supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader),
2152
2122
  )
2153
2123
 
2154
2124
  def _is_stop_condition_on_cursor(self, model: DeclarativeStreamModel) -> bool:
@@ -2197,86 +2167,15 @@ class ModelToComponentFactory:
2197
2167
  )
2198
2168
  return SinglePartitionRouter(parameters={})
2199
2169
 
2200
- def _build_incremental_cursor(
2201
- self,
2202
- model: DeclarativeStreamModel,
2203
- stream_slicer: Optional[PartitionRouter],
2204
- config: Config,
2205
- ) -> Optional[StreamSlicer]:
2206
- state_transformations = (
2207
- [
2208
- self._create_component_from_model(state_migration, config, declarative_stream=model)
2209
- for state_migration in model.state_migrations
2210
- ]
2211
- if model.state_migrations
2212
- else []
2213
- )
2214
-
2215
- if model.incremental_sync and (
2216
- stream_slicer and not isinstance(stream_slicer, SinglePartitionRouter)
2217
- ):
2218
- if model.retriever.type == "AsyncRetriever":
2219
- stream_name = model.name or ""
2220
- stream_namespace = None
2221
- stream_state = self._connector_state_manager.get_stream_state(
2222
- stream_name, stream_namespace
2223
- )
2224
-
2225
- return self.create_concurrent_cursor_from_perpartition_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing
2226
- state_manager=self._connector_state_manager,
2227
- model_type=DatetimeBasedCursorModel,
2228
- component_definition=model.incremental_sync.__dict__,
2229
- stream_name=stream_name,
2230
- stream_namespace=stream_namespace,
2231
- config=config or {},
2232
- stream_state=stream_state,
2233
- stream_state_migrations=state_transformations,
2234
- partition_router=stream_slicer,
2235
- )
2236
-
2237
- incremental_sync_model = model.incremental_sync
2238
- cursor_component = self._create_component_from_model(
2239
- model=incremental_sync_model, config=config
2240
- )
2241
- is_global_cursor = (
2242
- hasattr(incremental_sync_model, "global_substream_cursor")
2243
- and incremental_sync_model.global_substream_cursor
2244
- )
2245
-
2246
- if is_global_cursor:
2247
- return GlobalSubstreamCursor(
2248
- stream_cursor=cursor_component, partition_router=stream_slicer
2249
- )
2250
- return PerPartitionWithGlobalCursor(
2251
- cursor_factory=CursorFactory(
2252
- lambda: self._create_component_from_model(
2253
- model=incremental_sync_model, config=config
2254
- ),
2255
- ),
2256
- partition_router=stream_slicer,
2257
- stream_cursor=cursor_component,
2258
- )
2259
- elif model.incremental_sync:
2260
- if model.retriever.type == "AsyncRetriever":
2261
- return self.create_concurrent_cursor_from_datetime_based_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing
2262
- model_type=DatetimeBasedCursorModel,
2263
- component_definition=model.incremental_sync.__dict__,
2264
- stream_name=model.name or "",
2265
- stream_namespace=None,
2266
- config=config or {},
2267
- stream_state_migrations=state_transformations,
2268
- )
2269
- return self._create_component_from_model(model=model.incremental_sync, config=config) # type: ignore[no-any-return] # Will be created Cursor as stream_slicer_model is model.incremental_sync
2270
- return None
2271
-
2272
2170
  def _build_concurrent_cursor(
2273
2171
  self,
2274
2172
  model: DeclarativeStreamModel,
2275
2173
  stream_slicer: Optional[PartitionRouter],
2276
2174
  config: Config,
2277
- ) -> Optional[StreamSlicer]:
2175
+ ) -> Cursor:
2176
+ stream_name = model.name or ""
2278
2177
  stream_state = self._connector_state_manager.get_stream_state(
2279
- stream_name=model.name or "", namespace=None
2178
+ stream_name=stream_name, namespace=None
2280
2179
  )
2281
2180
 
2282
2181
  if model.state_migrations:
@@ -2296,20 +2195,20 @@ class ModelToComponentFactory:
2296
2195
  state_manager=self._connector_state_manager,
2297
2196
  model_type=DatetimeBasedCursorModel,
2298
2197
  component_definition=model.incremental_sync.__dict__,
2299
- stream_name=model.name or "",
2198
+ stream_name=stream_name,
2300
2199
  stream_namespace=None,
2301
2200
  config=config or {},
2302
2201
  stream_state=stream_state,
2303
2202
  stream_state_migrations=state_transformations,
2304
2203
  partition_router=stream_slicer,
2305
- attempt_to_create_cursor_if_not_provided=True,
2204
+ attempt_to_create_cursor_if_not_provided=True, # FIXME can we remove that now?
2306
2205
  )
2307
2206
  elif model.incremental_sync:
2308
2207
  if type(model.incremental_sync) == IncrementingCountCursorModel:
2309
2208
  return self.create_concurrent_cursor_from_incrementing_count_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing
2310
2209
  model_type=IncrementingCountCursorModel,
2311
2210
  component_definition=model.incremental_sync.__dict__,
2312
- stream_name=model.name or "",
2211
+ stream_name=stream_name,
2313
2212
  stream_namespace=None,
2314
2213
  config=config or {},
2315
2214
  stream_state_migrations=state_transformations,
@@ -2318,7 +2217,7 @@ class ModelToComponentFactory:
2318
2217
  return self.create_concurrent_cursor_from_datetime_based_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing
2319
2218
  model_type=type(model.incremental_sync),
2320
2219
  component_definition=model.incremental_sync.__dict__,
2321
- stream_name=model.name or "",
2220
+ stream_name=stream_name,
2322
2221
  stream_namespace=None,
2323
2222
  config=config or {},
2324
2223
  stream_state_migrations=state_transformations,
@@ -2328,45 +2227,7 @@ class ModelToComponentFactory:
2328
2227
  raise ValueError(
2329
2228
  f"Incremental sync of type {type(model.incremental_sync)} is not supported"
2330
2229
  )
2331
- return None
2332
-
2333
- def _merge_stream_slicers(
2334
- self, model: DeclarativeStreamModel, config: Config
2335
- ) -> Optional[StreamSlicer]:
2336
- retriever_model = model.retriever
2337
-
2338
- stream_slicer = self._build_stream_slicer_from_partition_router(
2339
- retriever_model, config, stream_name=model.name
2340
- )
2341
-
2342
- if retriever_model.type == "AsyncRetriever":
2343
- is_not_datetime_cursor = (
2344
- model.incremental_sync.type != "DatetimeBasedCursor"
2345
- if model.incremental_sync
2346
- else None
2347
- )
2348
- is_partition_router = (
2349
- bool(retriever_model.partition_router) if model.incremental_sync else None
2350
- )
2351
-
2352
- if is_not_datetime_cursor:
2353
- # We are currently in a transition to the Concurrent CDK and AsyncRetriever can only work with the
2354
- # support or unordered slices (for example, when we trigger reports for January and February, the report
2355
- # in February can be completed first). Once we have support for custom concurrent cursor or have a new
2356
- # implementation available in the CDK, we can enable more cursors here.
2357
- raise ValueError(
2358
- "AsyncRetriever with cursor other than DatetimeBasedCursor is not supported yet."
2359
- )
2360
-
2361
- if is_partition_router and not stream_slicer:
2362
- # Note that this development is also done in parallel to the per partition development which once merged
2363
- # we could support here by calling create_concurrent_cursor_from_perpartition_cursor
2364
- raise ValueError("Per partition state is not supported yet for AsyncRetriever.")
2365
-
2366
- if model.incremental_sync:
2367
- return self._build_incremental_cursor(model, stream_slicer, config)
2368
-
2369
- return stream_slicer
2230
+ return FinalStateCursor(stream_name, None, self._message_repository)
2370
2231
 
2371
2232
  def create_default_error_handler(
2372
2233
  self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any
@@ -3263,7 +3124,6 @@ class ModelToComponentFactory:
3263
3124
  *,
3264
3125
  name: str,
3265
3126
  primary_key: Optional[Union[str, List[str], List[List[str]]]],
3266
- stream_slicer: Optional[StreamSlicer],
3267
3127
  request_options_provider: Optional[RequestOptionsProvider] = None,
3268
3128
  stop_condition_cursor: Optional[Cursor] = None,
3269
3129
  client_side_incremental_sync: Optional[Dict[str, Any]] = None,
@@ -3278,7 +3138,7 @@ class ModelToComponentFactory:
3278
3138
  log_formatter: Optional[Callable[[Response], Any]] = None,
3279
3139
  **kwargs: Any,
3280
3140
  ) -> SimpleRetriever:
3281
- def _get_url() -> str:
3141
+ def _get_url(req: Requester) -> str:
3282
3142
  """
3283
3143
  Closure to get the URL from the requester. This is used to get the URL in the case of a lazy retriever.
3284
3144
  This is needed because the URL is not set until the requester is created.
@@ -3287,12 +3147,12 @@ class ModelToComponentFactory:
3287
3147
  _url: str = (
3288
3148
  model.requester.url
3289
3149
  if hasattr(model.requester, "url") and model.requester.url is not None
3290
- else requester.get_url()
3150
+ else req.get_url(stream_state=None, stream_slice=None, next_page_token=None)
3291
3151
  )
3292
3152
  _url_base: str = (
3293
3153
  model.requester.url_base
3294
3154
  if hasattr(model.requester, "url_base") and model.requester.url_base is not None
3295
- else requester.get_url_base()
3155
+ else req.get_url(stream_state=None, stream_slice=None, next_page_token=None)
3296
3156
  )
3297
3157
 
3298
3158
  return _url or _url_base
@@ -3371,36 +3231,14 @@ class ModelToComponentFactory:
3371
3231
  config=config,
3372
3232
  )
3373
3233
 
3374
- # Define cursor only if per partition or common incremental support is needed
3375
- cursor = stream_slicer if isinstance(stream_slicer, DeclarativeCursor) else None
3376
-
3377
- if (
3378
- not isinstance(stream_slicer, DatetimeBasedCursor)
3379
- or type(stream_slicer) is not DatetimeBasedCursor
3380
- ):
3381
- # Many of the custom component implementations of DatetimeBasedCursor override get_request_params() (or other methods).
3382
- # Because we're decoupling RequestOptionsProvider from the Cursor, custom components will eventually need to reimplement
3383
- # their own RequestOptionsProvider. However, right now the existing StreamSlicer/Cursor still can act as the SimpleRetriever's
3384
- # request_options_provider
3385
- request_options_provider = stream_slicer or DefaultRequestOptionsProvider(parameters={})
3386
- elif not request_options_provider:
3234
+ if not request_options_provider:
3387
3235
  request_options_provider = DefaultRequestOptionsProvider(parameters={})
3388
3236
 
3389
- stream_slicer = stream_slicer or SinglePartitionRouter(parameters={})
3390
- if self._should_limit_slices_fetched():
3391
- stream_slicer = cast(
3392
- StreamSlicer,
3393
- StreamSlicerTestReadDecorator(
3394
- wrapped_slicer=stream_slicer,
3395
- maximum_number_of_slices=self._limit_slices_fetched or 5,
3396
- ),
3397
- )
3398
-
3399
3237
  paginator = (
3400
3238
  self._create_component_from_model(
3401
3239
  model=model.paginator,
3402
3240
  config=config,
3403
- url_base=_get_url(),
3241
+ url_base=_get_url(requester),
3404
3242
  extractor_model=model.record_selector.extractor,
3405
3243
  decoder=decoder,
3406
3244
  cursor_used_for_stop_condition=stop_condition_cursor or None,
@@ -3444,9 +3282,9 @@ class ModelToComponentFactory:
3444
3282
  primary_key=primary_key,
3445
3283
  requester=requester,
3446
3284
  record_selector=record_selector,
3447
- stream_slicer=stream_slicer,
3285
+ stream_slicer=_NO_STREAM_SLICING,
3448
3286
  request_option_provider=request_options_provider,
3449
- cursor=cursor,
3287
+ cursor=None,
3450
3288
  config=config,
3451
3289
  ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests,
3452
3290
  parameters=model.parameters or {},
@@ -3458,9 +3296,9 @@ class ModelToComponentFactory:
3458
3296
  primary_key=primary_key,
3459
3297
  requester=requester,
3460
3298
  record_selector=record_selector,
3461
- stream_slicer=stream_slicer,
3299
+ stream_slicer=_NO_STREAM_SLICING,
3462
3300
  request_option_provider=request_options_provider,
3463
- cursor=cursor,
3301
+ cursor=None,
3464
3302
  config=config,
3465
3303
  ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests,
3466
3304
  additional_query_properties=query_properties,
@@ -3531,14 +3369,21 @@ class ModelToComponentFactory:
3531
3369
  f"state_delegating_stream, full_refresh_stream name and incremental_stream must have equal names. Instead has {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}."
3532
3370
  )
3533
3371
 
3534
- stream_model = (
3372
+ stream_model = self._get_state_delegating_stream_model(
3373
+ False if has_parent_state is None else has_parent_state, model
3374
+ )
3375
+
3376
+ return self._create_component_from_model(stream_model, config=config, **kwargs) # type: ignore[no-any-return] # DeclarativeStream will be created as stream_model is always DeclarativeStreamModel
3377
+
3378
+ def _get_state_delegating_stream_model(
3379
+ self, has_parent_state: bool, model: StateDelegatingStreamModel
3380
+ ) -> DeclarativeStreamModel:
3381
+ return (
3535
3382
  model.incremental_stream
3536
3383
  if self._connector_state_manager.get_stream_state(model.name, None) or has_parent_state
3537
3384
  else model.full_refresh_stream
3538
3385
  )
3539
3386
 
3540
- return self._create_component_from_model(stream_model, config=config, **kwargs) # type: ignore[no-any-return] # Will be created DeclarativeStream as stream_model is stream description
3541
-
3542
3387
  def _create_async_job_status_mapping(
3543
3388
  self, model: AsyncJobStatusMapModel, config: Config, **kwargs: Any
3544
3389
  ) -> Mapping[str, AsyncJobStatus]:
@@ -3583,12 +3428,14 @@ class ModelToComponentFactory:
3583
3428
  transformations: List[RecordTransformation],
3584
3429
  **kwargs: Any,
3585
3430
  ) -> AsyncRetriever:
3586
- def _get_download_retriever() -> SimpleRetriever:
3431
+ def _get_download_retriever(
3432
+ requester: Requester, extractor: RecordExtractor, _decoder: Decoder
3433
+ ) -> SimpleRetriever:
3587
3434
  # We create a record selector for the download retriever
3588
3435
  # with no schema normalization and no transformations, neither record filter
3589
3436
  # as all this occurs in the record_selector of the AsyncRetriever
3590
3437
  record_selector = RecordSelector(
3591
- extractor=download_extractor,
3438
+ extractor=extractor,
3592
3439
  name=name,
3593
3440
  record_filter=None,
3594
3441
  transformations=[],
@@ -3599,7 +3446,7 @@ class ModelToComponentFactory:
3599
3446
  paginator = (
3600
3447
  self._create_component_from_model(
3601
3448
  model=model.download_paginator,
3602
- decoder=decoder,
3449
+ decoder=_decoder,
3603
3450
  config=config,
3604
3451
  url_base="",
3605
3452
  )
@@ -3608,7 +3455,7 @@ class ModelToComponentFactory:
3608
3455
  )
3609
3456
 
3610
3457
  return SimpleRetriever(
3611
- requester=download_requester,
3458
+ requester=requester,
3612
3459
  record_selector=record_selector,
3613
3460
  primary_key=None,
3614
3461
  name=name,
@@ -3702,7 +3549,9 @@ class ModelToComponentFactory:
3702
3549
  config=config,
3703
3550
  name=job_download_components_name,
3704
3551
  )
3705
- download_retriever = _get_download_retriever()
3552
+ download_retriever = _get_download_retriever(
3553
+ download_requester, download_extractor, download_decoder
3554
+ )
3706
3555
  abort_requester = (
3707
3556
  self._create_component_from_model(
3708
3557
  model=model.abort_requester,
@@ -3848,29 +3697,104 @@ class ModelToComponentFactory:
3848
3697
  def _create_message_repository_substream_wrapper(
3849
3698
  self, model: ParentStreamConfigModel, config: Config, **kwargs: Any
3850
3699
  ) -> Any:
3700
+ # getting the child stream's state; the parent state is derived from it below
3701
+ child_state = self._connector_state_manager.get_stream_state(
3702
+ kwargs["stream_name"], None
3703
+ )
3704
+
3705
+ # This flag will be used exclusively for StateDelegatingStream when a parent stream is created
3706
+ has_parent_state = bool(
3707
+ self._connector_state_manager.get_stream_state(kwargs.get("stream_name", ""), None)
3708
+ if model.incremental_dependency
3709
+ else False
3710
+ )
3711
+ connector_state_manager = self._instantiate_parent_stream_state_manager(
3712
+ child_state, config, model, has_parent_state
3713
+ )
3714
+
3851
3715
  substream_factory = ModelToComponentFactory(
3716
+ connector_state_manager=connector_state_manager,
3852
3717
  limit_pages_fetched_per_slice=self._limit_pages_fetched_per_slice,
3853
3718
  limit_slices_fetched=self._limit_slices_fetched,
3854
3719
  emit_connector_builder_messages=self._emit_connector_builder_messages,
3855
3720
  disable_retries=self._disable_retries,
3856
3721
  disable_cache=self._disable_cache,
3857
- message_repository=LogAppenderMessageRepositoryDecorator(
3858
- {"airbyte_cdk": {"stream": {"is_substream": True}}, "http": {"is_auxiliary": True}},
3859
- self._message_repository,
3860
- self._evaluate_log_level(self._emit_connector_builder_messages),
3722
+ message_repository=StateFilteringMessageRepository(
3723
+ LogAppenderMessageRepositoryDecorator(
3724
+ {
3725
+ "airbyte_cdk": {"stream": {"is_substream": True}},
3726
+ "http": {"is_auxiliary": True},
3727
+ },
3728
+ self._message_repository,
3729
+ self._evaluate_log_level(self._emit_connector_builder_messages),
3730
+ ),
3861
3731
  ),
3862
3732
  )
3863
3733
 
3864
- # This flag will be used exclusively for StateDelegatingStream when a parent stream is created
3865
- has_parent_state = bool(
3866
- self._connector_state_manager.get_stream_state(kwargs.get("stream_name", ""), None)
3867
- if model.incremental_dependency
3868
- else False
3869
- )
3870
3734
  return substream_factory._create_component_from_model(
3871
3735
  model=model, config=config, has_parent_state=has_parent_state, **kwargs
3872
3736
  )
3873
3737
 
3738
+ def _instantiate_parent_stream_state_manager(
3739
+ self,
3740
+ child_state: MutableMapping[str, Any],
3741
+ config: Config,
3742
+ model: ParentStreamConfigModel,
3743
+ has_parent_state: bool,
3744
+ ) -> ConnectorStateManager:
3745
+ """
3746
+ With DefaultStream, the state needs to be provided during __init__ of the cursor as opposed to the
3747
+ `set_initial_state` flow that existed for the declarative cursors. This state is taken from
3748
+ self._connector_state_manager.get_stream_state (`self` being a newly created ModelToComponentFactory to account
3749
+ for the MessageRepository being different). So we need to pass a ConnectorStateManager to the
3750
+ ModelToComponentFactory that has the parent states. This method populates this if there is a child state and if
3751
+ incremental_dependency is set.
3752
+ """
3753
+ if model.incremental_dependency and child_state:
3754
+ parent_stream_name = model.stream.name or ""
3755
+ parent_state = ConcurrentPerPartitionCursor.get_parent_state(
3756
+ child_state, parent_stream_name
3757
+ )
3758
+
3759
+ if not parent_state:
3760
+ # there are two migration cases: state value from child stream or from global state
3761
+ parent_state = ConcurrentPerPartitionCursor.get_global_state(
3762
+ child_state, parent_stream_name
3763
+ )
3764
+
3765
+ if not parent_state and not isinstance(parent_state, dict):
3766
+ cursor_values = child_state.values()
3767
+ if cursor_values:
3768
+ incremental_sync_model: Union[
3769
+ DatetimeBasedCursorModel,
3770
+ IncrementingCountCursorModel,
3771
+ CustomIncrementalSyncModel,
3772
+ ] = (
3773
+ model.stream.incremental_sync # type: ignore # if we are here, it is because there is incremental_dependency and therefore there is an incremental_sync on the parent stream
3774
+ if isinstance(model.stream, DeclarativeStreamModel)
3775
+ else self._get_state_delegating_stream_model(
3776
+ has_parent_state, model.stream
3777
+ ).incremental_sync
3778
+ )
3779
+ cursor_field = InterpolatedString.create(
3780
+ incremental_sync_model.cursor_field,
3781
+ parameters=incremental_sync_model.parameters or {},
3782
+ ).eval(config)
3783
+ parent_state = AirbyteStateMessage(
3784
+ type=AirbyteStateType.STREAM,
3785
+ stream=AirbyteStreamState(
3786
+ stream_descriptor=StreamDescriptor(
3787
+ name=parent_stream_name, namespace=None
3788
+ ),
3789
+ stream_state=AirbyteStateBlob(
3790
+ {cursor_field: list(cursor_values)[0]}
3791
+ ),
3792
+ ),
3793
+ )
3794
+ return ConnectorStateManager([parent_state] if parent_state else [])
3795
+
3796
+ return ConnectorStateManager([])
3797
+
3874
3798
  @staticmethod
3875
3799
  def create_wait_time_from_header(
3876
3800
  model: WaitTimeFromHeaderModel, config: Config, **kwargs: Any
@@ -3951,6 +3875,7 @@ class ModelToComponentFactory:
3951
3875
 
3952
3876
  return HttpComponentsResolver(
3953
3877
  retriever=retriever,
3878
+ stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config),
3954
3879
  config=config,
3955
3880
  components_mapping=components_mapping,
3956
3881
  parameters=model.parameters or {},
@@ -4176,7 +4101,9 @@ class ModelToComponentFactory:
4176
4101
  self, model: GroupingPartitionRouterModel, config: Config, **kwargs: Any
4177
4102
  ) -> GroupingPartitionRouter:
4178
4103
  underlying_router = self._create_component_from_model(
4179
- model=model.underlying_partition_router, config=config
4104
+ model=model.underlying_partition_router,
4105
+ config=config,
4106
+ **kwargs,
4180
4107
  )
4181
4108
  if model.group_size < 1:
4182
4109
  raise ValueError(f"Group size must be greater than 0, got {model.group_size}")