airbyte-cdk 6.61.6__py3-none-any.whl → 6.62.0.dev1__py3-none-any.whl

This diff compares the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (36)
  1. airbyte_cdk/connector_builder/connector_builder_handler.py +7 -7
  2. airbyte_cdk/connector_builder/main.py +2 -2
  3. airbyte_cdk/connector_builder/test_reader/reader.py +2 -2
  4. airbyte_cdk/legacy/sources/declarative/incremental/per_partition_cursor.py +4 -2
  5. airbyte_cdk/manifest_server/Dockerfile +2 -2
  6. airbyte_cdk/manifest_server/README.md +0 -22
  7. airbyte_cdk/manifest_server/app.py +0 -6
  8. airbyte_cdk/manifest_server/cli/_common.py +0 -1
  9. airbyte_cdk/manifest_server/command_processor/processor.py +5 -2
  10. airbyte_cdk/manifest_server/command_processor/utils.py +1 -1
  11. airbyte_cdk/manifest_server/routers/manifest.py +1 -1
  12. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +6 -7
  13. airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +57 -7
  14. airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +4 -2
  15. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +208 -278
  16. airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +0 -6
  17. airbyte_cdk/sources/declarative/partition_routers/grouping_partition_router.py +0 -5
  18. airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +0 -6
  19. airbyte_cdk/sources/declarative/partition_routers/partition_router.py +1 -23
  20. airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +0 -6
  21. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +88 -107
  22. airbyte_cdk/sources/declarative/requesters/request_options/per_partition_request_option_provider.py +95 -0
  23. airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +4 -1
  24. airbyte_cdk/sources/declarative/retrievers/retriever.py +5 -0
  25. airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +7 -21
  26. airbyte_cdk/sources/declarative/yaml_declarative_source.py +1 -1
  27. airbyte_cdk/sources/message/repository.py +20 -0
  28. airbyte_cdk/sources/utils/schema_helpers.py +9 -29
  29. airbyte_cdk/sources/utils/transform.py +13 -25
  30. airbyte_cdk/utils/spec_schema_transformations.py +5 -7
  31. {airbyte_cdk-6.61.6.dist-info → airbyte_cdk-6.62.0.dev1.dist-info}/METADATA +2 -4
  32. {airbyte_cdk-6.61.6.dist-info → airbyte_cdk-6.62.0.dev1.dist-info}/RECORD +36 -35
  33. {airbyte_cdk-6.61.6.dist-info → airbyte_cdk-6.62.0.dev1.dist-info}/LICENSE.txt +0 -0
  34. {airbyte_cdk-6.61.6.dist-info → airbyte_cdk-6.62.0.dev1.dist-info}/LICENSE_SHORT +0 -0
  35. {airbyte_cdk-6.61.6.dist-info → airbyte_cdk-6.62.0.dev1.dist-info}/WHEEL +0 -0
  36. {airbyte_cdk-6.61.6.dist-info → airbyte_cdk-6.62.0.dev1.dist-info}/entry_points.txt +0 -0
@@ -33,7 +33,15 @@ from requests import Response
 from airbyte_cdk.connector_builder.models import (
     LogMessage as ConnectorBuilderLogMessage,
 )
-from airbyte_cdk.models import FailureType, Level
+from airbyte_cdk.models import (
+    AirbyteStateBlob,
+    AirbyteStateMessage,
+    AirbyteStateType,
+    AirbyteStreamState,
+    FailureType,
+    Level,
+    StreamDescriptor,
+)
 from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
 from airbyte_cdk.sources.declarative.async_job.job_orchestrator import AsyncJobOrchestrator
 from airbyte_cdk.sources.declarative.async_job.job_tracker import JobTracker
@@ -90,6 +98,7 @@ from airbyte_cdk.sources.declarative.extractors import (
     RecordSelector,
     ResponseToFileExtractor,
 )
+from airbyte_cdk.sources.declarative.extractors.record_extractor import RecordExtractor
 from airbyte_cdk.sources.declarative.extractors.record_filter import (
     ClientSideIncrementalRecordFilterDecorator,
 )
@@ -98,7 +107,6 @@ from airbyte_cdk.sources.declarative.incremental import (
     ConcurrentPerPartitionCursor,
     CursorFactory,
     DatetimeBasedCursor,
-    DeclarativeCursor,
     GlobalSubstreamCursor,
     PerPartitionWithGlobalCursor,
 )
@@ -500,8 +508,11 @@ from airbyte_cdk.sources.declarative.requesters.request_options import (
     InterpolatedRequestOptionsProvider,
     RequestOptionsProvider,
 )
+from airbyte_cdk.sources.declarative.requesters.request_options.per_partition_request_option_provider import (
+    PerPartitionRequestOptionsProvider,
+)
 from airbyte_cdk.sources.declarative.requesters.request_path import RequestPath
-from airbyte_cdk.sources.declarative.requesters.requester import HttpMethod
+from airbyte_cdk.sources.declarative.requesters.requester import HttpMethod, Requester
 from airbyte_cdk.sources.declarative.resolvers import (
     ComponentMappingDefinition,
     ConfigComponentsResolver,
@@ -583,6 +594,7 @@ from airbyte_cdk.sources.message import (
     MessageRepository,
     NoopMessageRepository,
 )
+from airbyte_cdk.sources.message.repository import StateFilteringMessageRepository
 from airbyte_cdk.sources.streams.call_rate import (
     APIBudget,
     FixedWindowCallRatePolicy,
@@ -630,6 +642,7 @@ SCHEMA_TRANSFORMER_TYPE_MAPPING = {
     SchemaNormalizationModel.None_: TransformConfig.NoTransform,
     SchemaNormalizationModel.Default: TransformConfig.DefaultSchemaNormalization,
 }
+_NO_STREAM_SLICING = SinglePartitionRouter(parameters={})

 # Ideally this should use the value defined in ConcurrentDeclarativeSource, but
 # this would be a circular import
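For context on the new module-level constant: `SinglePartitionRouter` emits exactly one empty slice, so passing `_NO_STREAM_SLICING` to a retriever effectively disables slicing at that layer (slicing now happens in the stream's partition generator instead; see the `SimpleRetriever` hunks further down). A minimal sketch of the behavior relied on here:

```python
from airbyte_cdk.sources.declarative.partition_routers import SinglePartitionRouter

# A SinglePartitionRouter yields a single empty StreamSlice, so a retriever
# built with it runs exactly one request cycle per stream partition.
router = SinglePartitionRouter(parameters={})
assert len(list(router.stream_slices())) == 1
```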
@@ -702,7 +715,7 @@ class ModelToComponentFactory:
             CustomValidationStrategyModel: self.create_custom_component,
             CustomConfigTransformationModel: self.create_custom_component,
             DatetimeBasedCursorModel: self.create_datetime_based_cursor,
-            DeclarativeStreamModel: self.create_declarative_stream,
+            DeclarativeStreamModel: self.create_default_stream,
             DefaultErrorHandlerModel: self.create_default_error_handler,
             DefaultPaginatorModel: self.create_default_paginator,
             DpathExtractorModel: self.create_dpath_extractor,
@@ -739,7 +752,7 @@ class ModelToComponentFactory:
             OAuthAuthenticatorModel: self.create_oauth_authenticator,
             OffsetIncrementModel: self.create_offset_increment,
             PageIncrementModel: self.create_page_increment,
-            ParentStreamConfigModel: self.create_parent_stream_config,
+            ParentStreamConfigModel: self._create_message_repository_substream_wrapper,
             PredicateValidatorModel: self.create_predicate_validator,
             PropertiesFromEndpointModel: self.create_properties_from_endpoint,
             PropertyChunkingModel: self.create_property_chunking,
@@ -1291,19 +1304,20 @@ class ModelToComponentFactory:
                 f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}"
             )

+        model_parameters = datetime_based_cursor_model.parameters or {}
         interpolated_cursor_field = InterpolatedString.create(
             datetime_based_cursor_model.cursor_field,
-            parameters=datetime_based_cursor_model.parameters or {},
+            parameters=model_parameters,
         )
         cursor_field = CursorField(interpolated_cursor_field.eval(config=config))

         interpolated_partition_field_start = InterpolatedString.create(
             datetime_based_cursor_model.partition_field_start or "start_time",
-            parameters=datetime_based_cursor_model.parameters or {},
+            parameters=model_parameters,
         )
         interpolated_partition_field_end = InterpolatedString.create(
             datetime_based_cursor_model.partition_field_end or "end_time",
-            parameters=datetime_based_cursor_model.parameters or {},
+            parameters=model_parameters,
         )

         slice_boundary_fields = (
@@ -1323,7 +1337,7 @@ class ModelToComponentFactory:
         interpolated_lookback_window = (
             InterpolatedString.create(
                 datetime_based_cursor_model.lookback_window,
-                parameters=datetime_based_cursor_model.parameters or {},
+                parameters=model_parameters,
             )
             if datetime_based_cursor_model.lookback_window
             else None
@@ -1409,7 +1423,7 @@ class ModelToComponentFactory:
         interpolated_step = (
             InterpolatedString.create(
                 datetime_based_cursor_model.step,
-                parameters=datetime_based_cursor_model.parameters or {},
+                parameters=model_parameters,
             )
             if datetime_based_cursor_model.step
             else None
@@ -1426,7 +1440,7 @@ class ModelToComponentFactory:
         # object which we want to keep agnostic of being low-code
         target = InterpolatedString(
             string=datetime_based_cursor_model.clamping.target,
-            parameters=datetime_based_cursor_model.parameters or {},
+            parameters=model_parameters,
         )
         evaluated_target = target.eval(config=config)
         match evaluated_target:
@@ -1603,6 +1617,10 @@ class ModelToComponentFactory:

         interpolated_cursor_field = InterpolatedString.create(
             datetime_based_cursor_model.cursor_field,
+            # FIXME the interfaces of the concurrent cursor are kind of annoying as they take a `ComponentDefinition` instead of the actual model. This was done because the ConcurrentDeclarativeSource didn't have access to the models [here for example](https://github.com/airbytehq/airbyte-python-cdk/blob/f525803b3fec9329e4cc8478996a92bf884bfde9/airbyte_cdk/sources/declarative/concurrent_declarative_source.py#L354C54-L354C91). So now we have two cases:
+            # * The ComponentDefinition comes from model.__dict__ in which case we have `parameters`
+            # * The ComponentDefinition comes from the manifest as a dict in which case we have `$parameters`
+            # We should change those interfaces to use the model once we clean up the code in CDS at which point the parameter propagation should happen as part of the ModelToComponentFactory.
             parameters=datetime_based_cursor_model.parameters or {},
         )
         cursor_field = CursorField(interpolated_cursor_field.eval(config=config))
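To make the FIXME above concrete, the same cursor definition can reach these factory methods in two dict shapes; the field values below are illustrative only:

```python
# Shape 1: ComponentDefinition built from model.__dict__ (pydantic model) -> "parameters"
from_model = {
    "type": "DatetimeBasedCursor",
    "cursor_field": "updated_at",
    "parameters": {"partition_field": "account_id"},
}

# Shape 2: ComponentDefinition taken from the raw manifest dict -> "$parameters"
from_manifest = {
    "type": "DatetimeBasedCursor",
    "cursor_field": "updated_at",
    "$parameters": {"partition_field": "account_id"},
}
```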
@@ -1634,7 +1652,7 @@ class ModelToComponentFactory:
                 stream_namespace=stream_namespace,
                 config=config,
                 message_repository=NoopMessageRepository(),
-                stream_state_migrations=stream_state_migrations,
+                # stream_state_migrations=stream_state_migrations,  # FIXME is it expected to run migration on per partition state too?
             )
         )

@@ -1730,7 +1748,7 @@ class ModelToComponentFactory:

             if self._is_component(model_value):
                 model_args[model_field] = self._create_nested_component(
-                    model, model_field, model_value, config
+                    model, model_field, model_value, config, **kwargs,
                 )
             elif isinstance(model_value, list):
                 vals = []
@@ -1742,7 +1760,7 @@ class ModelToComponentFactory:
                     if derived_type:
                         v["type"] = derived_type
                     if self._is_component(v):
-                        vals.append(self._create_nested_component(model, model_field, v, config))
+                        vals.append(self._create_nested_component(model, model_field, v, config, **kwargs))
                     else:
                         vals.append(v)
                 model_args[model_field] = vals
@@ -1832,7 +1850,7 @@ class ModelToComponentFactory:
             return []

     def _create_nested_component(
-        self, model: Any, model_field: str, model_value: Any, config: Config
+        self, model: Any, model_field: str, model_value: Any, config: Config, **kwargs: Any
     ) -> Any:
         type_name = model_value.get("type", None)
         if not type_name:
@@ -1857,8 +1875,11 @@ class ModelToComponentFactory:
                     for kwarg in constructor_kwargs
                     if kwarg in model_parameters
                 }
+                matching_kwargs = {
+                    kwarg: kwargs[kwarg] for kwarg in constructor_kwargs if kwarg in kwargs
+                }
                 return self._create_component_from_model(
-                    model=parsed_model, config=config, **matching_parameters
+                    model=parsed_model, config=config, **(matching_parameters | matching_kwargs)
                 )
             except TypeError as error:
                 missing_parameters = self._extract_missing_parameters(error)
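The `|` operator here is the standard Python 3.9+ dict union: when a key exists on both sides, the right operand wins, so a kwarg passed down the factory call chain overrides a value sourced from `$parameters`. For example:

```python
matching_parameters = {"stream_name": "from_parameters", "name": "users"}
matching_kwargs = {"stream_name": "from_kwargs"}

merged = matching_parameters | matching_kwargs
assert merged == {"stream_name": "from_kwargs", "name": "users"}  # right side wins on conflicts
```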
@@ -1942,13 +1963,17 @@ class ModelToComponentFactory:
             parameters=model.parameters or {},
         )

-    def create_declarative_stream(
+    def create_default_stream(
         self, model: DeclarativeStreamModel, config: Config, is_parent: bool = False, **kwargs: Any
     ) -> Union[DeclarativeStream, AbstractStream]:
         primary_key = model.primary_key.__root__ if model.primary_key else None

+        partition_router = self._build_stream_slicer_from_partition_router(
+            model.retriever, config, stream_name=model.name
+        )
+        concurrent_cursor = self._build_concurrent_cursor(model, partition_router, config)
         if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel):
-            cursor_model = model.incremental_sync
+            cursor_model: DatetimeBasedCursorModel = model.incremental_sync

             end_time_option = (
                 self._create_component_from_model(
@@ -1965,17 +1990,29 @@ class ModelToComponentFactory:
                 else None
             )

-            request_options_provider = DatetimeBasedRequestOptionsProvider(
+            datetime_request_options_provider = DatetimeBasedRequestOptionsProvider(
                 start_time_option=start_time_option,
                 end_time_option=end_time_option,
-                partition_field_start=cursor_model.partition_field_end,
+                partition_field_start=cursor_model.partition_field_start,
                 partition_field_end=cursor_model.partition_field_end,
                 config=config,
                 parameters=model.parameters or {},
             )
+            request_options_provider = (
+                datetime_request_options_provider
+                if not isinstance(concurrent_cursor, ConcurrentPerPartitionCursor)
+                else PerPartitionRequestOptionsProvider(
+                    partition_router, datetime_request_options_provider
+                )
+            )
         elif model.incremental_sync and isinstance(
             model.incremental_sync, IncrementingCountCursorModel
         ):
+            if isinstance(concurrent_cursor, ConcurrentPerPartitionCursor):
+                raise ValueError(
+                    "PerPartition does not support per partition states because switching to global state is time based"
+                )
+
             cursor_model: IncrementingCountCursorModel = model.incremental_sync  # type: ignore

             start_time_option = (
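The body of the new `PerPartitionRequestOptionsProvider` (file 22 in the list above, +95 lines) is not shown in this diff. Judging from the constructor call here, it plausibly merges the partition router's request options with the datetime provider's; the sketch below is an assumption about that behavior, not the released implementation (only `get_request_params` is sketched; the real class presumably covers headers, body data, and body JSON as well):

```python
from dataclasses import dataclass
from typing import Any, Mapping, Optional

from airbyte_cdk.sources.declarative.partition_routers import PartitionRouter
from airbyte_cdk.sources.declarative.requesters.request_options import RequestOptionsProvider
from airbyte_cdk.sources.types import StreamSlice, StreamState


@dataclass
class PerPartitionRequestOptionsProviderSketch:
    """Hypothetical: partition-scoped options first, datetime window options layered on top."""

    partition_router: PartitionRouter
    datetime_options_provider: RequestOptionsProvider

    def get_request_params(
        self,
        *,
        stream_state: Optional[StreamState] = None,
        stream_slice: Optional[StreamSlice] = None,
        next_page_token: Optional[Mapping[str, Any]] = None,
    ) -> Mapping[str, Any]:
        # Options injected by the partition router (e.g. a parent record id)...
        params = dict(
            self.partition_router.get_request_params(
                stream_state=stream_state,
                stream_slice=stream_slice,
                next_page_token=next_page_token,
            )
        )
        # ...combined with the start/end time options for the cursor part of the slice.
        params.update(
            self.datetime_options_provider.get_request_params(
                stream_state=stream_state,
                stream_slice=stream_slice,
                next_page_token=next_page_token,
            )
        )
        return params
```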
@@ -2013,22 +2050,18 @@ class ModelToComponentFactory:
                 model=model.file_uploader, config=config
             )

-        # When constructing a declarative stream, we assemble the incremental_sync component and retriever's partition_router field
-        # components if they exist into a single CartesianProductStreamSlicer. This is then passed back as an argument when constructing the
-        # Retriever. This is done in the declarative stream not the retriever to support custom retrievers. The custom create methods in
-        # the factory only support passing arguments to the component constructors, whereas this performs a merge of all slicers into one.
-        combined_slicers = self._merge_stream_slicers(model=model, config=config)
-        partition_router = self._build_stream_slicer_from_partition_router(
-            model.retriever, config, stream_name=model.name
+        stream_slicer: ConcurrentStreamSlicer = (
+            partition_router
+            if isinstance(concurrent_cursor, FinalStateCursor)
+            else concurrent_cursor
         )
-        concurrent_cursor = self._build_concurrent_cursor(model, partition_router, config)
         retriever = self._create_component_from_model(
             model=model.retriever,
             config=config,
             name=model.name,
             primary_key=primary_key,
-            stream_slicer=combined_slicers,
             request_options_provider=request_options_provider,
+            stream_slicer=stream_slicer,
             stop_condition_cursor=concurrent_cursor
             if self._is_stop_condition_on_cursor(model)
             else None,
@@ -2039,6 +2072,8 @@ class ModelToComponentFactory:
             file_uploader=file_uploader,
             incremental_sync=model.incremental_sync,
         )
+        if isinstance(retriever, AsyncRetriever):
+            stream_slicer = retriever.stream_slicer

         schema_loader: Union[
             CompositeSchemaLoader,
@@ -2066,89 +2101,27 @@ class ModelToComponentFactory:
                 options["name"] = model.name
             schema_loader = DefaultSchemaLoader(config=config, parameters=options)

-        if (
-            (
-                isinstance(combined_slicers, PartitionRouter)
-                or isinstance(concurrent_cursor, ConcurrentCursor)
-            )
-            and not self._emit_connector_builder_messages
-            and not is_parent
-        ):
-            # We are starting to migrate streams to instantiate directly the DefaultStream instead of instantiating the
-            # DeclarativeStream and assembling the DefaultStream from that. The plan is the following:
-            # * Streams without partition router nor cursors and streams with only partition router. This is the `isinstance(combined_slicers, PartitionRouter)` condition as the first kind with have a SinglePartitionRouter
-            # * Streams without partition router but with cursor. This is the `isinstance(concurrent_cursor, ConcurrentCursor)` condition
-            # * Streams with both partition router and cursor
-            # We specifically exclude parent streams here because SubstreamPartitionRouter has not been updated yet
-            # We specifically exclude Connector Builder stuff for now as Brian is working on this anyway
-
-            stream_name = model.name or ""
-            stream_slicer: ConcurrentStreamSlicer = (
-                concurrent_cursor if concurrent_cursor else SinglePartitionRouter(parameters={})
-            )
-            cursor: Cursor = FinalStateCursor(stream_name, None, self._message_repository)
-            if isinstance(retriever, AsyncRetriever):
-                # The AsyncRetriever only ever worked with a cursor from the concurrent package. Hence, the method
-                # `_build_incremental_cursor` which we would usually think would return only declarative stuff has a
-                # special clause and return a concurrent cursor. This stream slicer is passed to AsyncRetriever when
-                # built because the async retriever has a specific partition router which relies on this stream slicer.
-                # We can't re-use `concurrent_cursor` because it is a different instance than the one passed in
-                # AsyncJobPartitionRouter.
-                stream_slicer = retriever.stream_slicer
-            if isinstance(combined_slicers, Cursor):
-                cursor = combined_slicers
-            elif isinstance(combined_slicers, PartitionRouter):
-                stream_slicer = combined_slicers
-            elif concurrent_cursor:
-                cursor = concurrent_cursor
-
-            # FIXME to be removed once we migrate everything to DefaultStream
-            if isinstance(retriever, SimpleRetriever):
-                # We zero it out here, but since this is a cursor reference, the state is still properly
-                # instantiated for the other components that reference it
-                retriever.cursor = None
-
-            partition_generator = StreamSlicerPartitionGenerator(
+        stream_name = model.name or ""
+        return DefaultStream(
+            partition_generator=StreamSlicerPartitionGenerator(
                 DeclarativePartitionFactory(
                     stream_name,
                     schema_loader,
                     retriever,
                     self._message_repository,
                 ),
-                stream_slicer=stream_slicer,
-            )
-            return DefaultStream(
-                partition_generator=partition_generator,
-                name=stream_name,
-                json_schema=schema_loader.get_json_schema,
-                primary_key=get_primary_key_from_stream(primary_key),
-                cursor_field=cursor.cursor_field.cursor_field_key
-                if hasattr(cursor, "cursor_field")
-                else "",  # FIXME we should have the cursor field has part of the interface of cursor,
-                logger=logging.getLogger(f"airbyte.{stream_name}"),
-                # FIXME this is a breaking change compared to the old implementation which used the source name instead
-                cursor=cursor,
-                supports_file_transfer=hasattr(model, "file_uploader")
-                and bool(model.file_uploader),
-            )
-
-        cursor_field = model.incremental_sync.cursor_field if model.incremental_sync else None
-        if model.state_migrations:
-            state_transformations = [
-                self._create_component_from_model(state_migration, config, declarative_stream=model)
-                for state_migration in model.state_migrations
-            ]
-        else:
-            state_transformations = []
-        return DeclarativeStream(
-            name=model.name or "",
-            primary_key=primary_key,
-            retriever=retriever,
-            schema_loader=schema_loader,
-            stream_cursor_field=cursor_field or "",
-            state_migrations=state_transformations,
-            config=config,
-            parameters=model.parameters or {},
+                stream_slicer,
+                slice_limit=self._limit_slices_fetched,
+            ),
+            name=stream_name,
+            json_schema=schema_loader.get_json_schema,
+            primary_key=get_primary_key_from_stream(primary_key),
+            cursor_field=concurrent_cursor.cursor_field.cursor_field_key
+            if hasattr(concurrent_cursor, "cursor_field")
+            else "",  # FIXME we should have the cursor field has part of the interface of cursor,
+            logger=logging.getLogger(f"airbyte.{stream_name}"),
+            cursor=concurrent_cursor,
+            supports_file_transfer=hasattr(model, "file_uploader") and bool(model.file_uploader),
         )

     def _is_stop_condition_on_cursor(self, model: DeclarativeStreamModel) -> bool:
@@ -2197,86 +2170,15 @@ class ModelToComponentFactory:
             )
         return SinglePartitionRouter(parameters={})

-    def _build_incremental_cursor(
-        self,
-        model: DeclarativeStreamModel,
-        stream_slicer: Optional[PartitionRouter],
-        config: Config,
-    ) -> Optional[StreamSlicer]:
-        state_transformations = (
-            [
-                self._create_component_from_model(state_migration, config, declarative_stream=model)
-                for state_migration in model.state_migrations
-            ]
-            if model.state_migrations
-            else []
-        )
-
-        if model.incremental_sync and (
-            stream_slicer and not isinstance(stream_slicer, SinglePartitionRouter)
-        ):
-            if model.retriever.type == "AsyncRetriever":
-                stream_name = model.name or ""
-                stream_namespace = None
-                stream_state = self._connector_state_manager.get_stream_state(
-                    stream_name, stream_namespace
-                )
-
-                return self.create_concurrent_cursor_from_perpartition_cursor(  # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing
-                    state_manager=self._connector_state_manager,
-                    model_type=DatetimeBasedCursorModel,
-                    component_definition=model.incremental_sync.__dict__,
-                    stream_name=stream_name,
-                    stream_namespace=stream_namespace,
-                    config=config or {},
-                    stream_state=stream_state,
-                    stream_state_migrations=state_transformations,
-                    partition_router=stream_slicer,
-                )
-
-            incremental_sync_model = model.incremental_sync
-            cursor_component = self._create_component_from_model(
-                model=incremental_sync_model, config=config
-            )
-            is_global_cursor = (
-                hasattr(incremental_sync_model, "global_substream_cursor")
-                and incremental_sync_model.global_substream_cursor
-            )
-
-            if is_global_cursor:
-                return GlobalSubstreamCursor(
-                    stream_cursor=cursor_component, partition_router=stream_slicer
-                )
-            return PerPartitionWithGlobalCursor(
-                cursor_factory=CursorFactory(
-                    lambda: self._create_component_from_model(
-                        model=incremental_sync_model, config=config
-                    ),
-                ),
-                partition_router=stream_slicer,
-                stream_cursor=cursor_component,
-            )
-        elif model.incremental_sync:
-            if model.retriever.type == "AsyncRetriever":
-                return self.create_concurrent_cursor_from_datetime_based_cursor(  # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing
-                    model_type=DatetimeBasedCursorModel,
-                    component_definition=model.incremental_sync.__dict__,
-                    stream_name=model.name or "",
-                    stream_namespace=None,
-                    config=config or {},
-                    stream_state_migrations=state_transformations,
-                )
-            return self._create_component_from_model(model=model.incremental_sync, config=config)  # type: ignore[no-any-return] # Will be created Cursor as stream_slicer_model is model.incremental_sync
-        return None
-
     def _build_concurrent_cursor(
         self,
         model: DeclarativeStreamModel,
         stream_slicer: Optional[PartitionRouter],
         config: Config,
-    ) -> Optional[StreamSlicer]:
+    ) -> Cursor:
+        stream_name = model.name or ""
         stream_state = self._connector_state_manager.get_stream_state(
-            stream_name=model.name or "", namespace=None
+            stream_name=stream_name, namespace=None
         )

         if model.state_migrations:
@@ -2296,20 +2198,20 @@ class ModelToComponentFactory:
                 state_manager=self._connector_state_manager,
                 model_type=DatetimeBasedCursorModel,
                 component_definition=model.incremental_sync.__dict__,
-                stream_name=model.name or "",
+                stream_name=stream_name,
                 stream_namespace=None,
                 config=config or {},
                 stream_state=stream_state,
                 stream_state_migrations=state_transformations,
                 partition_router=stream_slicer,
-                attempt_to_create_cursor_if_not_provided=True,
+                attempt_to_create_cursor_if_not_provided=True,  # FIXME can we remove that now?
             )
         elif model.incremental_sync:
             if type(model.incremental_sync) == IncrementingCountCursorModel:
                 return self.create_concurrent_cursor_from_incrementing_count_cursor(  # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing
                     model_type=IncrementingCountCursorModel,
                     component_definition=model.incremental_sync.__dict__,
-                    stream_name=model.name or "",
+                    stream_name=stream_name,
                     stream_namespace=None,
                     config=config or {},
                     stream_state_migrations=state_transformations,
@@ -2318,7 +2220,7 @@ class ModelToComponentFactory:
             return self.create_concurrent_cursor_from_datetime_based_cursor(  # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing
                 model_type=type(model.incremental_sync),
                 component_definition=model.incremental_sync.__dict__,
-                stream_name=model.name or "",
+                stream_name=stream_name,
                 stream_namespace=None,
                 config=config or {},
                 stream_state_migrations=state_transformations,
@@ -2328,45 +2230,7 @@ class ModelToComponentFactory:
             raise ValueError(
                 f"Incremental sync of type {type(model.incremental_sync)} is not supported"
             )
-        return None
-
-    def _merge_stream_slicers(
-        self, model: DeclarativeStreamModel, config: Config
-    ) -> Optional[StreamSlicer]:
-        retriever_model = model.retriever
-
-        stream_slicer = self._build_stream_slicer_from_partition_router(
-            retriever_model, config, stream_name=model.name
-        )
-
-        if retriever_model.type == "AsyncRetriever":
-            is_not_datetime_cursor = (
-                model.incremental_sync.type != "DatetimeBasedCursor"
-                if model.incremental_sync
-                else None
-            )
-            is_partition_router = (
-                bool(retriever_model.partition_router) if model.incremental_sync else None
-            )
-
-            if is_not_datetime_cursor:
-                # We are currently in a transition to the Concurrent CDK and AsyncRetriever can only work with the
-                # support or unordered slices (for example, when we trigger reports for January and February, the report
-                # in February can be completed first). Once we have support for custom concurrent cursor or have a new
-                # implementation available in the CDK, we can enable more cursors here.
-                raise ValueError(
-                    "AsyncRetriever with cursor other than DatetimeBasedCursor is not supported yet."
-                )
-
-            if is_partition_router and not stream_slicer:
-                # Note that this development is also done in parallel to the per partition development which once merged
-                # we could support here by calling create_concurrent_cursor_from_perpartition_cursor
-                raise ValueError("Per partition state is not supported yet for AsyncRetriever.")
-
-        if model.incremental_sync:
-            return self._build_incremental_cursor(model, stream_slicer, config)
-
-        return stream_slicer
+        return FinalStateCursor(stream_name, None, self._message_repository)

     def create_default_error_handler(
         self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any
@@ -3010,7 +2874,7 @@ class ModelToComponentFactory:
         )

     def create_parent_stream_config(
-        self, model: ParentStreamConfigModel, config: Config, **kwargs: Any
+        self, model: ParentStreamConfigModel, config: Config, stream_name: str, **kwargs: Any
     ) -> ParentStreamConfig:
         declarative_stream = self._create_component_from_model(
             model.stream,
@@ -3263,7 +3127,6 @@ class ModelToComponentFactory:
         *,
         name: str,
         primary_key: Optional[Union[str, List[str], List[List[str]]]],
-        stream_slicer: Optional[StreamSlicer],
         request_options_provider: Optional[RequestOptionsProvider] = None,
         stop_condition_cursor: Optional[Cursor] = None,
         client_side_incremental_sync: Optional[Dict[str, Any]] = None,
@@ -3278,7 +3141,7 @@ class ModelToComponentFactory:
         log_formatter: Optional[Callable[[Response], Any]] = None,
         **kwargs: Any,
     ) -> SimpleRetriever:
-        def _get_url() -> str:
+        def _get_url(req: Requester) -> str:
             """
             Closure to get the URL from the requester. This is used to get the URL in the case of a lazy retriever.
             This is needed because the URL is not set until the requester is created.
@@ -3287,12 +3150,12 @@ class ModelToComponentFactory:
             _url: str = (
                 model.requester.url
                 if hasattr(model.requester, "url") and model.requester.url is not None
-                else requester.get_url()
+                else req.get_url(stream_state=None, stream_slice=None, next_page_token=None)
             )
             _url_base: str = (
                 model.requester.url_base
                 if hasattr(model.requester, "url_base") and model.requester.url_base is not None
-                else requester.get_url_base()
+                else req.get_url(stream_state=None, stream_slice=None, next_page_token=None)
             )

             return _url or _url_base
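Passing the requester into `_get_url` as a parameter, rather than capturing it from the enclosing scope, makes the helper's dependency explicit and independent of when the local `requester` name is bound. The difference, with an illustrative stand-in class:

```python
class FakeRequester:
    """Illustrative stand-in, not the CDK's Requester interface."""

    def get_url(self, stream_state=None, stream_slice=None, next_page_token=None) -> str:
        return "https://api.example.com/v1"


requester = FakeRequester()

# Closure style (before): silently depends on the surrounding scope's `requester`.
def _get_url_closure() -> str:
    return requester.get_url(stream_state=None, stream_slice=None, next_page_token=None)

# Parameter style (after): the caller states which requester is used.
def _get_url(req: FakeRequester) -> str:
    return req.get_url(stream_state=None, stream_slice=None, next_page_token=None)

assert _get_url_closure() == _get_url(requester)
```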
@@ -3371,36 +3234,14 @@ class ModelToComponentFactory:
             config=config,
         )

-        # Define cursor only if per partition or common incremental support is needed
-        cursor = stream_slicer if isinstance(stream_slicer, DeclarativeCursor) else None
-
-        if (
-            not isinstance(stream_slicer, DatetimeBasedCursor)
-            or type(stream_slicer) is not DatetimeBasedCursor
-        ):
-            # Many of the custom component implementations of DatetimeBasedCursor override get_request_params() (or other methods).
-            # Because we're decoupling RequestOptionsProvider from the Cursor, custom components will eventually need to reimplement
-            # their own RequestOptionsProvider. However, right now the existing StreamSlicer/Cursor still can act as the SimpleRetriever's
-            # request_options_provider
-            request_options_provider = stream_slicer or DefaultRequestOptionsProvider(parameters={})
-        elif not request_options_provider:
+        if not request_options_provider:
             request_options_provider = DefaultRequestOptionsProvider(parameters={})

-        stream_slicer = stream_slicer or SinglePartitionRouter(parameters={})
-        if self._should_limit_slices_fetched():
-            stream_slicer = cast(
-                StreamSlicer,
-                StreamSlicerTestReadDecorator(
-                    wrapped_slicer=stream_slicer,
-                    maximum_number_of_slices=self._limit_slices_fetched or 5,
-                ),
-            )
-
         paginator = (
             self._create_component_from_model(
                 model=model.paginator,
                 config=config,
-                url_base=_get_url(),
+                url_base=_get_url(requester),
                 extractor_model=model.record_selector.extractor,
                 decoder=decoder,
                 cursor_used_for_stop_condition=stop_condition_cursor or None,
@@ -3444,9 +3285,9 @@ class ModelToComponentFactory:
                 primary_key=primary_key,
                 requester=requester,
                 record_selector=record_selector,
-                stream_slicer=stream_slicer,
+                stream_slicer=_NO_STREAM_SLICING,
                 request_option_provider=request_options_provider,
-                cursor=cursor,
+                cursor=None,
                 config=config,
                 ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests,
                 parameters=model.parameters or {},
@@ -3458,9 +3299,9 @@ class ModelToComponentFactory:
             primary_key=primary_key,
             requester=requester,
             record_selector=record_selector,
-            stream_slicer=stream_slicer,
+            stream_slicer=_NO_STREAM_SLICING,
             request_option_provider=request_options_provider,
-            cursor=cursor,
+            cursor=None,
             config=config,
             ignore_stream_slicer_parameters_on_paginated_requests=ignore_stream_slicer_parameters_on_paginated_requests,
             additional_query_properties=query_properties,
@@ -3531,14 +3372,21 @@ class ModelToComponentFactory:
                 f"state_delegating_stream, full_refresh_stream name and incremental_stream must have equal names. Instead has {model.name}, {model.full_refresh_stream.name} and {model.incremental_stream.name}."
             )

-        stream_model = (
+        stream_model = self._get_state_delegating_stream_model(
+            False if has_parent_state is None else has_parent_state, model
+        )
+
+        return self._create_component_from_model(stream_model, config=config, **kwargs)  # type: ignore[no-any-return] # DeclarativeStream will be created as stream_model is alwyas DeclarativeStreamModel
+
+    def _get_state_delegating_stream_model(
+        self, has_parent_state: bool, model: StateDelegatingStreamModel
+    ) -> DeclarativeStreamModel:
+        return (
             model.incremental_stream
             if self._connector_state_manager.get_stream_state(model.name, None) or has_parent_state
             else model.full_refresh_stream
         )

-        return self._create_component_from_model(stream_model, config=config, **kwargs)  # type: ignore[no-any-return] # Will be created DeclarativeStream as stream_model is stream description
-
     def _create_async_job_status_mapping(
         self, model: AsyncJobStatusMapModel, config: Config, **kwargs: Any
     ) -> Mapping[str, AsyncJobStatus]:
@@ -3583,12 +3431,14 @@ class ModelToComponentFactory:
         transformations: List[RecordTransformation],
         **kwargs: Any,
     ) -> AsyncRetriever:
-        def _get_download_retriever() -> SimpleRetriever:
+        def _get_download_retriever(
+            requester: Requester, extractor: RecordExtractor, _decoder: Decoder
+        ) -> SimpleRetriever:
             # We create a record selector for the download retriever
             # with no schema normalization and no transformations, neither record filter
             # as all this occurs in the record_selector of the AsyncRetriever
             record_selector = RecordSelector(
-                extractor=download_extractor,
+                extractor=extractor,
                 name=name,
                 record_filter=None,
                 transformations=[],
@@ -3599,7 +3449,7 @@ class ModelToComponentFactory:
             paginator = (
                 self._create_component_from_model(
                     model=model.download_paginator,
-                    decoder=decoder,
+                    decoder=_decoder,
                     config=config,
                     url_base="",
                 )
@@ -3608,7 +3458,7 @@ class ModelToComponentFactory:
             )

             return SimpleRetriever(
-                requester=download_requester,
+                requester=requester,
                 record_selector=record_selector,
                 primary_key=None,
                 name=name,
@@ -3702,7 +3552,9 @@ class ModelToComponentFactory:
             config=config,
             name=job_download_components_name,
         )
-        download_retriever = _get_download_retriever()
+        download_retriever = _get_download_retriever(
+            download_requester, download_extractor, download_decoder
+        )
         abort_requester = (
             self._create_component_from_model(
                 model=model.abort_requester,
@@ -3846,31 +3698,106 @@ class ModelToComponentFactory:
         )

     def _create_message_repository_substream_wrapper(
-        self, model: ParentStreamConfigModel, config: Config, **kwargs: Any
+        self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any
     ) -> Any:
+        # getting the parent state
+        child_state = self._connector_state_manager.get_stream_state(
+            stream_name, None
+        )
+
+        # This flag will be used exclusively for StateDelegatingStream when a parent stream is created
+        has_parent_state = bool(
+            self._connector_state_manager.get_stream_state(kwargs.get("stream_name", ""), None)
+            if model.incremental_dependency
+            else False
+        )
+        connector_state_manager = self._instantiate_parent_stream_state_manager(
+            child_state, config, model, has_parent_state
+        )
+
         substream_factory = ModelToComponentFactory(
+            connector_state_manager=connector_state_manager,
             limit_pages_fetched_per_slice=self._limit_pages_fetched_per_slice,
             limit_slices_fetched=self._limit_slices_fetched,
             emit_connector_builder_messages=self._emit_connector_builder_messages,
             disable_retries=self._disable_retries,
             disable_cache=self._disable_cache,
-            message_repository=LogAppenderMessageRepositoryDecorator(
-                {"airbyte_cdk": {"stream": {"is_substream": True}}, "http": {"is_auxiliary": True}},
-                self._message_repository,
-                self._evaluate_log_level(self._emit_connector_builder_messages),
+            message_repository=StateFilteringMessageRepository(
+                LogAppenderMessageRepositoryDecorator(
+                    {
+                        "airbyte_cdk": {"stream": {"is_substream": True}},
+                        "http": {"is_auxiliary": True},
+                    },
+                    self._message_repository,
+                    self._evaluate_log_level(self._emit_connector_builder_messages),
+                ),
             ),
         )

-        # This flag will be used exclusively for StateDelegatingStream when a parent stream is created
-        has_parent_state = bool(
-            self._connector_state_manager.get_stream_state(kwargs.get("stream_name", ""), None)
-            if model.incremental_dependency
-            else False
-        )
-        return substream_factory._create_component_from_model(
-            model=model, config=config, has_parent_state=has_parent_state, **kwargs
+        return substream_factory.create_parent_stream_config(
+            model=model, config=config, stream_name=stream_name, **kwargs
         )

+    def _instantiate_parent_stream_state_manager(
+        self,
+        child_state: MutableMapping[str, Any],
+        config: Config,
+        model: ParentStreamConfigModel,
+        has_parent_state: bool,
+    ) -> ConnectorStateManager:
+        """
+        With DefaultStream, the state needs to be provided during __init__ of the cursor as opposed to the
+        `set_initial_state` flow that existed for the declarative cursors. This state is taken from
+        self._connector_state_manager.get_stream_state (`self` being a newly created ModelToComponentFactory to account
+        for the MessageRepository being different). So we need to pass a ConnectorStateManager to the
+        ModelToComponentFactory that has the parent states. This method populates this if there is a child state and if
+        incremental_dependency is set.
+        """
+        if model.incremental_dependency and child_state:
+            parent_stream_name = model.stream.name or ""
+            parent_state = ConcurrentPerPartitionCursor.get_parent_state(
+                child_state, parent_stream_name
+            )
+
+            if not parent_state:
+                # there are two migration cases: state value from child stream or from global state
+                parent_state = ConcurrentPerPartitionCursor.get_global_state(
+                    child_state, parent_stream_name
+                )
+
+            if not parent_state and not isinstance(parent_state, dict):
+                cursor_values = child_state.values()
+                if cursor_values:
+                    incremental_sync_model: Union[
+                        DatetimeBasedCursorModel,
+                        IncrementingCountCursorModel,
+                        CustomIncrementalSyncModel,
+                    ] = (
+                        model.stream.incremental_sync  # type: ignore # if we are there, it is because there is incremental_dependency and therefore there is an incremental_sync on the parent stream
+                        if isinstance(model.stream, DeclarativeStreamModel)
+                        else self._get_state_delegating_stream_model(
+                            has_parent_state, model.stream
+                        ).incremental_sync
+                    )
+                    cursor_field = InterpolatedString.create(
+                        incremental_sync_model.cursor_field,
+                        parameters=incremental_sync_model.parameters or {},
+                    ).eval(config)
+                    parent_state = AirbyteStateMessage(
+                        type=AirbyteStateType.STREAM,
+                        stream=AirbyteStreamState(
+                            stream_descriptor=StreamDescriptor(
+                                name=parent_stream_name, namespace=None
+                            ),
+                            stream_state=AirbyteStateBlob(
+                                {cursor_field: list(cursor_values)[0]}
+                            ),
+                        ),
+                    )
+            return ConnectorStateManager([parent_state] if parent_state else [])
+
+        return ConnectorStateManager([])
+
     @staticmethod
     def create_wait_time_from_header(
         model: WaitTimeFromHeaderModel, config: Config, **kwargs: Any
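`StateFilteringMessageRepository` is also new in this release (file 27 above adds 20 lines to `repository.py`) and its body is not shown here. Given the name and this call site, where parent-stream state is now routed through the child's `ConnectorStateManager` instead, a plausible reading is a decorator that drops STATE messages from the wrapped repository; the class below is a sketch of that assumption, not the released code:

```python
from typing import Callable, Iterable

from airbyte_cdk.models import AirbyteMessage, Level, Type
from airbyte_cdk.sources.message import LogMessage, MessageRepository


class StateFilteringMessageRepositorySketch(MessageRepository):
    """Hypothetical: forward everything except STATE messages to the decorated repository."""

    def __init__(self, decorated: MessageRepository) -> None:
        self._decorated = decorated

    def emit_message(self, message: AirbyteMessage) -> None:
        if message.type != Type.STATE:  # parent streams must not emit child-level state
            self._decorated.emit_message(message)

    def log_message(self, level: Level, message_provider: Callable[[], LogMessage]) -> None:
        self._decorated.log_message(level, message_provider)

    def consume_queue(self) -> Iterable[AirbyteMessage]:
        return self._decorated.consume_queue()
```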
@@ -3951,6 +3878,7 @@ class ModelToComponentFactory:

         return HttpComponentsResolver(
             retriever=retriever,
+            stream_slicer=self._build_stream_slicer_from_partition_router(model.retriever, config),
             config=config,
             components_mapping=components_mapping,
             parameters=model.parameters or {},
@@ -4176,7 +4104,9 @@ class ModelToComponentFactory:
         self, model: GroupingPartitionRouterModel, config: Config, **kwargs: Any
     ) -> GroupingPartitionRouter:
         underlying_router = self._create_component_from_model(
-            model=model.underlying_partition_router, config=config
+            model=model.underlying_partition_router,
+            config=config,
+            **kwargs,
         )
         if model.group_size < 1:
             raise ValueError(f"Group size must be greater than 0, got {model.group_size}")