airbyte-cdk 6.26.0.dev4105__py3-none-any.whl → 6.27.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +57 -32
  2. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +14 -0
  3. airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +39 -13
  4. airbyte_cdk/sources/declarative/manifest_declarative_source.py +0 -3
  5. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +7 -1
  6. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +52 -6
  7. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +46 -16
  8. airbyte_cdk/sources/declarative/retrievers/async_retriever.py +1 -1
  9. airbyte_cdk/sources/declarative/schema/__init__.py +2 -0
  10. airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +43 -5
  11. airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +0 -10
  12. airbyte_cdk/sources/file_based/file_based_source.py +1 -44
  13. airbyte_cdk/sources/file_based/file_based_stream_reader.py +0 -33
  14. airbyte_cdk/sources/file_based/schema_helpers.py +0 -25
  15. airbyte_cdk/sources/file_based/stream/__init__.py +1 -2
  16. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +0 -29
  17. airbyte_cdk/sources/types.py +4 -2
  18. airbyte_cdk/utils/slice_hasher.py +8 -1
  19. {airbyte_cdk-6.26.0.dev4105.dist-info → airbyte_cdk-6.27.0.dist-info}/METADATA +1 -1
  20. {airbyte_cdk-6.26.0.dev4105.dist-info → airbyte_cdk-6.27.0.dist-info}/RECORD +23 -25
  21. airbyte_cdk/sources/file_based/config/permissions.py +0 -34
  22. airbyte_cdk/sources/file_based/stream/identities_stream.py +0 -96
  23. {airbyte_cdk-6.26.0.dev4105.dist-info → airbyte_cdk-6.27.0.dist-info}/LICENSE.txt +0 -0
  24. {airbyte_cdk-6.26.0.dev4105.dist-info → airbyte_cdk-6.27.0.dist-info}/WHEEL +0 -0
  25. {airbyte_cdk-6.26.0.dev4105.dist-info → airbyte_cdk-6.27.0.dist-info}/entry_points.txt +0 -0
--- a/airbyte_cdk/sources/declarative/concurrent_declarative_source.py
+++ b/airbyte_cdk/sources/declarative/concurrent_declarative_source.py
@@ -34,8 +34,9 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import
 from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import (
     ModelToComponentFactory,
 )
+from airbyte_cdk.sources.declarative.partition_routers import AsyncJobPartitionRouter
 from airbyte_cdk.sources.declarative.requesters import HttpRequester
-from airbyte_cdk.sources.declarative.retrievers import Retriever, SimpleRetriever
+from airbyte_cdk.sources.declarative.retrievers import AsyncRetriever, Retriever, SimpleRetriever
 from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import (
     DeclarativePartitionFactory,
     StreamSlicerPartitionGenerator,
@@ -48,7 +49,7 @@ from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream
 from airbyte_cdk.sources.streams.concurrent.availability_strategy import (
     AlwaysAvailableAvailabilityStrategy,
 )
-from airbyte_cdk.sources.streams.concurrent.cursor import FinalStateCursor
+from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, FinalStateCursor
 from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream
 from airbyte_cdk.sources.streams.concurrent.helpers import get_primary_key_from_stream
 
@@ -69,6 +70,10 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
         component_factory: Optional[ModelToComponentFactory] = None,
         **kwargs: Any,
     ) -> None:
+        # todo: We could remove state from initialization. Now that streams are grouped during the read(), a source
+        # no longer needs to store the original incoming state. But maybe there's an edge case?
+        self._connector_state_manager = ConnectorStateManager(state=state)  # type: ignore  # state is always in the form of List[AirbyteStateMessage]. The ConnectorStateManager should use generics, but this can be done later
+
         # To reduce the complexity of the concurrent framework, we are not enabling RFR with synthetic
         # cursors. We do this by no longer automatically instantiating RFR cursors when converting
         # the declarative models into runtime components. Concurrent sources will continue to checkpoint
@@ -76,6 +81,7 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
         component_factory = component_factory or ModelToComponentFactory(
             emit_connector_builder_messages=emit_connector_builder_messages,
             disable_resumable_full_refresh=True,
+            connector_state_manager=self._connector_state_manager,
         )
 
         super().__init__(
@@ -86,10 +92,6 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
             component_factory=component_factory,
         )
 
-        # todo: We could remove state from initialization. Now that streams are grouped during the read(), a source
-        # no longer needs to store the original incoming state. But maybe there's an edge case?
-        self._state = state
-
         concurrency_level_from_manifest = self._source_config.get("concurrency_level")
         if concurrency_level_from_manifest:
             concurrency_level_component = self._constructor.create_component(
@@ -179,8 +181,6 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
         concurrent_streams: List[AbstractStream] = []
         synchronous_streams: List[Stream] = []
 
-        state_manager = ConnectorStateManager(state=self._state)  # type: ignore  # state is always in the form of List[AirbyteStateMessage]. The ConnectorStateManager should use generics, but this can be done later
-
         # Combine streams and dynamic_streams. Note: both cannot be empty at the same time,
         # and this is validated during the initialization of the source.
         streams = self._stream_configs(self._source_config) + self._dynamic_stream_configs(
@@ -220,31 +220,52 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
             if self._is_datetime_incremental_without_partition_routing(
                 declarative_stream, incremental_sync_component_definition
             ):
-                stream_state = state_manager.get_stream_state(
+                stream_state = self._connector_state_manager.get_stream_state(
                     stream_name=declarative_stream.name, namespace=declarative_stream.namespace
                 )
 
-                cursor = self._constructor.create_concurrent_cursor_from_datetime_based_cursor(
-                    state_manager=state_manager,
-                    model_type=DatetimeBasedCursorModel,
-                    component_definition=incremental_sync_component_definition,  # type: ignore  # Not None because of the if condition above
-                    stream_name=declarative_stream.name,
-                    stream_namespace=declarative_stream.namespace,
-                    config=config or {},
-                    stream_state=stream_state,
-                )
-
                 retriever = self._get_retriever(declarative_stream, stream_state)
 
-                partition_generator = StreamSlicerPartitionGenerator(
-                    DeclarativePartitionFactory(
-                        declarative_stream.name,
-                        declarative_stream.get_json_schema(),
-                        retriever,
-                        self.message_repository,
-                    ),
-                    cursor,
-                )
+                if isinstance(declarative_stream.retriever, AsyncRetriever) and isinstance(
+                    declarative_stream.retriever.stream_slicer, AsyncJobPartitionRouter
+                ):
+                    cursor = declarative_stream.retriever.stream_slicer.stream_slicer
+
+                    if not isinstance(cursor, ConcurrentCursor):
+                        # This should never happen since we instantiate ConcurrentCursor in
+                        # model_to_component_factory.py
+                        raise ValueError(
+                            f"Expected AsyncJobPartitionRouter stream_slicer to be of type ConcurrentCursor, but received {cursor.__class__}"
+                        )
+
+                    partition_generator = StreamSlicerPartitionGenerator(
+                        partition_factory=DeclarativePartitionFactory(
+                            declarative_stream.name,
+                            declarative_stream.get_json_schema(),
+                            retriever,
+                            self.message_repository,
+                        ),
+                        stream_slicer=declarative_stream.retriever.stream_slicer,
+                    )
+                else:
+                    cursor = (
+                        self._constructor.create_concurrent_cursor_from_datetime_based_cursor(
+                            model_type=DatetimeBasedCursorModel,
+                            component_definition=incremental_sync_component_definition,  # type: ignore  # Not None because of the if condition above
+                            stream_name=declarative_stream.name,
+                            stream_namespace=declarative_stream.namespace,
+                            config=config or {},
+                        )
+                    )
+                    partition_generator = StreamSlicerPartitionGenerator(
+                        partition_factory=DeclarativePartitionFactory(
+                            declarative_stream.name,
+                            declarative_stream.get_json_schema(),
+                            retriever,
+                            self.message_repository,
+                        ),
+                        stream_slicer=cursor,
+                    )
 
                 concurrent_streams.append(
                     DefaultStream(
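For orientation, here is a minimal, self-contained sketch of the dispatch this hunk introduces; the stub classes stand in for the real airbyte_cdk components and omit their actual constructor arguments:

    # Minimal sketch, not the real implementation.
    class ConcurrentCursor: ...

    class AsyncJobPartitionRouter:
        def __init__(self, stream_slicer: "ConcurrentCursor") -> None:
            self.stream_slicer = stream_slicer  # the factory nests the cursor here

    class AsyncRetriever:
        def __init__(self, stream_slicer: "AsyncJobPartitionRouter") -> None:
            self.stream_slicer = stream_slicer

    def pick_stream_slicer(retriever):
        # Async job streams reuse the ConcurrentCursor already built into their
        # partition router and slice with the router itself; all other datetime
        # incremental streams build a cursor and slice with the cursor directly.
        if isinstance(retriever, AsyncRetriever) and isinstance(
            retriever.stream_slicer, AsyncJobPartitionRouter
        ):
            cursor = retriever.stream_slicer.stream_slicer
            if not isinstance(cursor, ConcurrentCursor):
                raise ValueError(f"Expected ConcurrentCursor, got {cursor.__class__}")
            return retriever.stream_slicer
        return ConcurrentCursor()  # placeholder for the factory-built cursor

    retriever = AsyncRetriever(AsyncJobPartitionRouter(ConcurrentCursor()))
    assert isinstance(pick_stream_slicer(retriever), AsyncJobPartitionRouter)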
@@ -306,14 +327,14 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                     declarative_stream.retriever.stream_slicer, PerPartitionWithGlobalCursor
                 )
             ):
-                stream_state = state_manager.get_stream_state(
+                stream_state = self._connector_state_manager.get_stream_state(
                     stream_name=declarative_stream.name, namespace=declarative_stream.namespace
                 )
                 partition_router = declarative_stream.retriever.stream_slicer._partition_router
 
                 perpartition_cursor = (
                     self._constructor.create_concurrent_cursor_from_perpartition_cursor(
-                        state_manager=state_manager,
+                        state_manager=self._connector_state_manager,
                         model_type=DatetimeBasedCursorModel,
                         component_definition=incremental_sync_component_definition,
                         stream_name=declarative_stream.name,
@@ -369,7 +390,10 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                 declarative_stream=declarative_stream
             )
             and hasattr(declarative_stream.retriever, "stream_slicer")
-            and isinstance(declarative_stream.retriever.stream_slicer, DatetimeBasedCursor)
+            and (
+                isinstance(declarative_stream.retriever.stream_slicer, DatetimeBasedCursor)
+                or isinstance(declarative_stream.retriever.stream_slicer, AsyncJobPartitionRouter)
+            )
         )
 
     def _stream_supports_concurrent_partition_processing(
@@ -438,8 +462,9 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                 return False
         return True
 
+    @staticmethod
     def _get_retriever(
-        self, declarative_stream: DeclarativeStream, stream_state: Mapping[str, Any]
+        declarative_stream: DeclarativeStream, stream_state: Mapping[str, Any]
     ) -> Retriever:
         retriever = declarative_stream.retriever
 
--- a/airbyte_cdk/sources/declarative/declarative_component_schema.yaml
+++ b/airbyte_cdk/sources/declarative/declarative_component_schema.yaml
@@ -1800,6 +1800,19 @@ definitions:
       $parameters:
         type: object
         additionalProperties: true
+  ComplexFieldType:
+    title: Schema Field Type
+    description: (This component is experimental. Use at your own risk.) Represents a complex field type.
+    type: object
+    required:
+      - field_type
+    properties:
+      field_type:
+        type: string
+      items:
+        anyOf:
+          - type: string
+          - "$ref": "#/definitions/ComplexFieldType"
   TypesMap:
     title: Types Map
     description: (This component is experimental. Use at your own risk.) Represents a mapping between a current type and its corresponding target type.
@@ -1814,6 +1827,7 @@ definitions:
           - type: array
             items:
               type: string
+          - "$ref": "#/definitions/ComplexFieldType"
       current_type:
         anyOf:
           - type: string
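The recursive `"$ref"` means `items` can nest: an array-of-arrays-of-strings is expressed by nesting the component inside itself. A sketch using the runtime dataclasses this diff adds further down (the `current_type` value is a made-up source-side type name):

    from airbyte_cdk.sources.declarative.schema import ComplexFieldType, TypesMap

    # target_type can now be a recursive ComplexFieldType instead of a plain
    # string or list of strings: here, array -> array -> string.
    types_map = TypesMap(
        target_type=ComplexFieldType(
            field_type="array",
            items=ComplexFieldType(field_type="array", items="string"),
        ),
        current_type="formula",  # hypothetical type name reported by the API
        condition="True",
    )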
--- a/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py
+++ b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py
@@ -147,7 +147,7 @@ class ConcurrentPerPartitionCursor(Cursor):
                     < cursor.state[self.cursor_field.cursor_field_key]
                 ):
                     self._new_global_cursor = copy.deepcopy(cursor.state)
-                self._emit_state_message()
+            self._emit_state_message()
 
     def ensure_at_least_one_state_emitted(self) -> None:
         """
@@ -192,7 +192,8 @@ class ConcurrentPerPartitionCursor(Cursor):
                 self._global_cursor,
                 self._lookback_window if self._global_cursor else 0,
             )
-            self._cursor_per_partition[self._to_partition_key(partition.partition)] = cursor
+            with self._lock:
+                self._cursor_per_partition[self._to_partition_key(partition.partition)] = cursor
             self._semaphore_per_partition[self._to_partition_key(partition.partition)] = (
                 threading.Semaphore(0)
            )
@@ -210,16 +211,38 @@ class ConcurrentPerPartitionCursor(Cursor):
 
     def _ensure_partition_limit(self) -> None:
         """
-        Ensure the maximum number of partitions is not exceeded. If so, the oldest added partition will be dropped.
+        Ensure the maximum number of partitions does not exceed the predefined limit.
+
+        Steps:
+        1. Attempt to remove partitions that are marked as finished in `_finished_partitions`.
+           These partitions are considered processed and safe to delete.
+        2. If the limit is still exceeded and no finished partitions are available for removal,
+           remove the oldest partition unconditionally. We expect failed partitions to be removed.
+
+        Logging:
+        - Logs a warning each time a partition is removed, indicating whether it was finished
+          or removed due to being the oldest.
         """
-        while len(self._cursor_per_partition) > self.DEFAULT_MAX_PARTITIONS_NUMBER - 1:
-            self._over_limit += 1
-            oldest_partition = self._cursor_per_partition.popitem(last=False)[
-                0
-            ]  # Remove the oldest partition
-            logger.warning(
-                f"The maximum number of partitions has been reached. Dropping the oldest partition: {oldest_partition}. Over limit: {self._over_limit}."
-            )
+        with self._lock:
+            while len(self._cursor_per_partition) > self.DEFAULT_MAX_PARTITIONS_NUMBER - 1:
+                # Try removing finished partitions first
+                for partition_key in list(self._cursor_per_partition.keys()):
+                    if partition_key in self._finished_partitions:
+                        oldest_partition = self._cursor_per_partition.pop(
+                            partition_key
+                        )  # Remove the oldest partition
+                        logger.warning(
+                            f"The maximum number of partitions has been reached. Dropping the oldest partition: {oldest_partition}. Over limit: {self._over_limit}."
+                        )
+                        break
+                else:
+                    # If no finished partitions can be removed, fall back to removing the oldest partition
+                    oldest_partition = self._cursor_per_partition.popitem(last=False)[
+                        1
+                    ]  # Remove the oldest partition
+                    logger.warning(
+                        f"The maximum number of partitions has been reached. Dropping the oldest partition: {oldest_partition}. Over limit: {self._over_limit}."
+                    )
 
     def _set_initial_state(self, stream_state: StreamState) -> None:
         """
@@ -264,7 +287,10 @@ class ConcurrentPerPartitionCursor(Cursor):
         if not stream_state:
             return
 
-        if self._PERPARTITION_STATE_KEY not in stream_state:
+        if (
+            self._PERPARTITION_STATE_KEY not in stream_state
+            and self._GLOBAL_STATE_KEY not in stream_state
+        ):
             # We assume that `stream_state` is in a global format that can be applied to all partitions.
             # Example: {"global_state_format_key": "global_state_format_value"}
             self._global_cursor = deepcopy(stream_state)
@@ -273,7 +299,7 @@ class ConcurrentPerPartitionCursor(Cursor):
         else:
             self._lookback_window = int(stream_state.get("lookback_window", 0))
 
-            for state in stream_state[self._PERPARTITION_STATE_KEY]:
+            for state in stream_state.get(self._PERPARTITION_STATE_KEY, []):
                 self._cursor_per_partition[self._to_partition_key(state["partition"])] = (
                     self._create_cursor(state["cursor"])
                 )
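Together, these two hunks distinguish three incoming state shapes. A sketch of each (assuming the class constants `_PERPARTITION_STATE_KEY` and `_GLOBAL_STATE_KEY` map to "states" and "state", which the surrounding code suggests; cursor fields are illustrative):

    # 1. Legacy global format: neither key present, so the whole mapping
    #    becomes the global cursor.
    legacy_global = {"updated_at": "2023-05-27T00:00:00Z"}

    # 2. Per-partition format: now iterated with .get(..., []), so a missing
    #    "states" list no longer raises a KeyError.
    per_partition = {
        "lookback_window": 0,
        "states": [
            {
                "partition": {"parent_id": "1"},
                "cursor": {"updated_at": "2023-05-27T00:00:00Z"},
            },
        ],
    }

    # 3. Global-key-only format: previously misclassified as shape 1 because
    #    only the per-partition key was checked; the added condition keeps it
    #    out of that branch.
    global_only = {"state": {"updated_at": "2023-05-27T00:00:00Z"}, "lookback_window": 0}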
--- a/airbyte_cdk/sources/declarative/manifest_declarative_source.py
+++ b/airbyte_cdk/sources/declarative/manifest_declarative_source.py
@@ -26,9 +26,6 @@ from airbyte_cdk.models import (
 from airbyte_cdk.sources.declarative.checks import COMPONENTS_CHECKER_TYPE_MAPPING
 from airbyte_cdk.sources.declarative.checks.connection_checker import ConnectionChecker
 from airbyte_cdk.sources.declarative.declarative_source import DeclarativeSource
-from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
-    CheckStream as CheckStreamModel,
-)
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     DeclarativeStream as DeclarativeStreamModel,
 )
--- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py
+++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py
@@ -736,8 +736,13 @@ class HttpResponseFilter(BaseModel):
     parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
 
 
+class ComplexFieldType(BaseModel):
+    field_type: str
+    items: Optional[Union[str, ComplexFieldType]] = None
+
+
 class TypesMap(BaseModel):
-    target_type: Union[str, List[str]]
+    target_type: Union[str, List[str], ComplexFieldType]
     current_type: Union[str, List[str]]
     condition: Optional[str] = None
 
@@ -2260,6 +2265,7 @@ class DynamicDeclarativeStream(BaseModel):
     )
 
 
+ComplexFieldType.update_forward_refs()
 CompositeErrorHandler.update_forward_refs()
 DeclarativeSource1.update_forward_refs()
 DeclarativeSource2.update_forward_refs()
--- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py
+++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py
@@ -133,6 +133,9 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     CheckStream as CheckStreamModel,
 )
+from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
+    ComplexFieldType as ComplexFieldTypeModel,
+)
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     ComponentMappingDefinition as ComponentMappingDefinitionModel,
 )
@@ -429,6 +432,7 @@ from airbyte_cdk.sources.declarative.retrievers import (
     SimpleRetrieverTestReadDecorator,
 )
 from airbyte_cdk.sources.declarative.schema import (
+    ComplexFieldType,
     DefaultSchemaLoader,
     DynamicSchemaLoader,
     InlineSchemaLoader,
@@ -503,6 +507,7 @@ class ModelToComponentFactory:
         disable_cache: bool = False,
         disable_resumable_full_refresh: bool = False,
         message_repository: Optional[MessageRepository] = None,
+        connector_state_manager: Optional[ConnectorStateManager] = None,
     ):
         self._init_mappings()
         self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice
@@ -514,6 +519,7 @@ class ModelToComponentFactory:
         self._message_repository = message_repository or InMemoryMessageRepository(
             self._evaluate_log_level(emit_connector_builder_messages)
         )
+        self._connector_state_manager = connector_state_manager or ConnectorStateManager()
 
     def _init_mappings(self) -> None:
         self.PYDANTIC_MODEL_TO_CONSTRUCTOR: Mapping[Type[BaseModel], Callable[..., Any]] = {
@@ -572,6 +578,7 @@ class ModelToComponentFactory:
             DynamicSchemaLoaderModel: self.create_dynamic_schema_loader,
             SchemaTypeIdentifierModel: self.create_schema_type_identifier,
             TypesMapModel: self.create_types_map,
+            ComplexFieldTypeModel: self.create_complex_field_type,
             JwtAuthenticatorModel: self.create_jwt_authenticator,
             LegacyToPerPartitionStateMigrationModel: self.create_legacy_to_per_partition_state_migration,
             ListPartitionRouterModel: self.create_list_partition_router,
@@ -922,17 +929,24 @@ class ModelToComponentFactory:
 
     def create_concurrent_cursor_from_datetime_based_cursor(
         self,
-        state_manager: ConnectorStateManager,
         model_type: Type[BaseModel],
         component_definition: ComponentDefinition,
         stream_name: str,
         stream_namespace: Optional[str],
         config: Config,
-        stream_state: MutableMapping[str, Any],
         message_repository: Optional[MessageRepository] = None,
         runtime_lookback_window: Optional[datetime.timedelta] = None,
         **kwargs: Any,
     ) -> ConcurrentCursor:
+        # Per-partition incremental streams can dynamically create child cursors which will pass their current
+        # state via the stream_state keyword argument. Incremental syncs without parent streams use the
+        # incoming state and connector_state_manager that is initialized when the component factory is created
+        stream_state = (
+            self._connector_state_manager.get_stream_state(stream_name, stream_namespace)
+            if "stream_state" not in kwargs
+            else kwargs["stream_state"]
+        )
+
         component_type = component_definition.get("type")
         if component_definition.get("type") != model_type.__name__:
             raise ValueError(
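A hedged sketch of the two call paths this signature change creates (the cursor definition, stream names, and state values are illustrative placeholders, not a tested invocation):

    # state_manager: a ConnectorStateManager built from the incoming state
    factory = ModelToComponentFactory(connector_state_manager=state_manager)

    definition = {
        "type": "DatetimeBasedCursor",
        "cursor_field": "updated_at",
        "datetime_format": "%Y-%m-%dT%H:%M:%SZ",
        "start_datetime": "2021-01-01T00:00:00Z",
    }

    # Plain incremental stream: no stream_state kwarg, so state comes from the
    # factory's ConnectorStateManager.
    cursor = factory.create_concurrent_cursor_from_datetime_based_cursor(
        model_type=DatetimeBasedCursorModel,
        component_definition=definition,
        stream_name="orders",
        stream_namespace=None,
        config={},
    )

    # Per-partition child cursor: the caller passes the partition's state
    # explicitly, which takes precedence over the state manager.
    child_cursor = factory.create_concurrent_cursor_from_datetime_based_cursor(
        model_type=DatetimeBasedCursorModel,
        component_definition=definition,
        stream_name="orders",
        stream_namespace=None,
        config={},
        stream_state={"updated_at": "2024-01-01T00:00:00Z"},
    )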
@@ -1126,7 +1140,7 @@ class ModelToComponentFactory:
             stream_namespace=stream_namespace,
             stream_state=stream_state,
             message_repository=message_repository or self._message_repository,
-            connector_state_manager=state_manager,
+            connector_state_manager=self._connector_state_manager,
             connector_state_converter=connector_state_converter,
             cursor_field=cursor_field,
             slice_boundary_fields=slice_boundary_fields,
@@ -1676,6 +1690,22 @@ class ModelToComponentFactory:
                 stream_cursor=cursor_component,
             )
         elif model.incremental_sync:
+            if model.retriever.type == "AsyncRetriever":
+                if model.incremental_sync.type != "DatetimeBasedCursor":
+                    # We are currently in a transition to the Concurrent CDK and AsyncRetriever can only work with the support of unordered slices (for example, when we trigger reports for January and February, the report in February can be completed first). Once we have support for custom concurrent cursor or have a new implementation available in the CDK, we can enable more cursors here.
+                    raise ValueError(
+                        "AsyncRetriever with cursor other than DatetimeBasedCursor is not supported yet"
+                    )
+                if model.retriever.partition_router:
+                    # Note that this development is also done in parallel to the per partition development which once merged we could support here by calling `create_concurrent_cursor_from_perpartition_cursor`
+                    raise ValueError("Per partition state is not supported yet for AsyncRetriever")
+                return self.create_concurrent_cursor_from_datetime_based_cursor(  # type: ignore  # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing
+                    model_type=DatetimeBasedCursorModel,
+                    component_definition=model.incremental_sync.__dict__,
+                    stream_name=model.name or "",
+                    stream_namespace=None,
+                    config=config or {},
+                )
             return (
                 self._create_component_from_model(model=model.incremental_sync, config=config)
                 if model.incremental_sync
@@ -1894,10 +1924,26 @@ class ModelToComponentFactory:
     ) -> InlineSchemaLoader:
         return InlineSchemaLoader(schema=model.schema_ or {}, parameters={})
 
-    @staticmethod
-    def create_types_map(model: TypesMapModel, **kwargs: Any) -> TypesMap:
+    def create_complex_field_type(
+        self, model: ComplexFieldTypeModel, config: Config, **kwargs: Any
+    ) -> ComplexFieldType:
+        items = (
+            self._create_component_from_model(model=model.items, config=config)
+            if isinstance(model.items, ComplexFieldTypeModel)
+            else model.items
+        )
+
+        return ComplexFieldType(field_type=model.field_type, items=items)
+
+    def create_types_map(self, model: TypesMapModel, config: Config, **kwargs: Any) -> TypesMap:
+        target_type = (
+            self._create_component_from_model(model=model.target_type, config=config)
+            if isinstance(model.target_type, ComplexFieldTypeModel)
+            else model.target_type
+        )
+
         return TypesMap(
-            target_type=model.target_type,
+            target_type=target_type,
             current_type=model.current_type,
             condition=model.condition if model.condition is not None else "True",
         )
--- a/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py
+++ b/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py
@@ -295,28 +295,58 @@ class SubstreamPartitionRouter(PartitionRouter):
             return
 
         if not parent_state and incremental_dependency:
-            # Attempt to retrieve child state
-            substream_state_values = list(stream_state.values())
-            substream_state = substream_state_values[0] if substream_state_values else {}
-            # Filter out per partition state. Because we pass the state to the parent stream in the format {cursor_field: substream_state}
-            if isinstance(substream_state, (list, dict)):
-                substream_state = {}
-
-            parent_state = {}
-
-            # Copy child state to parent streams with incremental dependencies
-            if substream_state:
-                for parent_config in self.parent_stream_configs:
-                    if parent_config.incremental_dependency:
-                        parent_state[parent_config.stream.name] = {
-                            parent_config.stream.cursor_field: substream_state
-                        }
+            # Migrate child state to parent state format
+            parent_state = self._migrate_child_state_to_parent_state(stream_state)
 
         # Set state for each parent stream with an incremental dependency
         for parent_config in self.parent_stream_configs:
             if parent_config.incremental_dependency:
                 parent_config.stream.state = parent_state.get(parent_config.stream.name, {})
 
+    def _migrate_child_state_to_parent_state(self, stream_state: StreamState) -> StreamState:
+        """
+        Migrate the child stream state to the parent stream's state format.
+
+        This method converts the global or child state into a format compatible with parent
+        streams. The migration occurs only for parent streams with incremental dependencies.
+        The method filters out per-partition states and retains only the global state in the
+        format `{cursor_field: cursor_value}`.
+
+        Args:
+            stream_state (StreamState): The state to migrate. Expected formats include:
+                - {"updated_at": "2023-05-27T00:00:00Z"}
+                - {"states": [...]} (ignored during migration)
+
+        Returns:
+            StreamState: A migrated state for parent streams in the format:
+                {
+                    "parent_stream_name": {"parent_stream_cursor": "2023-05-27T00:00:00Z"}
+                }
+
+        Example:
+            Input: {"updated_at": "2023-05-27T00:00:00Z"}
+            Output: {
+                "parent_stream_name": {"parent_stream_cursor": "2023-05-27T00:00:00Z"}
+            }
+        """
+        substream_state_values = list(stream_state.values())
+        substream_state = substream_state_values[0] if substream_state_values else {}
+
+        # Ignore per-partition states or invalid formats
+        if isinstance(substream_state, (list, dict)) or len(substream_state_values) != 1:
+            return {}
+
+        # Copy child state to parent streams with incremental dependencies
+        parent_state = {}
+        if substream_state:
+            for parent_config in self.parent_stream_configs:
+                if parent_config.incremental_dependency:
+                    parent_state[parent_config.stream.name] = {
+                        parent_config.stream.cursor_field: substream_state
+                    }
+
+        return parent_state
+
     def get_stream_state(self) -> Optional[Mapping[str, StreamState]]:
         """
         Get the state of the parent streams.
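The filter rule is subtle: exactly one value must be present in the child state, and it must be scalar. A standalone mirror of that rule with its expected outputs:

    # Exactly one scalar cursor value may be promoted to the parents;
    # lists/dicts (per-partition state) are ignored.
    def migrate(stream_state, parent_cursor_fields):
        values = list(stream_state.values())
        value = values[0] if values else {}
        if isinstance(value, (list, dict)) or len(values) != 1:
            return {}
        return {name: {cursor: value} for name, cursor in parent_cursor_fields.items()}

    print(migrate({"updated_at": "2023-05-27T00:00:00Z"}, {"projects": "updated_at"}))
    # {'projects': {'updated_at': '2023-05-27T00:00:00Z'}}
    print(migrate({"states": [{"partition": {}, "cursor": {}}]}, {"projects": "updated_at"}))
    # {} -- per-partition state is not migrated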
--- a/airbyte_cdk/sources/declarative/retrievers/async_retriever.py
+++ b/airbyte_cdk/sources/declarative/retrievers/async_retriever.py
@@ -75,7 +75,7 @@ class AsyncRetriever(Retriever):
         """
         if not isinstance(stream_slice, StreamSlice) or "partition" not in stream_slice.partition:
             raise AirbyteTracedException(
-                message="Invalid arguments to AsyncJobRetriever.read_records: stream_slice is no optional. Please contact Airbyte Support",
+                message="Invalid arguments to AsyncRetriever.read_records: stream_slice is not optional. Please contact Airbyte Support",
                 failure_type=FailureType.system_error,
             )
         return stream_slice["partition"]  # type: ignore  # stream_slice["partition"] has been added as an AsyncPartition as part of stream_slices
--- a/airbyte_cdk/sources/declarative/schema/__init__.py
+++ b/airbyte_cdk/sources/declarative/schema/__init__.py
@@ -4,6 +4,7 @@
 
 from airbyte_cdk.sources.declarative.schema.default_schema_loader import DefaultSchemaLoader
 from airbyte_cdk.sources.declarative.schema.dynamic_schema_loader import (
+    ComplexFieldType,
     DynamicSchemaLoader,
     SchemaTypeIdentifier,
     TypesMap,
@@ -18,6 +19,7 @@ __all__ = [
     "SchemaLoader",
     "InlineSchemaLoader",
     "DynamicSchemaLoader",
+    "ComplexFieldType",
     "TypesMap",
     "SchemaTypeIdentifier",
 ]
--- a/airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py
+++ b/airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py
@@ -18,7 +18,7 @@ from airbyte_cdk.sources.declarative.transformations import RecordTransformation
 from airbyte_cdk.sources.source import ExperimentalClassWarning
 from airbyte_cdk.sources.types import Config, StreamSlice, StreamState
 
-AIRBYTE_DATA_TYPES: Mapping[str, Mapping[str, Any]] = {
+AIRBYTE_DATA_TYPES: Mapping[str, MutableMapping[str, Any]] = {
     "string": {"type": ["null", "string"]},
     "boolean": {"type": ["null", "boolean"]},
     "date": {"type": ["null", "string"], "format": "date"},
@@ -45,6 +45,25 @@ AIRBYTE_DATA_TYPES: Mapping[str, Mapping[str, Any]] = {
 }
 
 
+@deprecated("This class is experimental. Use at your own risk.", category=ExperimentalClassWarning)
+@dataclass(frozen=True)
+class ComplexFieldType:
+    """
+    Identifies complex field type
+    """
+
+    field_type: str
+    items: Optional[Union[str, "ComplexFieldType"]] = None
+
+    def __post_init__(self) -> None:
+        """
+        Enforces that `items` is only used when `field_type` is an array
+        """
+        # `items` is valid only for array target types
+        if self.items and self.field_type != "array":
+            raise ValueError("'items' can only be used when 'field_type' is an array.")
+
+
 @deprecated("This class is experimental. Use at your own risk.", category=ExperimentalClassWarning)
 @dataclass(frozen=True)
 class TypesMap:
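A short usage sketch of the `__post_init__` validation:

    # Valid: items only combines with field_type="array", and may nest.
    ComplexFieldType(field_type="array", items="string")
    ComplexFieldType(
        field_type="array",
        items=ComplexFieldType(field_type="array", items="string"),
    )

    # Invalid: raises
    # ValueError("'items' can only be used when 'field_type' is an array.")
    ComplexFieldType(field_type="string", items="string")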
@@ -52,7 +71,7 @@ class TypesMap:
     Represents a mapping between a current type and its corresponding target type.
     """
 
-    target_type: Union[List[str], str]
+    target_type: Union[List[str], str, ComplexFieldType]
     current_type: Union[List[str], str]
     condition: Optional[str]
@@ -135,8 +154,9 @@ class DynamicSchemaLoader(SchemaLoader):
         transformed_properties = self._transform(properties, {})
 
         return {
-            "$schema": "http://json-schema.org/draft-07/schema#",
+            "$schema": "https://json-schema.org/draft-07/schema#",
             "type": "object",
+            "additionalProperties": True,
             "properties": transformed_properties,
         }
 
@@ -188,18 +208,36 @@ class DynamicSchemaLoader(SchemaLoader):
             first_type = self._get_airbyte_type(mapped_field_type[0])
             second_type = self._get_airbyte_type(mapped_field_type[1])
             return {"oneOf": [first_type, second_type]}
+
         elif isinstance(mapped_field_type, str):
             return self._get_airbyte_type(mapped_field_type)
+
+        elif isinstance(mapped_field_type, ComplexFieldType):
+            return self._resolve_complex_type(mapped_field_type)
+
         else:
             raise ValueError(
                 f"Invalid data type. Available string or two items list of string. Got {mapped_field_type}."
             )
 
+    def _resolve_complex_type(self, complex_type: ComplexFieldType) -> Mapping[str, Any]:
+        if not complex_type.items:
+            return self._get_airbyte_type(complex_type.field_type)
+
+        field_type = self._get_airbyte_type(complex_type.field_type)
+        field_type["items"] = (
+            self._get_airbyte_type(complex_type.items)
+            if isinstance(complex_type.items, str)
+            else self._resolve_complex_type(complex_type.items)
+        )
+
+        return field_type
+
     def _replace_type_if_not_valid(
         self,
         field_type: Union[List[str], str],
         raw_schema: MutableMapping[str, Any],
-    ) -> Union[List[str], str]:
+    ) -> Union[List[str], str, ComplexFieldType]:
         """
         Replaces a field type if it matches a type mapping in `types_map`.
         """
@@ -216,7 +254,7 @@ class DynamicSchemaLoader(SchemaLoader):
         return field_type
 
     @staticmethod
-    def _get_airbyte_type(field_type: str) -> Mapping[str, Any]:
+    def _get_airbyte_type(field_type: str) -> MutableMapping[str, Any]:
         """
         Maps a field type to its corresponding Airbyte type definition.
         """