airbyte-cdk 6.60.15__py3-none-any.whl → 6.60.16.post40.dev17219503797__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. airbyte_cdk/connector_builder/connector_builder_handler.py +32 -36
  2. airbyte_cdk/connector_builder/main.py +3 -3
  3. airbyte_cdk/connector_builder/test_reader/helpers.py +24 -2
  4. airbyte_cdk/connector_builder/test_reader/message_grouper.py +1 -1
  5. airbyte_cdk/manifest_server/Dockerfile +45 -0
  6. airbyte_cdk/manifest_server/README.md +142 -0
  7. airbyte_cdk/manifest_server/__init__.py +3 -0
  8. airbyte_cdk/manifest_server/api_models/__init__.py +41 -0
  9. airbyte_cdk/manifest_server/api_models/capabilities.py +7 -0
  10. airbyte_cdk/manifest_server/api_models/dicts.py +17 -0
  11. airbyte_cdk/manifest_server/api_models/manifest.py +73 -0
  12. airbyte_cdk/manifest_server/api_models/stream.py +76 -0
  13. airbyte_cdk/manifest_server/app.py +17 -0
  14. airbyte_cdk/manifest_server/auth.py +43 -0
  15. airbyte_cdk/manifest_server/cli/__init__.py +5 -0
  16. airbyte_cdk/manifest_server/cli/_common.py +28 -0
  17. airbyte_cdk/manifest_server/cli/_info.py +30 -0
  18. airbyte_cdk/manifest_server/cli/_openapi.py +43 -0
  19. airbyte_cdk/manifest_server/cli/_start.py +38 -0
  20. airbyte_cdk/manifest_server/cli/run.py +59 -0
  21. airbyte_cdk/manifest_server/command_processor/__init__.py +0 -0
  22. airbyte_cdk/manifest_server/command_processor/processor.py +151 -0
  23. airbyte_cdk/manifest_server/command_processor/utils.py +76 -0
  24. airbyte_cdk/manifest_server/main.py +24 -0
  25. airbyte_cdk/manifest_server/openapi.yaml +641 -0
  26. airbyte_cdk/manifest_server/routers/__init__.py +0 -0
  27. airbyte_cdk/manifest_server/routers/capabilities.py +25 -0
  28. airbyte_cdk/manifest_server/routers/health.py +13 -0
  29. airbyte_cdk/manifest_server/routers/manifest.py +137 -0
  30. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +15 -22
  31. airbyte_cdk/sources/concurrent_source/concurrent_source.py +30 -18
  32. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +73 -3
  33. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +66 -39
  34. airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +42 -4
  35. airbyte_cdk/sources/declarative/stream_slicers/stream_slicer_test_read_decorator.py +2 -2
  36. airbyte_cdk/sources/message/concurrent_repository.py +47 -0
  37. airbyte_cdk/sources/streams/concurrent/cursor.py +23 -7
  38. airbyte_cdk/sources/streams/concurrent/partition_reader.py +46 -5
  39. airbyte_cdk/sources/streams/concurrent/partitions/types.py +7 -1
  40. airbyte_cdk/sources/streams/http/http_client.py +4 -1
  41. airbyte_cdk/sources/utils/slice_logger.py +4 -0
  42. {airbyte_cdk-6.60.15.dist-info → airbyte_cdk-6.60.16.post40.dev17219503797.dist-info}/METADATA +4 -1
  43. {airbyte_cdk-6.60.15.dist-info → airbyte_cdk-6.60.16.post40.dev17219503797.dist-info}/RECORD +47 -21
  44. {airbyte_cdk-6.60.15.dist-info → airbyte_cdk-6.60.16.post40.dev17219503797.dist-info}/entry_points.txt +1 -0
  45. {airbyte_cdk-6.60.15.dist-info → airbyte_cdk-6.60.16.post40.dev17219503797.dist-info}/LICENSE.txt +0 -0
  46. {airbyte_cdk-6.60.15.dist-info → airbyte_cdk-6.60.16.post40.dev17219503797.dist-info}/LICENSE_SHORT +0 -0
  47. {airbyte_cdk-6.60.15.dist-info → airbyte_cdk-6.60.16.post40.dev17219503797.dist-info}/WHEEL +0 -0
airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py
@@ -94,16 +94,13 @@ from airbyte_cdk.sources.declarative.extractors.record_filter import (
     ClientSideIncrementalRecordFilterDecorator,
 )
 from airbyte_cdk.sources.declarative.incremental import (
-    ChildPartitionResumableFullRefreshCursor,
     ConcurrentCursorFactory,
     ConcurrentPerPartitionCursor,
     CursorFactory,
     DatetimeBasedCursor,
     DeclarativeCursor,
     GlobalSubstreamCursor,
-    PerPartitionCursor,
     PerPartitionWithGlobalCursor,
-    ResumableFullRefreshCursor,
 )
 from airbyte_cdk.sources.declarative.interpolation import InterpolatedString
 from airbyte_cdk.sources.declarative.interpolation.interpolated_mapping import InterpolatedMapping
@@ -446,10 +443,6 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     ZipfileDecoder as ZipfileDecoderModel,
 )
-from airbyte_cdk.sources.declarative.parsers.custom_code_compiler import (
-    COMPONENTS_MODULE_NAME,
-    SDM_COMPONENTS_MODULE_NAME,
-)
 from airbyte_cdk.sources.declarative.partition_routers import (
     CartesianProductStreamSlicer,
     GroupingPartitionRouter,
@@ -508,7 +501,7 @@ from airbyte_cdk.sources.declarative.requesters.request_options import (
     RequestOptionsProvider,
 )
 from airbyte_cdk.sources.declarative.requesters.request_path import RequestPath
-from airbyte_cdk.sources.declarative.requesters.requester import HttpMethod, Requester
+from airbyte_cdk.sources.declarative.requesters.requester import HttpMethod
 from airbyte_cdk.sources.declarative.resolvers import (
     ComponentMappingDefinition,
     ConfigComponentsResolver,
@@ -617,6 +610,9 @@ from airbyte_cdk.sources.streams.concurrent.cursor import (
 )
 from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream
 from airbyte_cdk.sources.streams.concurrent.helpers import get_primary_key_from_stream
+from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import (
+    StreamSlicer as ConcurrentStreamSlicer,
+)
 from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import (
     CustomFormatConcurrentStreamStateConverter,
     DateTimeStreamStateConverter,
@@ -635,6 +631,10 @@ SCHEMA_TRANSFORMER_TYPE_MAPPING = {
     SchemaNormalizationModel.Default: TransformConfig.DefaultSchemaNormalization,
 }
 
+# Ideally this should use the value defined in ConcurrentDeclarativeSource, but
+# this would be a circular import
+MAX_SLICES = 5
+
 
 class ModelToComponentFactory:
     EPOCH_DATETIME_FORMAT = "%s"
@@ -1933,29 +1933,7 @@ class ModelToComponentFactory:
     def create_declarative_stream(
         self, model: DeclarativeStreamModel, config: Config, is_parent: bool = False, **kwargs: Any
     ) -> Union[DeclarativeStream, AbstractStream]:
-        # When constructing a declarative stream, we assemble the incremental_sync component and retriever's partition_router field
-        # components if they exist into a single CartesianProductStreamSlicer. This is then passed back as an argument when constructing the
-        # Retriever. This is done in the declarative stream not the retriever to support custom retrievers. The custom create methods in
-        # the factory only support passing arguments to the component constructors, whereas this performs a merge of all slicers into one.
-        combined_slicers = self._merge_stream_slicers(model=model, config=config)
-
         primary_key = model.primary_key.__root__ if model.primary_key else None
-        stop_condition_on_cursor = (
-            model.incremental_sync
-            and hasattr(model.incremental_sync, "is_data_feed")
-            and model.incremental_sync.is_data_feed
-        )
-        client_side_filtering_enabled = (
-            model.incremental_sync
-            and hasattr(model.incremental_sync, "is_client_side_incremental")
-            and model.incremental_sync.is_client_side_incremental
-        )
-        concurrent_cursor = None
-        if stop_condition_on_cursor or client_side_filtering_enabled:
-            stream_slicer = self._build_stream_slicer_from_partition_router(
-                model.retriever, config, stream_name=model.name
-            )
-            concurrent_cursor = self._build_concurrent_cursor(model, stream_slicer, config)
 
         if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel):
             cursor_model = model.incremental_sync
@@ -2023,6 +2001,15 @@ class ModelToComponentFactory:
             model=model.file_uploader, config=config
         )
 
+        # When constructing a declarative stream, we assemble the incremental_sync component and retriever's partition_router field
+        # components if they exist into a single CartesianProductStreamSlicer. This is then passed back as an argument when constructing the
+        # Retriever. This is done in the declarative stream not the retriever to support custom retrievers. The custom create methods in
+        # the factory only support passing arguments to the component constructors, whereas this performs a merge of all slicers into one.
+        combined_slicers = self._merge_stream_slicers(model=model, config=config)
+        partition_router = self._build_stream_slicer_from_partition_router(
+            model.retriever, config, stream_name=model.name
+        )
+        concurrent_cursor = self._build_concurrent_cursor(model, partition_router, config)
         retriever = self._create_component_from_model(
             model=model.retriever,
             config=config,
@@ -2030,9 +2017,11 @@ class ModelToComponentFactory:
            primary_key=primary_key,
            stream_slicer=combined_slicers,
            request_options_provider=request_options_provider,
-           stop_condition_cursor=concurrent_cursor,
+           stop_condition_cursor=concurrent_cursor
+           if self._is_stop_condition_on_cursor(model)
+           else None,
            client_side_incremental_sync={"cursor": concurrent_cursor}
-           if client_side_filtering_enabled
+           if self._is_client_side_filtering_enabled(model)
            else None,
            transformations=transformations,
            file_uploader=file_uploader,
@@ -2066,18 +2055,41 @@ class ModelToComponentFactory:
             schema_loader = DefaultSchemaLoader(config=config, parameters=options)
 
         if (
-            isinstance(combined_slicers, PartitionRouter)
+            (
+                isinstance(combined_slicers, PartitionRouter)
+                or isinstance(concurrent_cursor, ConcurrentCursor)
+            )
             and not self._emit_connector_builder_messages
             and not is_parent
         ):
             # We are starting to migrate streams to instantiate directly the DefaultStream instead of instantiating the
             # DeclarativeStream and assembling the DefaultStream from that. The plan is the following:
             # * Streams without partition router nor cursors and streams with only partition router. This is the `isinstance(combined_slicers, PartitionRouter)` condition as the first kind with have a SinglePartitionRouter
-            # * Streams without partition router but with cursor
+            # * Streams without partition router but with cursor. This is the `isinstance(concurrent_cursor, ConcurrentCursor)` condition
            # * Streams with both partition router and cursor
            # We specifically exclude parent streams here because SubstreamPartitionRouter has not been updated yet
            # We specifically exclude Connector Builder stuff for now as Brian is working on this anyway
+
            stream_name = model.name or ""
+            stream_slicer: ConcurrentStreamSlicer = (
+                concurrent_cursor if concurrent_cursor else SinglePartitionRouter(parameters={})
+            )
+            cursor: Cursor = FinalStateCursor(stream_name, None, self._message_repository)
+            if isinstance(retriever, AsyncRetriever):
+                # The AsyncRetriever only ever worked with a cursor from the concurrent package. Hence, the method
+                # `_build_incremental_cursor` which we would usually think would return only declarative stuff has a
+                # special clause and return a concurrent cursor. This stream slicer is passed to AsyncRetriever when
+                # built because the async retriever has a specific partition router which relies on this stream slicer.
+                # We can't re-use `concurrent_cursor` because it is a different instance than the one passed in
+                # AsyncJobPartitionRouter.
+                stream_slicer = retriever.stream_slicer
+            if isinstance(combined_slicers, Cursor):
+                cursor = combined_slicers
+            elif isinstance(combined_slicers, PartitionRouter):
+                stream_slicer = combined_slicers
+            elif concurrent_cursor:
+                cursor = concurrent_cursor
+
            partition_generator = StreamSlicerPartitionGenerator(
                DeclarativePartitionFactory(
                    stream_name,
@@ -2085,18 +2097,19 @@ class ModelToComponentFactory:
                    retriever,
                    self._message_repository,
                ),
-                stream_slicer=combined_slicers,
+                stream_slicer=stream_slicer,
            )
            return DefaultStream(
                partition_generator=partition_generator,
                name=stream_name,
                json_schema=schema_loader.get_json_schema,
                primary_key=get_primary_key_from_stream(primary_key),
-                cursor_field=None,
-                # FIXME we should have the cursor field has part of the interface of cursor
+                cursor_field=cursor.cursor_field.cursor_field_key
+                if hasattr(cursor, "cursor_field")
+                else "",  # FIXME we should have the cursor field has part of the interface of cursor,
                logger=logging.getLogger(f"airbyte.{stream_name}"),
-                # FIXME this is a breaking change compared to the old implementation,
-                cursor=FinalStateCursor(stream_name, None, self._message_repository),
+                # FIXME this is a breaking change compared to the old implementation which used the source name instead
+                cursor=cursor,
                supports_file_transfer=hasattr(model, "file_uploader")
                and bool(model.file_uploader),
            )
@@ -2120,6 +2133,20 @@ class ModelToComponentFactory:
            parameters=model.parameters or {},
        )
 
+    def _is_stop_condition_on_cursor(self, model: DeclarativeStreamModel) -> bool:
+        return bool(
+            model.incremental_sync
+            and hasattr(model.incremental_sync, "is_data_feed")
+            and model.incremental_sync.is_data_feed
+        )
+
+    def _is_client_side_filtering_enabled(self, model: DeclarativeStreamModel) -> bool:
+        return bool(
+            model.incremental_sync
+            and hasattr(model.incremental_sync, "is_client_side_incremental")
+            and model.incremental_sync.is_client_side_incremental
+        )
+
     def _build_stream_slicer_from_partition_router(
         self,
         model: Union[
airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py
@@ -1,9 +1,12 @@
-# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
+# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
 
-from typing import Any, Iterable, Mapping, Optional
+from typing import Any, Iterable, Mapping, Optional, cast
 
 from airbyte_cdk.sources.declarative.retrievers import Retriever
 from airbyte_cdk.sources.declarative.schema import SchemaLoader
+from airbyte_cdk.sources.declarative.stream_slicers.stream_slicer_test_read_decorator import (
+    StreamSlicerTestReadDecorator,
+)
 from airbyte_cdk.sources.message import MessageRepository
 from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
 from airbyte_cdk.sources.streams.concurrent.partitions.partition_generator import PartitionGenerator
@@ -11,6 +14,11 @@ from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import StreamSlicer
 from airbyte_cdk.sources.types import Record, StreamSlice
 from airbyte_cdk.utils.slice_hasher import SliceHasher
 
+# For Connector Builder test read operations, we track the total number of records
+# read for the stream at the global level so that we can stop reading early if we
+# exceed the record limit
+total_record_counter = 0
+
 
 class SchemaLoaderCachingDecorator(SchemaLoader):
     def __init__(self, schema_loader: SchemaLoader):
@@ -31,6 +39,7 @@ class DeclarativePartitionFactory:
         schema_loader: SchemaLoader,
         retriever: Retriever,
         message_repository: MessageRepository,
+        max_records_limit: Optional[int] = None,
     ) -> None:
         """
         The DeclarativePartitionFactory takes a retriever_factory and not a retriever directly. The reason is that our components are not
@@ -41,6 +50,7 @@ class DeclarativePartitionFactory:
         self._schema_loader = SchemaLoaderCachingDecorator(schema_loader)
         self._retriever = retriever
         self._message_repository = message_repository
+        self._max_records_limit = max_records_limit
 
     def create(self, stream_slice: StreamSlice) -> Partition:
         return DeclarativePartition(
@@ -48,6 +58,7 @@ class DeclarativePartitionFactory:
             schema_loader=self._schema_loader,
             retriever=self._retriever,
             message_repository=self._message_repository,
+            max_records_limit=self._max_records_limit,
             stream_slice=stream_slice,
         )
 
@@ -59,19 +70,29 @@ class DeclarativePartition(Partition):
         schema_loader: SchemaLoader,
         retriever: Retriever,
         message_repository: MessageRepository,
+        max_records_limit: Optional[int],
         stream_slice: StreamSlice,
     ):
         self._stream_name = stream_name
         self._schema_loader = schema_loader
         self._retriever = retriever
         self._message_repository = message_repository
+        self._max_records_limit = max_records_limit
         self._stream_slice = stream_slice
         self._hash = SliceHasher.hash(self._stream_name, self._stream_slice)
 
     def read(self) -> Iterable[Record]:
+        if self._max_records_limit is not None:
+            global total_record_counter
+            if total_record_counter >= self._max_records_limit:
+                return
         for stream_data in self._retriever.read_records(
             self._schema_loader.get_json_schema(), self._stream_slice
         ):
+            if self._max_records_limit is not None:
+                if total_record_counter >= self._max_records_limit:
+                    break
+
             if isinstance(stream_data, Mapping):
                 record = (
                     stream_data
@@ -86,6 +107,9 @@ class DeclarativePartition(Partition):
             else:
                 self._message_repository.emit_message(stream_data)
 
+            if self._max_records_limit is not None:
+                total_record_counter += 1
+
     def to_slice(self) -> Optional[Mapping[str, Any]]:
         return self._stream_slice
 
@@ -98,10 +122,24 @@ class StreamSlicerPartitionGenerator(PartitionGenerator):
 
 class StreamSlicerPartitionGenerator(PartitionGenerator):
     def __init__(
-        self, partition_factory: DeclarativePartitionFactory, stream_slicer: StreamSlicer
+        self,
+        partition_factory: DeclarativePartitionFactory,
+        stream_slicer: StreamSlicer,
+        slice_limit: Optional[int] = None,
+        max_records_limit: Optional[int] = None,
     ) -> None:
         self._partition_factory = partition_factory
-        self._stream_slicer = stream_slicer
+
+        if slice_limit:
+            self._stream_slicer = cast(
+                StreamSlicer,
+                StreamSlicerTestReadDecorator(
+                    wrapped_slicer=stream_slicer,
+                    maximum_number_of_slices=slice_limit,
+                ),
+            )
+        else:
+            self._stream_slicer = stream_slicer
 
     def generate(self) -> Iterable[Partition]:
         for stream_slice in self._stream_slicer.stream_slices():
airbyte_cdk/sources/declarative/stream_slicers/stream_slicer_test_read_decorator.py
@@ -4,10 +4,10 @@
 
 from dataclasses import dataclass
 from itertools import islice
-from typing import Any, Iterable, Mapping, Optional, Union
+from typing import Any, Iterable
 
 from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import StreamSlicer
-from airbyte_cdk.sources.types import StreamSlice, StreamState
+from airbyte_cdk.sources.types import StreamSlice
 
 
 @dataclass
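For reference, the decorator that StreamSlicerPartitionGenerator now wraps around its slicer is small. Below is a minimal sketch consistent with this file's remaining imports (dataclass, islice) and the call site above; the field names are taken from the wrapped_slicer/maximum_number_of_slices keyword arguments, so treat it as an approximation rather than the verbatim class body:

from dataclasses import dataclass
from itertools import islice
from typing import Iterable

from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import StreamSlicer
from airbyte_cdk.sources.types import StreamSlice


@dataclass
class StreamSlicerTestReadDecorator(StreamSlicer):
    # Stops iterating once the configured number of slices has been yielded,
    # so a Connector Builder test read never fans out beyond the limit.
    wrapped_slicer: StreamSlicer
    maximum_number_of_slices: int = 5

    def stream_slices(self) -> Iterable[StreamSlice]:
        # islice is lazy: the wrapped slicer is only advanced as far as the limit
        return islice(self.wrapped_slicer.stream_slices(), self.maximum_number_of_slices)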
airbyte_cdk/sources/message/concurrent_repository.py (new file)
@@ -0,0 +1,47 @@
+# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
+import logging
+import os
+from queue import Queue
+from typing import Callable, Iterable
+
+from airbyte_cdk.models import AirbyteMessage, Level
+from airbyte_cdk.models import Type as MessageType
+from airbyte_cdk.sources.message.repository import LogMessage, MessageRepository
+from airbyte_cdk.sources.streams.concurrent.partitions.types import QueueItem
+
+logger = logging.getLogger("airbyte")
+
+
+class ConcurrentMessageRepository(MessageRepository):
+    """
+    Message repository that immediately loads messages onto the queue processed on the
+    main thread. This ensures that messages are processed in the correct order they are
+    received. The InMemoryMessageRepository implementation does not have guaranteed
+    ordering since whether to process the main thread vs. partitions is non-deterministic
+    and there can be a lag between reading the main-thread and consuming messages on the
+    MessageRepository.
+
+    This is particularly important for the connector builder which relies on grouping
+    of messages to organize request/response, pages, and partitions.
+    """
+
+    def __init__(self, queue: Queue[QueueItem], message_repository: MessageRepository):
+        self._queue = queue
+        self._decorated_message_repository = message_repository
+
+    def emit_message(self, message: AirbyteMessage) -> None:
+        self._decorated_message_repository.emit_message(message)
+        for message in self._decorated_message_repository.consume_queue():
+            self._queue.put(message)
+
+    def log_message(self, level: Level, message_provider: Callable[[], LogMessage]) -> None:
+        self._decorated_message_repository.log_message(level, message_provider)
+        for message in self._decorated_message_repository.consume_queue():
+            self._queue.put(message)
+
+    def consume_queue(self) -> Iterable[AirbyteMessage]:
+        """
+        This method shouldn't need to be called because as part of emit_message() we are already
+        loading messages onto the queue processed on the main thread.
+        """
+        yield from []
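A minimal usage sketch of the new repository, assuming the stdlib Queue and the existing InMemoryMessageRepository: every emitted message is drained straight onto the main-thread queue, which is why consume_queue() is intentionally left empty.

from queue import Queue

from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level
from airbyte_cdk.models import Type as MessageType
from airbyte_cdk.sources.message import InMemoryMessageRepository
from airbyte_cdk.sources.message.concurrent_repository import ConcurrentMessageRepository
from airbyte_cdk.sources.streams.concurrent.partitions.types import QueueItem

queue: Queue[QueueItem] = Queue()
repo = ConcurrentMessageRepository(queue, InMemoryMessageRepository())

repo.emit_message(
    AirbyteMessage(type=MessageType.LOG, log=AirbyteLogMessage(level=Level.INFO, message="hi"))
)
# The message was forwarded to the main-thread queue immediately...
assert queue.get_nowait().log.message == "hi"
# ...so there is nothing left to pull from the repository itself.
assert list(repo.consume_queue()) == []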
airbyte_cdk/sources/streams/concurrent/cursor.py
@@ -4,6 +4,7 @@
 
 import functools
 import logging
+import threading
 from abc import ABC, abstractmethod
 from typing import (
     Any,
@@ -174,6 +175,12 @@ class ConcurrentCursor(Cursor):
         self._should_be_synced_logger_triggered = False
         self._clamping_strategy = clamping_strategy
 
+        # A lock is required when closing a partition because updating the cursor's concurrent_state is
+        # not thread safe. When multiple partitions are being closed by the cursor at the same time, it is
+        # possible for one partition to update concurrent_state after a second partition has already read
+        # the previous state. This can lead to the second partition overwriting the previous one's state.
+        self._lock = threading.Lock()
+
     @property
     def state(self) -> MutableMapping[str, Any]:
         return self._connector_state_converter.convert_to_state_message(
@@ -222,6 +229,14 @@ class ConcurrentCursor(Cursor):
         )
 
     def observe(self, record: Record) -> None:
+        # Because observe writes to the most_recent_cursor_value_per_partition mapping,
+        # it is not thread-safe. However, this shouldn't lead to concurrency issues because
+        # observe() is only invoked by PartitionReader.process_partition(). Since the map is
+        # broken down according to partition, concurrent threads processing only read/write
+        # from different keys which avoids any conflicts.
+        #
+        # If we were to add thread safety, we should implement a lock per-partition
+        # which is instantiated during stream_slices()
         most_recent_cursor_value = self._most_recent_cursor_value_per_partition.get(
             record.associated_slice
         )
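If the per-partition locking floated in that comment were ever implemented, one hypothetical shape (names invented for illustration; this is not CDK code) is a lazily created lock per slice key, so only records sharing a slice serialize their writes:

import threading
from collections import defaultdict
from typing import Any, DefaultDict, Dict

# Hypothetical sketch: one lock per partition key; threads working on
# different partitions never contend with each other.
_locks: DefaultDict[Any, threading.Lock] = defaultdict(threading.Lock)

def observe_thread_safe(most_recent_per_partition: Dict[Any, Any], slice_key: Any, value: Any) -> None:
    with _locks[slice_key]:
        current = most_recent_per_partition.get(slice_key)
        if current is None or value > current:
            most_recent_per_partition[slice_key] = value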
@@ -237,13 +252,14 @@ class ConcurrentCursor(Cursor):
         return self._connector_state_converter.parse_value(self._cursor_field.extract_value(record))
 
     def close_partition(self, partition: Partition) -> None:
-        slice_count_before = len(self._concurrent_state.get("slices", []))
-        self._add_slice_to_state(partition)
-        if slice_count_before < len(
-            self._concurrent_state["slices"]
-        ):  # only emit if at least one slice has been processed
-            self._merge_partitions()
-            self._emit_state_message()
+        with self._lock:
+            slice_count_before = len(self._concurrent_state.get("slices", []))
+            self._add_slice_to_state(partition)
+            if slice_count_before < len(
+                self._concurrent_state["slices"]
+            ):  # only emit if at least one slice has been processed
+                self._merge_partitions()
+                self._emit_state_message()
         self._has_closed_at_least_one_slice = True
 
     def _add_slice_to_state(self, partition: Partition) -> None:
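The race this lock closes is the classic lost update on a read-modify-write. A self-contained illustration (not CDK code) of why the whole add/merge/emit sequence must be atomic:

import threading

state = {"slices": []}
lock = threading.Lock()

def close_partition(slice_id: int) -> None:
    # Without the lock, two threads can read the same snapshot of state["slices"]
    # and each write back a list missing the other's slice (a lost update).
    with lock:
        slices = list(state["slices"])
        slices.append({"id": slice_id})
        state["slices"] = slices

threads = [threading.Thread(target=close_partition, args=(i,)) for i in range(8)]
for t in threads:
    t.start()
for t in threads:
    t.join()
assert len(state["slices"]) == 8  # guaranteed only because the read-modify-write is locked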
airbyte_cdk/sources/streams/concurrent/partition_reader.py
@@ -1,14 +1,45 @@
-#
-# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
-#
+# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
+
+import logging
 from queue import Queue
+from typing import Optional
 
 from airbyte_cdk.sources.concurrent_source.stream_thread_exception import StreamThreadException
+from airbyte_cdk.sources.message.repository import MessageRepository
+from airbyte_cdk.sources.streams.concurrent.cursor import Cursor
 from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
 from airbyte_cdk.sources.streams.concurrent.partitions.types import (
     PartitionCompleteSentinel,
     QueueItem,
 )
+from airbyte_cdk.sources.utils.slice_logger import SliceLogger
+
+
+# Since moving all the connector builder workflow to the concurrent CDK which required correct ordering
+# of grouping log messages onto the main write thread using the ConcurrentMessageRepository, this
+# separate flow and class that was used to log slices onto this partition's message_repository
+# should just be replaced by emitting messages directly onto the repository instead of an intermediary.
+class PartitionLogger:
+    """
+    Helper class that provides a mechanism for passing a log message onto the current
+    partitions message repository
+    """
+
+    def __init__(
+        self,
+        slice_logger: SliceLogger,
+        logger: logging.Logger,
+        message_repository: MessageRepository,
+    ):
+        self._slice_logger = slice_logger
+        self._logger = logger
+        self._message_repository = message_repository
+
+    def log(self, partition: Partition) -> None:
+        if self._slice_logger.should_log_slice_message(self._logger):
+            self._message_repository.emit_message(
+                self._slice_logger.create_slice_log_message(partition.to_slice())
+            )
 
 
 class PartitionReader:
@@ -18,13 +49,18 @@ class PartitionReader:
 
     _IS_SUCCESSFUL = True
 
-    def __init__(self, queue: Queue[QueueItem]) -> None:
+    def __init__(
+        self,
+        queue: Queue[QueueItem],
+        partition_logger: Optional[PartitionLogger] = None,
+    ) -> None:
         """
         :param queue: The queue to put the records in.
         """
         self._queue = queue
+        self._partition_logger = partition_logger
 
-    def process_partition(self, partition: Partition) -> None:
+    def process_partition(self, partition: Partition, cursor: Cursor) -> None:
         """
         Process a partition and put the records in the output queue.
         When all the partitions are added to the queue, a sentinel is added to the queue to indicate that all the partitions have been generated.
@@ -37,8 +73,13 @@ class PartitionReader:
         :return: None
         """
         try:
+            if self._partition_logger:
+                self._partition_logger.log(partition)
+
             for record in partition.read():
                 self._queue.put(record)
+                cursor.observe(record)
+            cursor.close_partition(partition)
             self._queue.put(PartitionCompleteSentinel(partition, self._IS_SUCCESSFUL))
         except Exception as e:
             self._queue.put(StreamThreadException(e, partition.stream_name()))
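Because process_partition() now drives the cursor itself (observe per record, close_partition once the partition is exhausted), a worker thread needs nothing beyond the reader. A runnable sketch with stand-in partition/cursor objects; the fakes are purely illustrative, only the PartitionReader API shown above is real:

from queue import Queue

from airbyte_cdk.sources.streams.concurrent.partition_reader import PartitionReader
from airbyte_cdk.sources.streams.concurrent.partitions.types import (
    PartitionCompleteSentinel,
    QueueItem,
)

class FakePartition:
    # Just enough surface for process_partition(): read() plus stream_name() for error paths.
    def stream_name(self) -> str:
        return "demo"

    def read(self):
        yield {"id": 1}
        yield {"id": 2}

class FakeCursor:
    def __init__(self) -> None:
        self.observed = []
        self.closed = False

    def observe(self, record) -> None:
        self.observed.append(record)

    def close_partition(self, partition) -> None:
        self.closed = True

queue: Queue[QueueItem] = Queue()
cursor = FakeCursor()
PartitionReader(queue).process_partition(FakePartition(), cursor)  # type: ignore[arg-type]
assert len(cursor.observed) == 2 and cursor.closed
items = [queue.get_nowait() for _ in range(3)]
assert isinstance(items[-1], PartitionCompleteSentinel)  # sentinel follows the two records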
airbyte_cdk/sources/streams/concurrent/partitions/types.py
@@ -4,6 +4,7 @@
 
 from typing import Any, Union
 
+from airbyte_cdk.models import AirbyteMessage
 from airbyte_cdk.sources.concurrent_source.partition_generation_completed_sentinel import (
     PartitionGenerationCompletedSentinel,
 )
@@ -34,5 +35,10 @@ class PartitionCompleteSentinel:
 Typedef representing the items that can be added to the ThreadBasedConcurrentStream
 """
 QueueItem = Union[
-    Record, Partition, PartitionCompleteSentinel, PartitionGenerationCompletedSentinel, Exception
+    Record,
+    Partition,
+    PartitionCompleteSentinel,
+    PartitionGenerationCompletedSentinel,
+    Exception,
+    AirbyteMessage,
 ]
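With AirbyteMessage added to the union, the main-thread consumer has one more case to dispatch on. A hedged sketch of that loop's shape (simplified; the real handling lives in the ConcurrentReadProcessor/ConcurrentSource changes listed above):

from queue import Queue

from airbyte_cdk.models import AirbyteMessage
from airbyte_cdk.sources.concurrent_source.partition_generation_completed_sentinel import (
    PartitionGenerationCompletedSentinel,
)
from airbyte_cdk.sources.streams.concurrent.partitions.types import (
    PartitionCompleteSentinel,
    QueueItem,
)

def drain(queue: Queue[QueueItem]) -> None:
    while True:
        item = queue.get()
        if isinstance(item, AirbyteMessage):
            print(item.json(exclude_unset=True))  # repository messages, already in order
        elif isinstance(item, Exception):
            raise item  # surfaced from a partition thread
        elif isinstance(item, PartitionCompleteSentinel):
            pass  # a partition finished; release its slot
        elif isinstance(item, PartitionGenerationCompletedSentinel):
            break  # no more partitions will be produced
        else:
            pass  # Record / Partition handling elided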
airbyte_cdk/sources/streams/http/http_client.py
@@ -153,7 +153,10 @@ class HttpClient:
            # * `If the application running SQLite crashes, the data will be safe, but the database [might become corrupted](https://www.sqlite.org/howtocorrupt.html#cfgerr) if the operating system crashes or the computer loses power before that data has been written to the disk surface.` in [this description](https://www.sqlite.org/pragma.html#pragma_synchronous).
            backend = requests_cache.SQLiteCache(sqlite_path, fast_save=True, wal=True)
            return CachedLimiterSession(
-                sqlite_path, backend=backend, api_budget=self._api_budget, match_headers=True
+                cache_name=sqlite_path,
+                backend=backend,
+                api_budget=self._api_budget,
+                match_headers=True,
            )
        else:
            return LimiterSession(api_budget=self._api_budget)
airbyte_cdk/sources/utils/slice_logger.py
@@ -11,6 +11,10 @@ from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level
 from airbyte_cdk.models import Type as MessageType
 
 
+# Once everything runs on the concurrent CDK and we've cleaned up the legacy flows, we should try to remove
+# this class and write messages directly to the message_repository instead of through the logger because for
+# cases like the connector builder where ordering of messages is important, using the logger can cause
+# messages to be grouped out of order. Alas work for a different day.
 class SliceLogger(ABC):
     """
     SliceLogger is an interface that allows us to log slices of data in a uniform way.
{airbyte_cdk-6.60.15.dist-info → airbyte_cdk-6.60.16.post40.dev17219503797.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: airbyte-cdk
-Version: 6.60.15
+Version: 6.60.16.post40.dev17219503797
 Summary: A framework for writing Airbyte Connectors.
 Home-page: https://airbyte.com
 License: MIT
@@ -19,6 +19,7 @@ Classifier: Topic :: Scientific/Engineering
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Provides-Extra: dev
 Provides-Extra: file-based
+Provides-Extra: manifest-server
 Provides-Extra: sql
 Provides-Extra: vector-db-based
 Requires-Dist: Jinja2 (>=3.1.2,<3.2.0)
@@ -35,6 +36,7 @@ Requires-Dist: cryptography (>=44.0.0,<45.0.0)
 Requires-Dist: dateparser (>=1.2.2,<2.0.0)
 Requires-Dist: dpath (>=2.1.6,<3.0.0)
 Requires-Dist: dunamai (>=1.22.0,<2.0.0)
+Requires-Dist: fastapi (>=0.116.1) ; extra == "manifest-server"
 Requires-Dist: fastavro (>=1.8.0,<1.9.0) ; extra == "file-based"
 Requires-Dist: genson (==1.3.0)
 Requires-Dist: google-cloud-secret-manager (>=2.17.0,<3.0.0)
@@ -77,6 +79,7 @@ Requires-Dist: typing-extensions
 Requires-Dist: unidecode (>=1.3.8,<2.0.0)
 Requires-Dist: unstructured.pytesseract (>=0.3.12) ; extra == "file-based"
 Requires-Dist: unstructured[docx,pptx] (==0.10.27) ; extra == "file-based"
+Requires-Dist: uvicorn (>=0.35.0) ; extra == "manifest-server"
 Requires-Dist: wcmatch (==10.0)
 Requires-Dist: whenever (>=0.6.16,<0.7.0)
 Requires-Dist: xmltodict (>=0.13,<0.15)
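Together with the new entry point (entry_points.txt +1) and the fastapi/uvicorn markers above, the manifest server ships as an optional extra, installable with `pip install "airbyte-cdk[manifest-server]"`.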