airbyte-cdk 6.60.15__py3-none-any.whl → 6.61.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. airbyte_cdk/connector_builder/connector_builder_handler.py +32 -36
  2. airbyte_cdk/connector_builder/main.py +3 -3
  3. airbyte_cdk/connector_builder/test_reader/helpers.py +24 -2
  4. airbyte_cdk/connector_builder/test_reader/message_grouper.py +1 -1
  5. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +15 -22
  6. airbyte_cdk/sources/concurrent_source/concurrent_source.py +30 -18
  7. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +73 -3
  8. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +9 -5
  9. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +3 -5
  10. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +72 -39
  11. airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +42 -4
  12. airbyte_cdk/sources/declarative/stream_slicers/stream_slicer_test_read_decorator.py +2 -2
  13. airbyte_cdk/sources/message/concurrent_repository.py +47 -0
  14. airbyte_cdk/sources/streams/concurrent/cursor.py +23 -7
  15. airbyte_cdk/sources/streams/concurrent/partition_reader.py +46 -5
  16. airbyte_cdk/sources/streams/concurrent/partitions/types.py +7 -1
  17. airbyte_cdk/sources/streams/http/http_client.py +4 -1
  18. airbyte_cdk/sources/utils/slice_logger.py +4 -0
  19. {airbyte_cdk-6.60.15.dist-info → airbyte_cdk-6.61.0.dist-info}/METADATA +1 -1
  20. {airbyte_cdk-6.60.15.dist-info → airbyte_cdk-6.61.0.dist-info}/RECORD +24 -23
  21. {airbyte_cdk-6.60.15.dist-info → airbyte_cdk-6.61.0.dist-info}/LICENSE.txt +0 -0
  22. {airbyte_cdk-6.60.15.dist-info → airbyte_cdk-6.61.0.dist-info}/LICENSE_SHORT +0 -0
  23. {airbyte_cdk-6.60.15.dist-info → airbyte_cdk-6.61.0.dist-info}/WHEEL +0 -0
  24. {airbyte_cdk-6.60.15.dist-info → airbyte_cdk-6.61.0.dist-info}/entry_points.txt +0 -0
airbyte_cdk/connector_builder/connector_builder_handler.py

@@ -3,8 +3,8 @@
 #


-from dataclasses import asdict, dataclass, field
-from typing import Any, ClassVar, Dict, List, Mapping
+from dataclasses import asdict
+from typing import Any, Dict, List, Mapping, Optional

 from airbyte_cdk.connector_builder.test_reader import TestReader
 from airbyte_cdk.models import (
@@ -15,45 +15,32 @@ from airbyte_cdk.models import (
     Type,
 )
 from airbyte_cdk.models import Type as MessageType
+from airbyte_cdk.sources.declarative.concurrent_declarative_source import (
+    ConcurrentDeclarativeSource,
+    TestLimits,
+)
 from airbyte_cdk.sources.declarative.declarative_source import DeclarativeSource
 from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource
-from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import (
-    ModelToComponentFactory,
-)
 from airbyte_cdk.utils.airbyte_secrets_utils import filter_secrets
 from airbyte_cdk.utils.datetime_helpers import ab_datetime_now
 from airbyte_cdk.utils.traced_exception import AirbyteTracedException

-DEFAULT_MAXIMUM_NUMBER_OF_PAGES_PER_SLICE = 5
-DEFAULT_MAXIMUM_NUMBER_OF_SLICES = 5
-DEFAULT_MAXIMUM_RECORDS = 100
-DEFAULT_MAXIMUM_STREAMS = 100
-
 MAX_PAGES_PER_SLICE_KEY = "max_pages_per_slice"
 MAX_SLICES_KEY = "max_slices"
 MAX_RECORDS_KEY = "max_records"
 MAX_STREAMS_KEY = "max_streams"


-@dataclass
-class TestLimits:
-    __test__: ClassVar[bool] = False  # Tell Pytest this is not a Pytest class, despite its name
-
-    max_records: int = field(default=DEFAULT_MAXIMUM_RECORDS)
-    max_pages_per_slice: int = field(default=DEFAULT_MAXIMUM_NUMBER_OF_PAGES_PER_SLICE)
-    max_slices: int = field(default=DEFAULT_MAXIMUM_NUMBER_OF_SLICES)
-    max_streams: int = field(default=DEFAULT_MAXIMUM_STREAMS)
-
-
 def get_limits(config: Mapping[str, Any]) -> TestLimits:
     command_config = config.get("__test_read_config", {})
-    max_pages_per_slice = (
-        command_config.get(MAX_PAGES_PER_SLICE_KEY) or DEFAULT_MAXIMUM_NUMBER_OF_PAGES_PER_SLICE
+    return TestLimits(
+        max_records=command_config.get(MAX_RECORDS_KEY, TestLimits.DEFAULT_MAX_RECORDS),
+        max_pages_per_slice=command_config.get(
+            MAX_PAGES_PER_SLICE_KEY, TestLimits.DEFAULT_MAX_PAGES_PER_SLICE
+        ),
+        max_slices=command_config.get(MAX_SLICES_KEY, TestLimits.DEFAULT_MAX_SLICES),
+        max_streams=command_config.get(MAX_STREAMS_KEY, TestLimits.DEFAULT_MAX_STREAMS),
     )
-    max_slices = command_config.get(MAX_SLICES_KEY) or DEFAULT_MAXIMUM_NUMBER_OF_SLICES
-    max_records = command_config.get(MAX_RECORDS_KEY) or DEFAULT_MAXIMUM_RECORDS
-    max_streams = command_config.get(MAX_STREAMS_KEY) or DEFAULT_MAXIMUM_STREAMS
-    return TestLimits(max_records, max_pages_per_slice, max_slices, max_streams)


 def should_migrate_manifest(config: Mapping[str, Any]) -> bool:
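Note on the new defaults: get_limits() now reads its fallbacks from TestLimits class attributes instead of module-level constants, and it uses dict.get(key, default) rather than `or`, so an explicitly provided falsy override is no longer silently replaced by the default. A minimal sketch of the resulting behavior, using an illustrative test-read config (the manifest value is a placeholder):

from airbyte_cdk.connector_builder.connector_builder_handler import get_limits

config = {
    "__injected_declarative_manifest": {},  # placeholder; get_limits() only reads __test_read_config
    "__test_read_config": {"max_records": 50, "max_slices": 2},
}

limits = get_limits(config)
assert limits.max_records == 50
assert limits.max_slices == 2
# Omitted keys fall back to the ClassVar defaults defined on TestLimits:
assert limits.max_pages_per_slice == limits.DEFAULT_MAX_PAGES_PER_SLICE  # 5
assert limits.max_streams == limits.DEFAULT_MAX_STREAMS  # 100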
@@ -75,21 +62,30 @@ def should_normalize_manifest(config: Mapping[str, Any]) -> bool:
     return config.get("__should_normalize", False)


-def create_source(config: Mapping[str, Any], limits: TestLimits) -> ManifestDeclarativeSource:
+def create_source(
+    config: Mapping[str, Any],
+    limits: TestLimits,
+    catalog: Optional[ConfiguredAirbyteCatalog],
+    state: Optional[List[AirbyteStateMessage]],
+) -> ConcurrentDeclarativeSource[Optional[List[AirbyteStateMessage]]]:
     manifest = config["__injected_declarative_manifest"]
-    return ManifestDeclarativeSource(
+
+    # We enforce a concurrency level of 1 so that the stream is processed on a single thread
+    # to retain ordering for the grouping of the builder message responses.
+    if "concurrency_level" in manifest:
+        manifest["concurrency_level"]["default_concurrency"] = 1
+    else:
+        manifest["concurrency_level"] = {"type": "ConcurrencyLevel", "default_concurrency": 1}
+
+    return ConcurrentDeclarativeSource(
+        catalog=catalog,
         config=config,
-        emit_connector_builder_messages=True,
+        state=state,
         source_config=manifest,
+        emit_connector_builder_messages=True,
         migrate_manifest=should_migrate_manifest(config),
         normalize_manifest=should_normalize_manifest(config),
-        component_factory=ModelToComponentFactory(
-            emit_connector_builder_messages=True,
-            limit_pages_fetched_per_slice=limits.max_pages_per_slice,
-            limit_slices_fetched=limits.max_slices,
-            disable_retries=True,
-            disable_cache=True,
-        ),
+        limits=limits,
     )

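The concurrency clamp added to create_source() can be reproduced in isolation: the builder forces default_concurrency to 1 so responses arrive in a deterministic order for message grouping. A toy manifest exercising both branches of that logic:

def clamp_concurrency(manifest: dict) -> dict:
    # Mirrors the logic added to create_source() above.
    if "concurrency_level" in manifest:
        manifest["concurrency_level"]["default_concurrency"] = 1
    else:
        manifest["concurrency_level"] = {"type": "ConcurrencyLevel", "default_concurrency": 1}
    return manifest

existing = clamp_concurrency({"concurrency_level": {"type": "ConcurrencyLevel", "default_concurrency": 10}})
missing = clamp_concurrency({})
assert existing["concurrency_level"]["default_concurrency"] == 1
assert missing["concurrency_level"]["default_concurrency"] == 1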
airbyte_cdk/connector_builder/main.py

@@ -91,12 +91,12 @@ def handle_connector_builder_request(
 def handle_request(args: List[str]) -> str:
     command, config, catalog, state = get_config_and_catalog_from_args(args)
     limits = get_limits(config)
-    source = create_source(config, limits)
-    return orjson.dumps(
+    source = create_source(config=config, limits=limits, catalog=catalog, state=state)
+    return orjson.dumps(  # type: ignore[no-any-return] # Serializer.dump() always returns AirbyteMessage
         AirbyteMessageSerializer.dump(
             handle_connector_builder_request(source, command, config, catalog, state, limits)
         )
-    ).decode()  # type: ignore[no-any-return] # Serializer.dump() always returns AirbyteMessage
+    ).decode()


 if __name__ == "__main__":
airbyte_cdk/connector_builder/test_reader/helpers.py

@@ -5,7 +5,7 @@
 import json
 from copy import deepcopy
 from json import JSONDecodeError
-from typing import Any, Dict, List, Mapping, Optional
+from typing import Any, Dict, List, Mapping, Optional, Union

 from airbyte_cdk.connector_builder.models import (
     AuxiliaryRequest,
@@ -17,6 +17,8 @@ from airbyte_cdk.connector_builder.models import (
 from airbyte_cdk.models import (
     AirbyteLogMessage,
     AirbyteMessage,
+    AirbyteStateBlob,
+    AirbyteStateMessage,
     OrchestratorType,
     TraceType,
 )
@@ -466,7 +468,7 @@ def handle_current_slice(
     return StreamReadSlices(
         pages=current_slice_pages,
         slice_descriptor=current_slice_descriptor,
-        state=[latest_state_message] if latest_state_message else [],
+        state=[convert_state_blob_to_mapping(latest_state_message)] if latest_state_message else [],
         auxiliary_requests=auxiliary_requests if auxiliary_requests else [],
     )

@@ -718,3 +720,23 @@ def get_auxiliary_request_type(stream: dict, http: dict) -> str:  # type: ignore
     Determines the type of the auxiliary request based on the stream and HTTP properties.
     """
     return "PARENT_STREAM" if stream.get("is_substream", False) else str(http.get("type", None))
+
+
+def convert_state_blob_to_mapping(
+    state_message: Union[AirbyteStateMessage, Dict[str, Any]],
+) -> Dict[str, Any]:
+    """
+    The AirbyteStreamState stores state as an AirbyteStateBlob which deceivingly is not
+    a dictionary, but rather a list of kwargs fields. This in turn causes it to not be
+    properly turned into a dictionary when translating this back into response output
+    by the connector_builder_handler using asdict()
+    """
+
+    if isinstance(state_message, AirbyteStateMessage) and state_message.stream:
+        state_value = state_message.stream.stream_state
+        if isinstance(state_value, AirbyteStateBlob):
+            state_value_mapping = {k: v for k, v in state_value.__dict__.items()}
+            state_message.stream.stream_state = state_value_mapping  # type: ignore # we intentionally set this as a Dict so that StreamReadSlices is translated properly in the resulting HTTP response
+        return state_message  # type: ignore # See above, but when this is an AirbyteStateMessage we must convert AirbyteStateBlob to a Dict
+    else:
+        return state_message  # type: ignore # This is guaranteed to be a Dict since we check isinstance AirbyteStateMessage above
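A hedged sketch of what convert_state_blob_to_mapping() does at the boundary: AirbyteStateBlob stores state as attributes rather than as a mapping, so asdict() on StreamReadSlices would not serialize it cleanly without this conversion. The stream name and state value below are illustrative, not taken from the diff:

from airbyte_cdk.connector_builder.test_reader.helpers import convert_state_blob_to_mapping
from airbyte_cdk.models import (
    AirbyteStateBlob,
    AirbyteStateMessage,
    AirbyteStateType,
    AirbyteStreamState,
    StreamDescriptor,
)

state_message = AirbyteStateMessage(
    type=AirbyteStateType.STREAM,
    stream=AirbyteStreamState(
        stream_descriptor=StreamDescriptor(name="orders"),  # illustrative stream name
        stream_state=AirbyteStateBlob(created_at="2024-01-01T00:00:00Z"),
    ),
)

converted = convert_state_blob_to_mapping(state_message)
# The blob's attributes are now a plain dict, ready for asdict()/JSON serialization.
assert converted.stream.stream_state == {"created_at": "2024-01-01T00:00:00Z"}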
airbyte_cdk/connector_builder/test_reader/message_grouper.py

@@ -95,7 +95,7 @@ def get_message_groups(
     latest_state_message: Optional[Dict[str, Any]] = None
     slice_auxiliary_requests: List[AuxiliaryRequest] = []

-    while records_count < limit and (message := next(messages, None)):
+    while message := next(messages, None):
         json_message = airbyte_message_to_json(message)

         if is_page_http_request_for_different_stream(json_message, stream_name):
airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py

@@ -2,6 +2,7 @@
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
 import logging
+import os
 from typing import Dict, Iterable, List, Optional, Set

 from airbyte_cdk.exception_handler import generate_failed_streams_error_message
@@ -95,11 +96,14 @@ class ConcurrentReadProcessor:
         """
         stream_name = partition.stream_name()
         self._streams_to_running_partitions[stream_name].add(partition)
+        cursor = self._stream_name_to_instance[stream_name].cursor
         if self._slice_logger.should_log_slice_message(self._logger):
             self._message_repository.emit_message(
                 self._slice_logger.create_slice_log_message(partition.to_slice())
             )
-        self._thread_pool_manager.submit(self._partition_reader.process_partition, partition)
+        self._thread_pool_manager.submit(
+            self._partition_reader.process_partition, partition, cursor
+        )

     def on_partition_complete_sentinel(
         self, sentinel: PartitionCompleteSentinel
@@ -112,26 +116,16 @@ class ConcurrentReadProcessor:
         """
         partition = sentinel.partition

-        try:
-            if sentinel.is_successful:
-                stream = self._stream_name_to_instance[partition.stream_name()]
-                stream.cursor.close_partition(partition)
-        except Exception as exception:
-            self._flag_exception(partition.stream_name(), exception)
-            yield AirbyteTracedException.from_exception(
-                exception, stream_descriptor=StreamDescriptor(name=partition.stream_name())
-            ).as_sanitized_airbyte_message()
-        finally:
-            partitions_running = self._streams_to_running_partitions[partition.stream_name()]
-            if partition in partitions_running:
-                partitions_running.remove(partition)
-                # If all partitions were generated and this was the last one, the stream is done
-                if (
-                    partition.stream_name() not in self._streams_currently_generating_partitions
-                    and len(partitions_running) == 0
-                ):
-                    yield from self._on_stream_is_done(partition.stream_name())
-            yield from self._message_repository.consume_queue()
+        partitions_running = self._streams_to_running_partitions[partition.stream_name()]
+        if partition in partitions_running:
+            partitions_running.remove(partition)
+            # If all partitions were generated and this was the last one, the stream is done
+            if (
+                partition.stream_name() not in self._streams_currently_generating_partitions
+                and len(partitions_running) == 0
+            ):
+                yield from self._on_stream_is_done(partition.stream_name())
+        yield from self._message_repository.consume_queue()

     def on_record(self, record: Record) -> Iterable[AirbyteMessage]:
         """
@@ -160,7 +154,6 @@ class ConcurrentReadProcessor:
                 stream.as_airbyte_stream(), AirbyteStreamStatus.RUNNING
             )
         self._record_counter[stream.name] += 1
-        stream.cursor.observe(record)
         yield message
         yield from self._message_repository.consume_queue()

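The net effect of the two removals above is that cursor bookkeeping (observe and close_partition) no longer happens on the main thread in ConcurrentReadProcessor; the cursor now travels with the partition to the worker that reads it. A self-contained sketch of that delegation pattern using stand-in stubs (none of these classes are the CDK's):

from concurrent.futures import ThreadPoolExecutor

class StubCursor:
    def observe(self, record):
        print("observe", record)

    def close_partition(self, partition):
        print("close", type(partition).__name__)

class StubPartition:
    def read(self):
        return iter([{"id": 1}, {"id": 2}])

def process_partition(partition, cursor):
    # Worker-side loop: observe every record, then close the partition on the
    # same thread that read it, mirroring the new process_partition(partition, cursor) call.
    for record in partition.read():
        cursor.observe(record)
    cursor.close_partition(partition)

with ThreadPoolExecutor(max_workers=1) as pool:
    pool.submit(process_partition, StubPartition(), StubCursor()).result()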
airbyte_cdk/sources/concurrent_source/concurrent_source.py

@@ -1,10 +1,11 @@
 #
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
+
 import concurrent
 import logging
 from queue import Queue
-from typing import Iterable, Iterator, List
+from typing import Iterable, Iterator, List, Optional

 from airbyte_cdk.models import AirbyteMessage
 from airbyte_cdk.sources.concurrent_source.concurrent_read_processor import ConcurrentReadProcessor
@@ -16,7 +17,7 @@ from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPool
 from airbyte_cdk.sources.message import InMemoryMessageRepository, MessageRepository
 from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream
 from airbyte_cdk.sources.streams.concurrent.partition_enqueuer import PartitionEnqueuer
-from airbyte_cdk.sources.streams.concurrent.partition_reader import PartitionReader
+from airbyte_cdk.sources.streams.concurrent.partition_reader import PartitionLogger, PartitionReader
 from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
 from airbyte_cdk.sources.streams.concurrent.partitions.types import (
     PartitionCompleteSentinel,
@@ -43,6 +44,7 @@ class ConcurrentSource:
         logger: logging.Logger,
         slice_logger: SliceLogger,
         message_repository: MessageRepository,
+        queue: Optional[Queue[QueueItem]] = None,
         timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS,
     ) -> "ConcurrentSource":
         is_single_threaded = initial_number_of_partitions_to_generate == 1 and num_workers == 1
@@ -59,12 +61,13 @@ class ConcurrentSource:
             logger,
         )
         return ConcurrentSource(
-            threadpool,
-            logger,
-            slice_logger,
-            message_repository,
-            initial_number_of_partitions_to_generate,
-            timeout_seconds,
+            threadpool=threadpool,
+            logger=logger,
+            slice_logger=slice_logger,
+            queue=queue,
+            message_repository=message_repository,
+            initial_number_partitions_to_generate=initial_number_of_partitions_to_generate,
+            timeout_seconds=timeout_seconds,
         )

     def __init__(
@@ -72,6 +75,7 @@ class ConcurrentSource:
         threadpool: ThreadPoolManager,
         logger: logging.Logger,
         slice_logger: SliceLogger = DebugSliceLogger(),
+        queue: Optional[Queue[QueueItem]] = None,
         message_repository: MessageRepository = InMemoryMessageRepository(),
         initial_number_partitions_to_generate: int = 1,
         timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS,
@@ -91,25 +95,28 @@ class ConcurrentSource:
         self._initial_number_partitions_to_generate = initial_number_partitions_to_generate
         self._timeout_seconds = timeout_seconds

+        # We set a maxsize to for the main thread to process record items when the queue size grows. This assumes that there are less
+        # threads generating partitions that than are max number of workers. If it weren't the case, we could have threads only generating
+        # partitions which would fill the queue. This number is arbitrarily set to 10_000 but will probably need to be changed given more
+        # information and might even need to be configurable depending on the source
+        self._queue = queue or Queue(maxsize=10_000)
+
     def read(
         self,
         streams: List[AbstractStream],
     ) -> Iterator[AirbyteMessage]:
         self._logger.info("Starting syncing")
-
-        # We set a maxsize to for the main thread to process record items when the queue size grows. This assumes that there are less
-        # threads generating partitions that than are max number of workers. If it weren't the case, we could have threads only generating
-        # partitions which would fill the queue. This number is arbitrarily set to 10_000 but will probably need to be changed given more
-        # information and might even need to be configurable depending on the source
-        queue: Queue[QueueItem] = Queue(maxsize=10_000)
         concurrent_stream_processor = ConcurrentReadProcessor(
             streams,
-            PartitionEnqueuer(queue, self._threadpool),
+            PartitionEnqueuer(self._queue, self._threadpool),
             self._threadpool,
             self._logger,
             self._slice_logger,
             self._message_repository,
-            PartitionReader(queue),
+            PartitionReader(
+                self._queue,
+                PartitionLogger(self._slice_logger, self._logger, self._message_repository),
+            ),
         )

         # Enqueue initial partition generation tasks
@@ -117,7 +124,7 @@ class ConcurrentSource:

         # Read from the queue until all partitions were generated and read
         yield from self._consume_from_queue(
-            queue,
+            self._queue,
             concurrent_stream_processor,
         )
         self._threadpool.check_for_errors_and_shutdown()
@@ -141,7 +148,10 @@ class ConcurrentSource:
                 airbyte_message_or_record_or_exception,
                 concurrent_stream_processor,
             )
-            if concurrent_stream_processor.is_done() and queue.empty():
+            # In the event that a partition raises an exception, anything remaining in
+            # the queue will be missed because is_done() can raise an exception and exit
+            # out of this loop before remaining items are consumed
+            if queue.empty() and concurrent_stream_processor.is_done():
                 # all partitions were generated and processed. we're done here
                 break

@@ -161,5 +171,7 @@ class ConcurrentSource:
             yield from concurrent_stream_processor.on_partition_complete_sentinel(queue_item)
         elif isinstance(queue_item, Record):
            yield from concurrent_stream_processor.on_record(queue_item)
+        elif isinstance(queue_item, AirbyteMessage):
+            yield queue_item
         else:
             raise ValueError(f"Unknown queue item type: {type(queue_item)}")
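With the new queue parameter, a caller can build the bounded queue up front and share it between a producer (such as the new ConcurrentMessageRepository) and the read loop that drains it; previously the queue was created inside read(). A hedged sketch, assuming the rest of ConcurrentSource.create()'s signature (num_workers, initial_number_of_partitions_to_generate) is unchanged:

import logging
from queue import Queue

from airbyte_cdk.sources.concurrent_source.concurrent_source import ConcurrentSource
from airbyte_cdk.sources.message import InMemoryMessageRepository
from airbyte_cdk.sources.utils.slice_logger import DebugSliceLogger

# Bounded so producers block instead of growing memory without limit; 10_000
# matches the default maxsize used above.
queue = Queue(maxsize=10_000)

source = ConcurrentSource.create(
    num_workers=2,
    initial_number_of_partitions_to_generate=1,
    logger=logging.getLogger("airbyte"),
    slice_logger=DebugSliceLogger(),
    message_repository=InMemoryMessageRepository(),
    queue=queue,
)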
airbyte_cdk/sources/declarative/concurrent_declarative_source.py

@@ -3,7 +3,22 @@
 #

 import logging
-from typing import Any, Generic, Iterator, List, Mapping, MutableMapping, Optional, Tuple, Union
+from dataclasses import dataclass, field
+from queue import Queue
+from typing import (
+    Any,
+    ClassVar,
+    Generic,
+    Iterator,
+    List,
+    Mapping,
+    MutableMapping,
+    Optional,
+    Tuple,
+    Union,
+)
+
+from airbyte_protocol_dataclasses.models import Level

 from airbyte_cdk.models import (
     AirbyteCatalog,
@@ -43,6 +58,8 @@ from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_genera
     StreamSlicerPartitionGenerator,
 )
 from airbyte_cdk.sources.declarative.types import ConnectionDefinition
+from airbyte_cdk.sources.message.concurrent_repository import ConcurrentMessageRepository
+from airbyte_cdk.sources.message.repository import InMemoryMessageRepository, MessageRepository
 from airbyte_cdk.sources.source import TState
 from airbyte_cdk.sources.streams import Stream
 from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream
@@ -50,6 +67,22 @@ from airbyte_cdk.sources.streams.concurrent.abstract_stream_facade import Abstra
 from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, FinalStateCursor
 from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream
 from airbyte_cdk.sources.streams.concurrent.helpers import get_primary_key_from_stream
+from airbyte_cdk.sources.streams.concurrent.partitions.types import QueueItem
+
+
+@dataclass
+class TestLimits:
+    __test__: ClassVar[bool] = False  # Tell Pytest this is not a Pytest class, despite its name
+
+    DEFAULT_MAX_PAGES_PER_SLICE: ClassVar[int] = 5
+    DEFAULT_MAX_SLICES: ClassVar[int] = 5
+    DEFAULT_MAX_RECORDS: ClassVar[int] = 100
+    DEFAULT_MAX_STREAMS: ClassVar[int] = 100
+
+    max_records: int = field(default=DEFAULT_MAX_RECORDS)
+    max_pages_per_slice: int = field(default=DEFAULT_MAX_PAGES_PER_SLICE)
+    max_slices: int = field(default=DEFAULT_MAX_SLICES)
+    max_streams: int = field(default=DEFAULT_MAX_STREAMS)


 class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
@@ -65,7 +98,9 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
         source_config: ConnectionDefinition,
         debug: bool = False,
         emit_connector_builder_messages: bool = False,
-        component_factory: Optional[ModelToComponentFactory] = None,
+        migrate_manifest: bool = False,
+        normalize_manifest: bool = False,
+        limits: Optional[TestLimits] = None,
         config_path: Optional[str] = None,
         **kwargs: Any,
     ) -> None:
@@ -73,21 +108,39 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
         # no longer needs to store the original incoming state. But maybe there's an edge case?
         self._connector_state_manager = ConnectorStateManager(state=state)  # type: ignore # state is always in the form of List[AirbyteStateMessage]. The ConnectorStateManager should use generics, but this can be done later

+        # We set a maxsize to for the main thread to process record items when the queue size grows. This assumes that there are less
+        # threads generating partitions that than are max number of workers. If it weren't the case, we could have threads only generating
+        # partitions which would fill the queue. This number is arbitrarily set to 10_000 but will probably need to be changed given more
+        # information and might even need to be configurable depending on the source
+        queue: Queue[QueueItem] = Queue(maxsize=10_000)
+        message_repository = InMemoryMessageRepository(
+            Level.DEBUG if emit_connector_builder_messages else Level.INFO
+        )
+
         # To reduce the complexity of the concurrent framework, we are not enabling RFR with synthetic
         # cursors. We do this by no longer automatically instantiating RFR cursors when converting
         # the declarative models into runtime components. Concurrent sources will continue to checkpoint
         # incremental streams running in full refresh.
-        component_factory = component_factory or ModelToComponentFactory(
+        component_factory = ModelToComponentFactory(
             emit_connector_builder_messages=emit_connector_builder_messages,
+            message_repository=ConcurrentMessageRepository(queue, message_repository),
             connector_state_manager=self._connector_state_manager,
             max_concurrent_async_job_count=source_config.get("max_concurrent_async_job_count"),
+            limit_pages_fetched_per_slice=limits.max_pages_per_slice if limits else None,
+            limit_slices_fetched=limits.max_slices if limits else None,
+            disable_retries=True if limits else False,
+            disable_cache=True if limits else False,
         )

+        self._limits = limits
+
         super().__init__(
             source_config=source_config,
             config=config,
             debug=debug,
             emit_connector_builder_messages=emit_connector_builder_messages,
+            migrate_manifest=migrate_manifest,
+            normalize_manifest=normalize_manifest,
             component_factory=component_factory,
             config_path=config_path,
         )
@@ -117,6 +170,7 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
             initial_number_of_partitions_to_generate=initial_number_of_partitions_to_generate,
             logger=self.logger,
             slice_logger=self._slice_logger,
+            queue=queue,
             message_repository=self.message_repository,
         )

@@ -280,8 +334,14 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                        schema_loader=declarative_stream._schema_loader,  # type: ignore # We are accessing the private property but the public one is optional and we will remove this code soonish
                        retriever=retriever,
                        message_repository=self.message_repository,
+                        max_records_limit=self._limits.max_records
+                        if self._limits
+                        else None,
                    ),
                    stream_slicer=declarative_stream.retriever.stream_slicer,
+                    slice_limit=self._limits.max_slices
+                    if self._limits
+                    else None,  # technically not needed because create_declarative_stream() -> create_simple_retriever() will apply the decorator. But for consistency and depending how we build create_default_stream, this may be needed later
                )
            else:
                if (
@@ -311,8 +371,12 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                        schema_loader=declarative_stream._schema_loader,  # type: ignore # We are accessing the private property but the public one is optional and we will remove this code soonish
                        retriever=retriever,
                        message_repository=self.message_repository,
+                        max_records_limit=self._limits.max_records
+                        if self._limits
+                        else None,
                    ),
                    stream_slicer=cursor,
+                    slice_limit=self._limits.max_slices if self._limits else None,
                )

                concurrent_streams.append(
@@ -341,8 +405,12 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                    schema_loader=declarative_stream._schema_loader,  # type: ignore # We are accessing the private property but the public one is optional and we will remove this code soonish
                    retriever=declarative_stream.retriever,
                    message_repository=self.message_repository,
+                    max_records_limit=self._limits.max_records if self._limits else None,
                ),
                declarative_stream.retriever.stream_slicer,
+                slice_limit=self._limits.max_slices
+                if self._limits
+                else None,  # technically not needed because create_declarative_stream() -> create_simple_retriever() will apply the decorator. But for consistency and depending how we build create_default_stream, this may be needed later
            )

            final_state_cursor = FinalStateCursor(
@@ -401,8 +469,10 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                        schema_loader=declarative_stream._schema_loader,  # type: ignore # We are accessing the private property but the public one is optional and we will remove this code soonish
                        retriever=retriever,
                        message_repository=self.message_repository,
+                        max_records_limit=self._limits.max_records if self._limits else None,
                    ),
                    perpartition_cursor,
+                    slice_limit=self._limits.max_slices if self._limits else None,
                )

                concurrent_streams.append(
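TestLimits now lives in this module (it was previously defined in connector_builder_handler.py) and carries its defaults as ClassVars, so get_limits() and direct callers share one source of truth. Passing an instance through the new limits parameter is what switches the factory into builder mode (capped pages, slices, and records, with retries and caching disabled). A small sketch of the defaults:

from airbyte_cdk.sources.declarative.concurrent_declarative_source import TestLimits

defaults = TestLimits()
assert defaults.max_records == TestLimits.DEFAULT_MAX_RECORDS == 100
assert defaults.max_pages_per_slice == TestLimits.DEFAULT_MAX_PAGES_PER_SLICE == 5
assert defaults.max_slices == TestLimits.DEFAULT_MAX_SLICES == 5
assert defaults.max_streams == TestLimits.DEFAULT_MAX_STREAMS == 100

# A builder test read would pass something like this via ConcurrentDeclarativeSource(limits=...):
builder_limits = TestLimits(max_records=10, max_slices=1)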
airbyte_cdk/sources/declarative/declarative_component_schema.yaml

@@ -4142,11 +4142,9 @@ definitions:
             - stream_slice
             - stream_template_config
           examples:
-            - ["data"]
-            - ["data", "records"]
-            - ["data", 1, "name"]
-            - ["data", "{{ components_values.name }}"]
-            - ["data", "*", "record"]
+            - ["name"]
+            - ["retriever", "requester", "url"]
+            - ["retriever", "requester", "{{ components_values.field }}"]
             - ["*", "**", "name"]
         value:
           title: Value
@@ -4777,6 +4775,12 @@ interpolation:
     - title: stream_slice
       description: This variable is deprecated. Use stream_interval or stream_partition instead.
      type: object
+    - title: components_values
+      description: The record object produced by the components resolver for which a stream will be generated.
+      type: object
+      examples:
+        - name: "accounts"
+          id: 1234
   macros:
     - title: now_utc
       description: Returns the current date and time in the UTC timezone.
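For context, the updated field_path examples target fields of the stream template itself (dynamic stream generation) rather than paths inside record payloads. A hedged illustration written as the Python-dict equivalent of a manifest component mapping; the surrounding dynamic-stream and components resolver configuration is assumed and not shown, and the "url" field of components_values is illustrative:

component_mapping = {
    "type": "ComponentMappingDefinition",
    # Replace the stream template's request URL with a value taken from the
    # record produced by the components resolver (components_values).
    "field_path": ["retriever", "requester", "url"],
    "value": "{{ components_values.url }}",
}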
airbyte_cdk/sources/declarative/models/declarative_component_schema.py

@@ -1463,11 +1463,9 @@ class ComponentMappingDefinition(BaseModel):
         ...,
         description="A list of potentially nested fields indicating the full path where value will be added or updated.",
         examples=[
-            ["data"],
-            ["data", "records"],
-            ["data", 1, "name"],
-            ["data", "{{ components_values.name }}"],
-            ["data", "*", "record"],
+            ["name"],
+            ["retriever", "requester", "url"],
+            ["retriever", "requester", "{{ components_values.field }}"],
             ["*", "**", "name"],
         ],
         title="Field Path",