airbyte-cdk 6.60.16__py3-none-any.whl → 6.60.16.post40.dev17219503797__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. airbyte_cdk/connector_builder/connector_builder_handler.py +32 -36
  2. airbyte_cdk/connector_builder/main.py +3 -3
  3. airbyte_cdk/connector_builder/test_reader/helpers.py +24 -2
  4. airbyte_cdk/connector_builder/test_reader/message_grouper.py +1 -1
  5. airbyte_cdk/manifest_server/Dockerfile +45 -0
  6. airbyte_cdk/manifest_server/README.md +142 -0
  7. airbyte_cdk/manifest_server/__init__.py +3 -0
  8. airbyte_cdk/manifest_server/api_models/__init__.py +41 -0
  9. airbyte_cdk/manifest_server/api_models/capabilities.py +7 -0
  10. airbyte_cdk/manifest_server/api_models/dicts.py +17 -0
  11. airbyte_cdk/manifest_server/api_models/manifest.py +73 -0
  12. airbyte_cdk/manifest_server/api_models/stream.py +76 -0
  13. airbyte_cdk/manifest_server/app.py +17 -0
  14. airbyte_cdk/manifest_server/auth.py +43 -0
  15. airbyte_cdk/manifest_server/cli/__init__.py +5 -0
  16. airbyte_cdk/manifest_server/cli/_common.py +28 -0
  17. airbyte_cdk/manifest_server/cli/_info.py +30 -0
  18. airbyte_cdk/manifest_server/cli/_openapi.py +43 -0
  19. airbyte_cdk/manifest_server/cli/_start.py +38 -0
  20. airbyte_cdk/manifest_server/cli/run.py +59 -0
  21. airbyte_cdk/manifest_server/command_processor/__init__.py +0 -0
  22. airbyte_cdk/manifest_server/command_processor/processor.py +151 -0
  23. airbyte_cdk/manifest_server/command_processor/utils.py +76 -0
  24. airbyte_cdk/manifest_server/main.py +24 -0
  25. airbyte_cdk/manifest_server/openapi.yaml +641 -0
  26. airbyte_cdk/manifest_server/routers/__init__.py +0 -0
  27. airbyte_cdk/manifest_server/routers/capabilities.py +25 -0
  28. airbyte_cdk/manifest_server/routers/health.py +13 -0
  29. airbyte_cdk/manifest_server/routers/manifest.py +137 -0
  30. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +15 -22
  31. airbyte_cdk/sources/concurrent_source/concurrent_source.py +30 -18
  32. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +73 -3
  33. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +4 -0
  34. airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +42 -4
  35. airbyte_cdk/sources/declarative/stream_slicers/stream_slicer_test_read_decorator.py +2 -2
  36. airbyte_cdk/sources/message/concurrent_repository.py +47 -0
  37. airbyte_cdk/sources/streams/concurrent/cursor.py +23 -7
  38. airbyte_cdk/sources/streams/concurrent/partition_reader.py +46 -5
  39. airbyte_cdk/sources/streams/concurrent/partitions/types.py +7 -1
  40. airbyte_cdk/sources/streams/http/http_client.py +4 -1
  41. airbyte_cdk/sources/utils/slice_logger.py +4 -0
  42. {airbyte_cdk-6.60.16.dist-info → airbyte_cdk-6.60.16.post40.dev17219503797.dist-info}/METADATA +4 -1
  43. {airbyte_cdk-6.60.16.dist-info → airbyte_cdk-6.60.16.post40.dev17219503797.dist-info}/RECORD +47 -21
  44. {airbyte_cdk-6.60.16.dist-info → airbyte_cdk-6.60.16.post40.dev17219503797.dist-info}/entry_points.txt +1 -0
  45. {airbyte_cdk-6.60.16.dist-info → airbyte_cdk-6.60.16.post40.dev17219503797.dist-info}/LICENSE.txt +0 -0
  46. {airbyte_cdk-6.60.16.dist-info → airbyte_cdk-6.60.16.post40.dev17219503797.dist-info}/LICENSE_SHORT +0 -0
  47. {airbyte_cdk-6.60.16.dist-info → airbyte_cdk-6.60.16.post40.dev17219503797.dist-info}/WHEEL +0 -0
@@ -0,0 +1,137 @@
1
+ import hashlib
2
+ from dataclasses import asdict
3
+ from typing import Any, Dict, List, Mapping, Optional
4
+
5
+ import jsonschema
6
+ from fastapi import APIRouter, Depends, HTTPException
7
+
8
+ from airbyte_cdk.manifest_server.api_models.manifest import (
9
+ CheckRequest,
10
+ CheckResponse,
11
+ DiscoverRequest,
12
+ DiscoverResponse,
13
+ )
14
+ from airbyte_cdk.models import AirbyteStateMessageSerializer
15
+ from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource
16
+ from airbyte_cdk.sources.declarative.parsers.custom_code_compiler import (
17
+ INJECTED_COMPONENTS_PY,
18
+ INJECTED_COMPONENTS_PY_CHECKSUMS,
19
+ )
20
+
21
+ from ..api_models import (
22
+ FullResolveRequest,
23
+ Manifest,
24
+ ManifestResponse,
25
+ ResolveRequest,
26
+ StreamRead,
27
+ StreamTestReadRequest,
28
+ )
29
+ from ..auth import verify_jwt_token
30
+ from ..command_processor.processor import ManifestCommandProcessor
31
+ from ..command_processor.utils import build_catalog, build_source
32
+
33
+
34
def safe_build_source(
    manifest_dict: Mapping[str, Any],
    config_dict: Mapping[str, Any],
    page_limit: Optional[int] = None,
    slice_limit: Optional[int] = None,
) -> ManifestDeclarativeSource:
    """Build a declarative source, reporting manifest validation failures as HTTP 400.

    Thin wrapper around ``build_source`` that forwards all four arguments unchanged.

    Args:
        manifest_dict: The manifest to build the source from.
        config_dict: The connector configuration passed to ``build_source``.
        page_limit: Optional page limit forwarded to ``build_source``.
        slice_limit: Optional slice limit forwarded to ``build_source``.

    Returns:
        The source returned by ``build_source``.

    Raises:
        HTTPException: With status 400 when ``build_source`` raises
            ``jsonschema.exceptions.ValidationError``. Any other exception
            propagates untouched.
    """
    try:
        return build_source(manifest_dict, config_dict, page_limit, slice_limit)
    except jsonschema.exceptions.ValidationError as e:
        # Chain explicitly so the original schema error is recorded as the cause
        # of the HTTP error instead of an incidental "during handling" context.
        raise HTTPException(status_code=400, detail=f"Invalid manifest: {e.message}") from e
45
+
46
+
47
# Router for the manifest endpoints: every route registered on it is served
# under the "/manifest" prefix, tagged "manifest", and declares
# verify_jwt_token as a dependency.
router = APIRouter(
    dependencies=[Depends(verify_jwt_token)],
    tags=["manifest"],
    prefix="/manifest",
)
52
+
53
+
54
@router.post("/test_read", operation_id="testRead")
def test_read(request: StreamTestReadRequest) -> StreamRead:
    """
    Test reading from a specific stream in the manifest.

    Builds a source from the request's manifest and config, then runs a test
    read of ``request.stream_name`` bounded by the request's record, page and
    slice limits.

    Args:
        request: The test-read request (manifest, config, stream name, state,
            limits and optional custom components code).

    Returns:
        The test-read result converted into a ``StreamRead`` model.

    Raises:
        HTTPException: With status 400 when the manifest fails validation
            (raised by ``safe_build_source``).
    """
    config_dict = request.config.model_dump()

    # Inject the custom components (and their checksum) BEFORE the source is built.
    # Previously this happened after safe_build_source(), so the config seen during
    # source construction did not contain the custom code yet.
    # NOTE(review): presumably the source resolves custom components from the config
    # at construction time — confirm against build_source.
    if request.custom_components_code:
        config_dict[INJECTED_COMPONENTS_PY] = request.custom_components_code
        config_dict[INJECTED_COMPONENTS_PY_CHECKSUMS] = {
            "md5": hashlib.md5(request.custom_components_code.encode()).hexdigest()
        }

    source = safe_build_source(
        request.manifest.model_dump(), config_dict, request.page_limit, request.slice_limit
    )
    catalog = build_catalog(request.stream_name)
    # Distinct loop variable: the original comprehension reused the name `state`
    # for both the element and the resulting list.
    state = [AirbyteStateMessageSerializer.load(state_blob) for state_blob in request.state]

    runner = ManifestCommandProcessor(source)
    cdk_result = runner.test_read(
        config_dict,
        catalog,
        state,
        request.record_limit,
        request.page_limit,
        request.slice_limit,
    )
    # The processor result is a dataclass; convert it to a dict for model validation.
    return StreamRead.model_validate(asdict(cdk_result))
83
+
84
+
85
@router.post("/check", operation_id="check")
def check(request: CheckRequest) -> CheckResponse:
    """Run a connection check for the request's config against its manifest.

    Returns:
        A ``CheckResponse`` carrying the success flag and message produced by
        the command processor's ``check_connection``.

    Raises:
        HTTPException: With status 400 when the manifest fails validation
            (raised by ``safe_build_source``).
    """
    manifest_dict = request.manifest.model_dump()
    built_source = safe_build_source(manifest_dict, request.config.model_dump())
    processor = ManifestCommandProcessor(built_source)
    # The config is dumped again so the check receives its own fresh dict.
    outcome = processor.check_connection(request.config.model_dump())
    succeeded, detail = outcome
    return CheckResponse(success=succeeded, message=detail)
92
+
93
+
94
@router.post("/discover", operation_id="discover")
def discover(request: DiscoverRequest) -> DiscoverResponse:
    """Discover the catalog of streams exposed by the request's manifest.

    Returns:
        A ``DiscoverResponse`` wrapping the catalog returned by the command
        processor.

    Raises:
        HTTPException: With status 400 when the manifest fails validation
            (raised by ``safe_build_source``), or with status 422 when the
            processor returns no catalog.
    """
    manifest_dict = request.manifest.model_dump()
    built_source = safe_build_source(manifest_dict, request.config.model_dump())
    processor = ManifestCommandProcessor(built_source)
    discovered = processor.discover(request.config.model_dump())
    if discovered is not None:
        return DiscoverResponse(catalog=discovered)
    # No catalog came back: surface it as an unprocessable request.
    raise HTTPException(status_code=422, detail="Connector did not return a discovered catalog")
103
+
104
+
105
@router.post("/resolve", operation_id="resolve")
def resolve(request: ResolveRequest) -> ManifestResponse:
    """Resolve a manifest to its final configuration.

    The source is built with an empty config; the response wraps the source's
    ``resolved_manifest``.

    Raises:
        HTTPException: With status 400 when the manifest fails validation
            (raised by ``safe_build_source``).
    """
    empty_config: Dict[str, Any] = {}
    built_source = safe_build_source(request.manifest.model_dump(), empty_config)
    resolved = built_source.resolved_manifest
    return ManifestResponse(manifest=Manifest(**resolved))
110
+
111
+
112
@router.post("/full_resolve", operation_id="fullResolve")
def full_resolve(request: FullResolveRequest) -> ManifestResponse:
    """
    Fully resolve a manifest including dynamic streams.

    Streams already present in the resolved manifest are tagged with
    ``dynamic_stream_name = None``. Streams from ``source.dynamic_streams`` are
    grouped by their ``dynamic_stream_name``; at most ``request.stream_limit``
    streams are kept per group, and the kept streams are appended to the
    manifest's ``streams`` list.

    Raises:
        HTTPException: With status 400 when the manifest fails validation
            (raised by ``safe_build_source``).
    """
    source = safe_build_source(request.manifest.model_dump(), request.config.model_dump())

    # Shallow copy: the top-level mapping is new, but the "streams" list and the
    # stream dicts inside it are the same objects as in source.resolved_manifest,
    # so the updates below modify them in place.
    resolved = dict(source.resolved_manifest)
    static_streams = resolved.get("streams", [])
    for static_stream in static_streams:
        static_stream["dynamic_stream_name"] = None

    # Group generated streams by the dynamic stream that produced them, keeping
    # only the first `stream_limit` entries of each group.
    grouped: Dict[str, List[Dict[str, Any]]] = {}
    for dynamic_stream in source.dynamic_streams:
        bucket = grouped.setdefault(dynamic_stream["dynamic_stream_name"], [])
        if len(bucket) < request.stream_limit:
            bucket.append(dynamic_stream)

    for bucket in grouped.values():
        static_streams.extend(bucket)

    resolved["streams"] = static_streams
    return ManifestResponse(manifest=Manifest(**resolved))
@@ -2,6 +2,7 @@
2
2
  # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
3
  #
4
4
  import logging
5
+ import os
5
6
  from typing import Dict, Iterable, List, Optional, Set
6
7
 
7
8
  from airbyte_cdk.exception_handler import generate_failed_streams_error_message
@@ -95,11 +96,14 @@ class ConcurrentReadProcessor:
95
96
  """
96
97
  stream_name = partition.stream_name()
97
98
  self._streams_to_running_partitions[stream_name].add(partition)
99
+ cursor = self._stream_name_to_instance[stream_name].cursor
98
100
  if self._slice_logger.should_log_slice_message(self._logger):
99
101
  self._message_repository.emit_message(
100
102
  self._slice_logger.create_slice_log_message(partition.to_slice())
101
103
  )
102
- self._thread_pool_manager.submit(self._partition_reader.process_partition, partition)
104
+ self._thread_pool_manager.submit(
105
+ self._partition_reader.process_partition, partition, cursor
106
+ )
103
107
 
104
108
  def on_partition_complete_sentinel(
105
109
  self, sentinel: PartitionCompleteSentinel
@@ -112,26 +116,16 @@ class ConcurrentReadProcessor:
112
116
  """
113
117
  partition = sentinel.partition
114
118
 
115
- try:
116
- if sentinel.is_successful:
117
- stream = self._stream_name_to_instance[partition.stream_name()]
118
- stream.cursor.close_partition(partition)
119
- except Exception as exception:
120
- self._flag_exception(partition.stream_name(), exception)
121
- yield AirbyteTracedException.from_exception(
122
- exception, stream_descriptor=StreamDescriptor(name=partition.stream_name())
123
- ).as_sanitized_airbyte_message()
124
- finally:
125
- partitions_running = self._streams_to_running_partitions[partition.stream_name()]
126
- if partition in partitions_running:
127
- partitions_running.remove(partition)
128
- # If all partitions were generated and this was the last one, the stream is done
129
- if (
130
- partition.stream_name() not in self._streams_currently_generating_partitions
131
- and len(partitions_running) == 0
132
- ):
133
- yield from self._on_stream_is_done(partition.stream_name())
134
- yield from self._message_repository.consume_queue()
119
+ partitions_running = self._streams_to_running_partitions[partition.stream_name()]
120
+ if partition in partitions_running:
121
+ partitions_running.remove(partition)
122
+ # If all partitions were generated and this was the last one, the stream is done
123
+ if (
124
+ partition.stream_name() not in self._streams_currently_generating_partitions
125
+ and len(partitions_running) == 0
126
+ ):
127
+ yield from self._on_stream_is_done(partition.stream_name())
128
+ yield from self._message_repository.consume_queue()
135
129
 
136
130
  def on_record(self, record: Record) -> Iterable[AirbyteMessage]:
137
131
  """
@@ -160,7 +154,6 @@ class ConcurrentReadProcessor:
160
154
  stream.as_airbyte_stream(), AirbyteStreamStatus.RUNNING
161
155
  )
162
156
  self._record_counter[stream.name] += 1
163
- stream.cursor.observe(record)
164
157
  yield message
165
158
  yield from self._message_repository.consume_queue()
166
159
 
@@ -1,10 +1,11 @@
1
1
  #
2
2
  # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
3
  #
4
+
4
5
  import concurrent
5
6
  import logging
6
7
  from queue import Queue
7
- from typing import Iterable, Iterator, List
8
+ from typing import Iterable, Iterator, List, Optional
8
9
 
9
10
  from airbyte_cdk.models import AirbyteMessage
10
11
  from airbyte_cdk.sources.concurrent_source.concurrent_read_processor import ConcurrentReadProcessor
@@ -16,7 +17,7 @@ from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPool
16
17
  from airbyte_cdk.sources.message import InMemoryMessageRepository, MessageRepository
17
18
  from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream
18
19
  from airbyte_cdk.sources.streams.concurrent.partition_enqueuer import PartitionEnqueuer
19
- from airbyte_cdk.sources.streams.concurrent.partition_reader import PartitionReader
20
+ from airbyte_cdk.sources.streams.concurrent.partition_reader import PartitionLogger, PartitionReader
20
21
  from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
21
22
  from airbyte_cdk.sources.streams.concurrent.partitions.types import (
22
23
  PartitionCompleteSentinel,
@@ -43,6 +44,7 @@ class ConcurrentSource:
43
44
  logger: logging.Logger,
44
45
  slice_logger: SliceLogger,
45
46
  message_repository: MessageRepository,
47
+ queue: Optional[Queue[QueueItem]] = None,
46
48
  timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS,
47
49
  ) -> "ConcurrentSource":
48
50
  is_single_threaded = initial_number_of_partitions_to_generate == 1 and num_workers == 1
@@ -59,12 +61,13 @@ class ConcurrentSource:
59
61
  logger,
60
62
  )
61
63
  return ConcurrentSource(
62
- threadpool,
63
- logger,
64
- slice_logger,
65
- message_repository,
66
- initial_number_of_partitions_to_generate,
67
- timeout_seconds,
64
+ threadpool=threadpool,
65
+ logger=logger,
66
+ slice_logger=slice_logger,
67
+ queue=queue,
68
+ message_repository=message_repository,
69
+ initial_number_partitions_to_generate=initial_number_of_partitions_to_generate,
70
+ timeout_seconds=timeout_seconds,
68
71
  )
69
72
 
70
73
  def __init__(
@@ -72,6 +75,7 @@ class ConcurrentSource:
72
75
  threadpool: ThreadPoolManager,
73
76
  logger: logging.Logger,
74
77
  slice_logger: SliceLogger = DebugSliceLogger(),
78
+ queue: Optional[Queue[QueueItem]] = None,
75
79
  message_repository: MessageRepository = InMemoryMessageRepository(),
76
80
  initial_number_partitions_to_generate: int = 1,
77
81
  timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS,
@@ -91,25 +95,28 @@ class ConcurrentSource:
91
95
  self._initial_number_partitions_to_generate = initial_number_partitions_to_generate
92
96
  self._timeout_seconds = timeout_seconds
93
97
 
98
+ # We set a maxsize to force the main thread to process record items when the queue size grows. This assumes that there are fewer
99
+ # threads generating partitions than the max number of workers. If that weren't the case, we could have threads only generating
100
+ # partitions which would fill the queue. This number is arbitrarily set to 10_000 but will probably need to be changed given more
101
+ # information and might even need to be configurable depending on the source
102
+ self._queue = queue or Queue(maxsize=10_000)
103
+
94
104
  def read(
95
105
  self,
96
106
  streams: List[AbstractStream],
97
107
  ) -> Iterator[AirbyteMessage]:
98
108
  self._logger.info("Starting syncing")
99
-
100
- # We set a maxsize to for the main thread to process record items when the queue size grows. This assumes that there are less
101
- # threads generating partitions that than are max number of workers. If it weren't the case, we could have threads only generating
102
- # partitions which would fill the queue. This number is arbitrarily set to 10_000 but will probably need to be changed given more
103
- # information and might even need to be configurable depending on the source
104
- queue: Queue[QueueItem] = Queue(maxsize=10_000)
105
109
  concurrent_stream_processor = ConcurrentReadProcessor(
106
110
  streams,
107
- PartitionEnqueuer(queue, self._threadpool),
111
+ PartitionEnqueuer(self._queue, self._threadpool),
108
112
  self._threadpool,
109
113
  self._logger,
110
114
  self._slice_logger,
111
115
  self._message_repository,
112
- PartitionReader(queue),
116
+ PartitionReader(
117
+ self._queue,
118
+ PartitionLogger(self._slice_logger, self._logger, self._message_repository),
119
+ ),
113
120
  )
114
121
 
115
122
  # Enqueue initial partition generation tasks
@@ -117,7 +124,7 @@ class ConcurrentSource:
117
124
 
118
125
  # Read from the queue until all partitions were generated and read
119
126
  yield from self._consume_from_queue(
120
- queue,
127
+ self._queue,
121
128
  concurrent_stream_processor,
122
129
  )
123
130
  self._threadpool.check_for_errors_and_shutdown()
@@ -141,7 +148,10 @@ class ConcurrentSource:
141
148
  airbyte_message_or_record_or_exception,
142
149
  concurrent_stream_processor,
143
150
  )
144
- if concurrent_stream_processor.is_done() and queue.empty():
151
+ # In the event that a partition raises an exception, anything remaining in
152
+ # the queue will be missed because is_done() can raise an exception and exit
153
+ # out of this loop before remaining items are consumed
154
+ if queue.empty() and concurrent_stream_processor.is_done():
145
155
  # all partitions were generated and processed. we're done here
146
156
  break
147
157
 
@@ -161,5 +171,7 @@ class ConcurrentSource:
161
171
  yield from concurrent_stream_processor.on_partition_complete_sentinel(queue_item)
162
172
  elif isinstance(queue_item, Record):
163
173
  yield from concurrent_stream_processor.on_record(queue_item)
174
+ elif isinstance(queue_item, AirbyteMessage):
175
+ yield queue_item
164
176
  else:
165
177
  raise ValueError(f"Unknown queue item type: {type(queue_item)}")
@@ -3,7 +3,22 @@
3
3
  #
4
4
 
5
5
  import logging
6
- from typing import Any, Generic, Iterator, List, Mapping, MutableMapping, Optional, Tuple, Union
6
+ from dataclasses import dataclass, field
7
+ from queue import Queue
8
+ from typing import (
9
+ Any,
10
+ ClassVar,
11
+ Generic,
12
+ Iterator,
13
+ List,
14
+ Mapping,
15
+ MutableMapping,
16
+ Optional,
17
+ Tuple,
18
+ Union,
19
+ )
20
+
21
+ from airbyte_protocol_dataclasses.models import Level
7
22
 
8
23
  from airbyte_cdk.models import (
9
24
  AirbyteCatalog,
@@ -43,6 +58,8 @@ from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_genera
43
58
  StreamSlicerPartitionGenerator,
44
59
  )
45
60
  from airbyte_cdk.sources.declarative.types import ConnectionDefinition
61
+ from airbyte_cdk.sources.message.concurrent_repository import ConcurrentMessageRepository
62
+ from airbyte_cdk.sources.message.repository import InMemoryMessageRepository, MessageRepository
46
63
  from airbyte_cdk.sources.source import TState
47
64
  from airbyte_cdk.sources.streams import Stream
48
65
  from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream
@@ -50,6 +67,22 @@ from airbyte_cdk.sources.streams.concurrent.abstract_stream_facade import Abstra
50
67
  from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, FinalStateCursor
51
68
  from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream
52
69
  from airbyte_cdk.sources.streams.concurrent.helpers import get_primary_key_from_stream
70
+ from airbyte_cdk.sources.streams.concurrent.partitions.types import QueueItem
71
+
72
+
73
@dataclass
class TestLimits:
    """Limits applied when a source is built for a bounded (test) read.

    Each field falls back to the matching ``DEFAULT_*`` class constant when it
    is not supplied.
    """

    # Tell Pytest this is not a Pytest class, despite its name
    __test__: ClassVar[bool] = False

    # Class-level defaults; ClassVar keeps them out of the generated __init__.
    DEFAULT_MAX_PAGES_PER_SLICE: ClassVar[int] = 5
    DEFAULT_MAX_SLICES: ClassVar[int] = 5
    DEFAULT_MAX_RECORDS: ClassVar[int] = 100
    DEFAULT_MAX_STREAMS: ClassVar[int] = 100

    # Instance fields, in the positional order expected by the constructor.
    max_records: int = DEFAULT_MAX_RECORDS
    max_pages_per_slice: int = DEFAULT_MAX_PAGES_PER_SLICE
    max_slices: int = DEFAULT_MAX_SLICES
    max_streams: int = DEFAULT_MAX_STREAMS
53
86
 
54
87
 
55
88
  class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
@@ -65,7 +98,9 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
65
98
  source_config: ConnectionDefinition,
66
99
  debug: bool = False,
67
100
  emit_connector_builder_messages: bool = False,
68
- component_factory: Optional[ModelToComponentFactory] = None,
101
+ migrate_manifest: bool = False,
102
+ normalize_manifest: bool = False,
103
+ limits: Optional[TestLimits] = None,
69
104
  config_path: Optional[str] = None,
70
105
  **kwargs: Any,
71
106
  ) -> None:
@@ -73,21 +108,39 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
73
108
  # no longer needs to store the original incoming state. But maybe there's an edge case?
74
109
  self._connector_state_manager = ConnectorStateManager(state=state) # type: ignore # state is always in the form of List[AirbyteStateMessage]. The ConnectorStateManager should use generics, but this can be done later
75
110
 
111
+ # We set a maxsize to force the main thread to process record items when the queue size grows. This assumes that there are fewer
112
+ # threads generating partitions than the max number of workers. If that weren't the case, we could have threads only generating
113
+ # partitions which would fill the queue. This number is arbitrarily set to 10_000 but will probably need to be changed given more
114
+ # information and might even need to be configurable depending on the source
115
+ queue: Queue[QueueItem] = Queue(maxsize=10_000)
116
+ message_repository = InMemoryMessageRepository(
117
+ Level.DEBUG if emit_connector_builder_messages else Level.INFO
118
+ )
119
+
76
120
  # To reduce the complexity of the concurrent framework, we are not enabling RFR with synthetic
77
121
  # cursors. We do this by no longer automatically instantiating RFR cursors when converting
78
122
  # the declarative models into runtime components. Concurrent sources will continue to checkpoint
79
123
  # incremental streams running in full refresh.
80
- component_factory = component_factory or ModelToComponentFactory(
124
+ component_factory = ModelToComponentFactory(
81
125
  emit_connector_builder_messages=emit_connector_builder_messages,
126
+ message_repository=ConcurrentMessageRepository(queue, message_repository),
82
127
  connector_state_manager=self._connector_state_manager,
83
128
  max_concurrent_async_job_count=source_config.get("max_concurrent_async_job_count"),
129
+ limit_pages_fetched_per_slice=limits.max_pages_per_slice if limits else None,
130
+ limit_slices_fetched=limits.max_slices if limits else None,
131
+ disable_retries=True if limits else False,
132
+ disable_cache=True if limits else False,
84
133
  )
85
134
 
135
+ self._limits = limits
136
+
86
137
  super().__init__(
87
138
  source_config=source_config,
88
139
  config=config,
89
140
  debug=debug,
90
141
  emit_connector_builder_messages=emit_connector_builder_messages,
142
+ migrate_manifest=migrate_manifest,
143
+ normalize_manifest=normalize_manifest,
91
144
  component_factory=component_factory,
92
145
  config_path=config_path,
93
146
  )
@@ -117,6 +170,7 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
117
170
  initial_number_of_partitions_to_generate=initial_number_of_partitions_to_generate,
118
171
  logger=self.logger,
119
172
  slice_logger=self._slice_logger,
173
+ queue=queue,
120
174
  message_repository=self.message_repository,
121
175
  )
122
176
 
@@ -280,8 +334,14 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
280
334
  schema_loader=declarative_stream._schema_loader, # type: ignore # We are accessing the private property but the public one is optional and we will remove this code soonish
281
335
  retriever=retriever,
282
336
  message_repository=self.message_repository,
337
+ max_records_limit=self._limits.max_records
338
+ if self._limits
339
+ else None,
283
340
  ),
284
341
  stream_slicer=declarative_stream.retriever.stream_slicer,
342
+ slice_limit=self._limits.max_slices
343
+ if self._limits
344
+ else None, # technically not needed because create_declarative_stream() -> create_simple_retriever() will apply the decorator. But for consistency and depending how we build create_default_stream, this may be needed later
285
345
  )
286
346
  else:
287
347
  if (
@@ -311,8 +371,12 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
311
371
  schema_loader=declarative_stream._schema_loader, # type: ignore # We are accessing the private property but the public one is optional and we will remove this code soonish
312
372
  retriever=retriever,
313
373
  message_repository=self.message_repository,
374
+ max_records_limit=self._limits.max_records
375
+ if self._limits
376
+ else None,
314
377
  ),
315
378
  stream_slicer=cursor,
379
+ slice_limit=self._limits.max_slices if self._limits else None,
316
380
  )
317
381
 
318
382
  concurrent_streams.append(
@@ -341,8 +405,12 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
341
405
  schema_loader=declarative_stream._schema_loader, # type: ignore # We are accessing the private property but the public one is optional and we will remove this code soonish
342
406
  retriever=declarative_stream.retriever,
343
407
  message_repository=self.message_repository,
408
+ max_records_limit=self._limits.max_records if self._limits else None,
344
409
  ),
345
410
  declarative_stream.retriever.stream_slicer,
411
+ slice_limit=self._limits.max_slices
412
+ if self._limits
413
+ else None, # technically not needed because create_declarative_stream() -> create_simple_retriever() will apply the decorator. But for consistency and depending how we build create_default_stream, this may be needed later
346
414
  )
347
415
 
348
416
  final_state_cursor = FinalStateCursor(
@@ -401,8 +469,10 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
401
469
  schema_loader=declarative_stream._schema_loader, # type: ignore # We are accessing the private property but the public one is optional and we will remove this code soonish
402
470
  retriever=retriever,
403
471
  message_repository=self.message_repository,
472
+ max_records_limit=self._limits.max_records if self._limits else None,
404
473
  ),
405
474
  perpartition_cursor,
475
+ slice_limit=self._limits.max_slices if self._limits else None,
406
476
  )
407
477
 
408
478
  concurrent_streams.append(
@@ -631,6 +631,10 @@ SCHEMA_TRANSFORMER_TYPE_MAPPING = {
631
631
  SchemaNormalizationModel.Default: TransformConfig.DefaultSchemaNormalization,
632
632
  }
633
633
 
634
+ # Ideally this should use the value defined in ConcurrentDeclarativeSource, but
635
+ # this would be a circular import
636
+ MAX_SLICES = 5
637
+
634
638
 
635
639
  class ModelToComponentFactory:
636
640
  EPOCH_DATETIME_FORMAT = "%s"
@@ -1,9 +1,12 @@
1
- # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
1
+ # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
2
2
 
3
- from typing import Any, Iterable, Mapping, Optional
3
+ from typing import Any, Iterable, Mapping, Optional, cast
4
4
 
5
5
  from airbyte_cdk.sources.declarative.retrievers import Retriever
6
6
  from airbyte_cdk.sources.declarative.schema import SchemaLoader
7
+ from airbyte_cdk.sources.declarative.stream_slicers.stream_slicer_test_read_decorator import (
8
+ StreamSlicerTestReadDecorator,
9
+ )
7
10
  from airbyte_cdk.sources.message import MessageRepository
8
11
  from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
9
12
  from airbyte_cdk.sources.streams.concurrent.partitions.partition_generator import PartitionGenerator
@@ -11,6 +14,11 @@ from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import Stre
11
14
  from airbyte_cdk.sources.types import Record, StreamSlice
12
15
  from airbyte_cdk.utils.slice_hasher import SliceHasher
13
16
 
17
+ # For Connector Builder test read operations, we track the total number of records
18
+ # read for the stream at the global level so that we can stop reading early if we
19
+ # exceed the record limit
20
+ total_record_counter = 0
21
+
14
22
 
15
23
  class SchemaLoaderCachingDecorator(SchemaLoader):
16
24
  def __init__(self, schema_loader: SchemaLoader):
@@ -31,6 +39,7 @@ class DeclarativePartitionFactory:
31
39
  schema_loader: SchemaLoader,
32
40
  retriever: Retriever,
33
41
  message_repository: MessageRepository,
42
+ max_records_limit: Optional[int] = None,
34
43
  ) -> None:
35
44
  """
36
45
  The DeclarativePartitionFactory takes a retriever_factory and not a retriever directly. The reason is that our components are not
@@ -41,6 +50,7 @@ class DeclarativePartitionFactory:
41
50
  self._schema_loader = SchemaLoaderCachingDecorator(schema_loader)
42
51
  self._retriever = retriever
43
52
  self._message_repository = message_repository
53
+ self._max_records_limit = max_records_limit
44
54
 
45
55
  def create(self, stream_slice: StreamSlice) -> Partition:
46
56
  return DeclarativePartition(
@@ -48,6 +58,7 @@ class DeclarativePartitionFactory:
48
58
  schema_loader=self._schema_loader,
49
59
  retriever=self._retriever,
50
60
  message_repository=self._message_repository,
61
+ max_records_limit=self._max_records_limit,
51
62
  stream_slice=stream_slice,
52
63
  )
53
64
 
@@ -59,19 +70,29 @@ class DeclarativePartition(Partition):
59
70
  schema_loader: SchemaLoader,
60
71
  retriever: Retriever,
61
72
  message_repository: MessageRepository,
73
+ max_records_limit: Optional[int],
62
74
  stream_slice: StreamSlice,
63
75
  ):
64
76
  self._stream_name = stream_name
65
77
  self._schema_loader = schema_loader
66
78
  self._retriever = retriever
67
79
  self._message_repository = message_repository
80
+ self._max_records_limit = max_records_limit
68
81
  self._stream_slice = stream_slice
69
82
  self._hash = SliceHasher.hash(self._stream_name, self._stream_slice)
70
83
 
71
84
  def read(self) -> Iterable[Record]:
85
+ if self._max_records_limit is not None:
86
+ global total_record_counter
87
+ if total_record_counter >= self._max_records_limit:
88
+ return
72
89
  for stream_data in self._retriever.read_records(
73
90
  self._schema_loader.get_json_schema(), self._stream_slice
74
91
  ):
92
+ if self._max_records_limit is not None:
93
+ if total_record_counter >= self._max_records_limit:
94
+ break
95
+
75
96
  if isinstance(stream_data, Mapping):
76
97
  record = (
77
98
  stream_data
@@ -86,6 +107,9 @@ class DeclarativePartition(Partition):
86
107
  else:
87
108
  self._message_repository.emit_message(stream_data)
88
109
 
110
+ if self._max_records_limit is not None:
111
+ total_record_counter += 1
112
+
89
113
  def to_slice(self) -> Optional[Mapping[str, Any]]:
90
114
  return self._stream_slice
91
115
 
@@ -98,10 +122,24 @@ class DeclarativePartition(Partition):
98
122
 
99
123
  class StreamSlicerPartitionGenerator(PartitionGenerator):
100
124
  def __init__(
101
- self, partition_factory: DeclarativePartitionFactory, stream_slicer: StreamSlicer
125
+ self,
126
+ partition_factory: DeclarativePartitionFactory,
127
+ stream_slicer: StreamSlicer,
128
+ slice_limit: Optional[int] = None,
129
+ max_records_limit: Optional[int] = None,
102
130
  ) -> None:
103
131
  self._partition_factory = partition_factory
104
- self._stream_slicer = stream_slicer
132
+
133
+ if slice_limit:
134
+ self._stream_slicer = cast(
135
+ StreamSlicer,
136
+ StreamSlicerTestReadDecorator(
137
+ wrapped_slicer=stream_slicer,
138
+ maximum_number_of_slices=slice_limit,
139
+ ),
140
+ )
141
+ else:
142
+ self._stream_slicer = stream_slicer
105
143
 
106
144
  def generate(self) -> Iterable[Partition]:
107
145
  for stream_slice in self._stream_slicer.stream_slices():
@@ -4,10 +4,10 @@
4
4
 
5
5
  from dataclasses import dataclass
6
6
  from itertools import islice
7
- from typing import Any, Iterable, Mapping, Optional, Union
7
+ from typing import Any, Iterable
8
8
 
9
9
  from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import StreamSlicer
10
- from airbyte_cdk.sources.types import StreamSlice, StreamState
10
+ from airbyte_cdk.sources.types import StreamSlice
11
11
 
12
12
 
13
13
  @dataclass