airbyte-cdk 6.61.6__py3-none-any.whl → 6.62.0.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. airbyte_cdk/connector_builder/connector_builder_handler.py +7 -7
  2. airbyte_cdk/connector_builder/main.py +2 -2
  3. airbyte_cdk/connector_builder/test_reader/reader.py +2 -2
  4. airbyte_cdk/legacy/sources/declarative/incremental/per_partition_cursor.py +4 -2
  5. airbyte_cdk/manifest_server/Dockerfile +2 -2
  6. airbyte_cdk/manifest_server/README.md +0 -22
  7. airbyte_cdk/manifest_server/app.py +0 -6
  8. airbyte_cdk/manifest_server/cli/_common.py +0 -1
  9. airbyte_cdk/manifest_server/command_processor/processor.py +5 -2
  10. airbyte_cdk/manifest_server/command_processor/utils.py +1 -1
  11. airbyte_cdk/manifest_server/routers/manifest.py +1 -1
  12. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +6 -7
  13. airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +57 -7
  14. airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +4 -2
  15. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +208 -278
  16. airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +0 -6
  17. airbyte_cdk/sources/declarative/partition_routers/grouping_partition_router.py +0 -5
  18. airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +0 -6
  19. airbyte_cdk/sources/declarative/partition_routers/partition_router.py +1 -23
  20. airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +0 -6
  21. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +88 -107
  22. airbyte_cdk/sources/declarative/requesters/request_options/per_partition_request_option_provider.py +95 -0
  23. airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +4 -1
  24. airbyte_cdk/sources/declarative/retrievers/retriever.py +5 -0
  25. airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +7 -21
  26. airbyte_cdk/sources/declarative/yaml_declarative_source.py +1 -1
  27. airbyte_cdk/sources/message/repository.py +20 -0
  28. airbyte_cdk/sources/utils/schema_helpers.py +9 -29
  29. airbyte_cdk/sources/utils/transform.py +13 -25
  30. airbyte_cdk/utils/spec_schema_transformations.py +5 -7
  31. {airbyte_cdk-6.61.6.dist-info → airbyte_cdk-6.62.0.dev1.dist-info}/METADATA +2 -4
  32. {airbyte_cdk-6.61.6.dist-info → airbyte_cdk-6.62.0.dev1.dist-info}/RECORD +36 -35
  33. {airbyte_cdk-6.61.6.dist-info → airbyte_cdk-6.62.0.dev1.dist-info}/LICENSE.txt +0 -0
  34. {airbyte_cdk-6.61.6.dist-info → airbyte_cdk-6.62.0.dev1.dist-info}/LICENSE_SHORT +0 -0
  35. {airbyte_cdk-6.61.6.dist-info → airbyte_cdk-6.62.0.dev1.dist-info}/WHEEL +0 -0
  36. {airbyte_cdk-6.61.6.dist-info → airbyte_cdk-6.62.0.dev1.dist-info}/entry_points.txt +0 -0
@@ -62,10 +62,10 @@ def should_normalize_manifest(config: Mapping[str, Any]) -> bool:
62
62
 
63
63
  def create_source(
64
64
  config: Mapping[str, Any],
65
- limits: TestLimits | None = None,
66
- catalog: ConfiguredAirbyteCatalog | None = None,
67
- state: List[AirbyteStateMessage] | None = None,
68
- ) -> ConcurrentDeclarativeSource:
65
+ limits: TestLimits,
66
+ catalog: Optional[ConfiguredAirbyteCatalog],
67
+ state: Optional[List[AirbyteStateMessage]],
68
+ ) -> ConcurrentDeclarativeSource[Optional[List[AirbyteStateMessage]]]:
69
69
  manifest = config["__injected_declarative_manifest"]
70
70
 
71
71
  # We enforce a concurrency level of 1 so that the stream is processed on a single thread
@@ -88,7 +88,7 @@ def create_source(
88
88
 
89
89
 
90
90
  def read_stream(
91
- source: ConcurrentDeclarativeSource,
91
+ source: ConcurrentDeclarativeSource[Optional[List[AirbyteStateMessage]]],
92
92
  config: Mapping[str, Any],
93
93
  configured_catalog: ConfiguredAirbyteCatalog,
94
94
  state: List[AirbyteStateMessage],
@@ -127,7 +127,7 @@ def read_stream(
127
127
 
128
128
 
129
129
  def resolve_manifest(
130
- source: ConcurrentDeclarativeSource,
130
+ source: ConcurrentDeclarativeSource[Optional[List[AirbyteStateMessage]]],
131
131
  ) -> AirbyteMessage:
132
132
  try:
133
133
  return AirbyteMessage(
@@ -146,7 +146,7 @@ def resolve_manifest(
146
146
 
147
147
 
148
148
  def full_resolve_manifest(
149
- source: ConcurrentDeclarativeSource, limits: TestLimits
149
+ source: ConcurrentDeclarativeSource[Optional[List[AirbyteStateMessage]]], limits: TestLimits
150
150
  ) -> AirbyteMessage:
151
151
  try:
152
152
  manifest = {**source.resolved_manifest}
@@ -34,7 +34,7 @@ from airbyte_cdk.utils.traced_exception import AirbyteTracedException
34
34
 
35
35
  def get_config_and_catalog_from_args(
36
36
  args: List[str],
37
- ) -> Tuple[str, Mapping[str, Any], Optional[ConfiguredAirbyteCatalog], List[AirbyteStateMessage]]:
37
+ ) -> Tuple[str, Mapping[str, Any], Optional[ConfiguredAirbyteCatalog], Any]:
38
38
  # TODO: Add functionality for the `debug` logger.
39
39
  # Currently, no one `debug` level log will be displayed during `read` a stream for a connector created through `connector-builder`.
40
40
  parsed_args = AirbyteEntrypoint.parse_args(args)
@@ -70,7 +70,7 @@ def get_config_and_catalog_from_args(
70
70
 
71
71
 
72
72
  def handle_connector_builder_request(
73
- source: ConcurrentDeclarativeSource,
73
+ source: ConcurrentDeclarativeSource[Optional[List[AirbyteStateMessage]]],
74
74
  command: str,
75
75
  config: Mapping[str, Any],
76
76
  catalog: Optional[ConfiguredAirbyteCatalog],
@@ -85,7 +85,7 @@ class TestReader:
85
85
 
86
86
  def run_test_read(
87
87
  self,
88
- source: ConcurrentDeclarativeSource,
88
+ source: ConcurrentDeclarativeSource[Optional[List[AirbyteStateMessage]]],
89
89
  config: Mapping[str, Any],
90
90
  configured_catalog: ConfiguredAirbyteCatalog,
91
91
  stream_name: str,
@@ -383,7 +383,7 @@ class TestReader:
383
383
 
384
384
  def _read_stream(
385
385
  self,
386
- source: ConcurrentDeclarativeSource,
386
+ source: ConcurrentDeclarativeSource[Optional[List[AirbyteStateMessage]]],
387
387
  config: Mapping[str, Any],
388
388
  configured_catalog: ConfiguredAirbyteCatalog,
389
389
  state: List[AirbyteStateMessage],
@@ -146,8 +146,10 @@ class PerPartitionCursor(DeclarativeCursor):
146
146
  if "state" in stream_state:
147
147
  self._state_to_migrate_from = stream_state["state"]
148
148
 
149
- # Set parent state for partition routers based on parent streams
150
- self._partition_router.set_initial_state(stream_state)
149
+ # We used to set the parent state through this method but since moving the SubstreamPartitionRouter to the
150
+ # Concurrent CDK/AbstractStream, the state is passed at the __init__ stage and this does not need to be called.
151
+ # We are still keeping this line as a comment to be explicit about the past behavior.
152
+ # self._partition_router.set_initial_state(stream_state)
151
153
 
152
154
  def observe(self, stream_slice: StreamSlice, record: Record) -> None:
153
155
  self._cursor_per_partition[self._to_partition_key(stream_slice.partition)].observe(
@@ -11,7 +11,7 @@ FROM python:3.12-slim-bookworm
11
11
  RUN apt-get update && \
12
12
  apt-get install -y git && \
13
13
  rm -rf /var/lib/apt/lists/* && \
14
- pip install poetry==2.0.1
14
+ pip install poetry==1.8.3
15
15
 
16
16
  # Configure poetry to not create virtual environments and disable interactive mode
17
17
  ENV POETRY_NO_INTERACTION=1 \
@@ -42,4 +42,4 @@ USER airbyte:airbyte
42
42
 
43
43
  EXPOSE 8080
44
44
 
45
- CMD ["uvicorn", "airbyte_cdk.manifest_server.app:app", "--host", "0.0.0.0", "--port", "8080"]
45
+ CMD ["uvicorn", "airbyte_cdk.manifest_server.app:app", "--host", "0.0.0.0", "--port", "8080"]
@@ -154,25 +154,3 @@ docker run -p 8080:8080 manifest-server
154
154
  ```
155
155
 
156
156
  Note: The container runs on port 8080 by default.
157
-
158
- ## Datadog APM
159
-
160
- The manifest server supports Datadog APM tracing for monitoring and observability:
161
-
162
- ### Configuration
163
-
164
- To enable Datadog tracing, set the environment variable:
165
-
166
- ```bash
167
- export DD_ENABLED=true
168
- ```
169
-
170
- This requires the `ddtrace` dependency, which is included in the `manifest-server` extra. For additional configuration options via environment variables, see [ddtrace configuration](https://ddtrace.readthedocs.io/en/stable/configuration.html).
171
-
172
- ### Usage
173
-
174
- ```bash
175
- # Run with Datadog tracing enabled
176
- DD_ENABLED=true manifest-server start
177
- ```
178
-
@@ -1,9 +1,3 @@
1
- import os
2
-
3
- if os.getenv("DD_ENABLED", "false").lower() == "true":
4
- # Auto-instrumentation should be imported as early as possible.
5
- import ddtrace.auto # noqa: F401
6
-
7
1
  from fastapi import FastAPI
8
2
 
9
3
  from .routers import capabilities, health, manifest
@@ -7,7 +7,6 @@ import rich_click as click
7
7
 
8
8
  # Import server dependencies with graceful fallback
9
9
  try:
10
- import ddtrace # noqa: F401
11
10
  import fastapi # noqa: F401
12
11
  import uvicorn # noqa: F401
13
12
 
@@ -21,10 +21,12 @@ from airbyte_cdk.test.entrypoint_wrapper import AirbyteEntrypointException, Entr
21
21
 
22
22
 
23
23
  class ManifestCommandProcessor:
24
- _source: ConcurrentDeclarativeSource
24
+ _source: ConcurrentDeclarativeSource[Optional[List[AirbyteStateMessage]]]
25
25
  _logger = logging.getLogger("airbyte.manifest-server")
26
26
 
27
- def __init__(self, source: ConcurrentDeclarativeSource) -> None:
27
+ def __init__(
28
+ self, source: ConcurrentDeclarativeSource[Optional[List[AirbyteStateMessage]]]
29
+ ) -> None:
28
30
  self._source = source
29
31
 
30
32
  def test_read(
@@ -39,6 +41,7 @@ class ManifestCommandProcessor:
39
41
  """
40
42
  Test the read method of the source.
41
43
  """
44
+
42
45
  test_read_handler = TestReader(
43
46
  max_pages_per_slice=page_limit,
44
47
  max_slices=slice_limit,
@@ -63,7 +63,7 @@ def build_source(
63
63
  record_limit: Optional[int] = None,
64
64
  page_limit: Optional[int] = None,
65
65
  slice_limit: Optional[int] = None,
66
- ) -> ConcurrentDeclarativeSource:
66
+ ) -> ConcurrentDeclarativeSource[Optional[List[AirbyteStateMessage]]]:
67
67
  # We enforce a concurrency level of 1 so that the stream is processed on a single thread
68
68
  # to retain ordering for the grouping of the builder message responses.
69
69
  definition = copy.deepcopy(manifest)
@@ -40,7 +40,7 @@ def safe_build_source(
40
40
  page_limit: Optional[int] = None,
41
41
  slice_limit: Optional[int] = None,
42
42
  record_limit: Optional[int] = None,
43
- ) -> ConcurrentDeclarativeSource:
43
+ ) -> ConcurrentDeclarativeSource[Optional[List[AirbyteStateMessage]]]:
44
44
  """Wrapper around build_source that converts ValidationError to HTTPException."""
45
45
  try:
46
46
  return build_source(
@@ -162,17 +162,16 @@ def _get_declarative_component_schema() -> Dict[str, Any]:
162
162
  # is no longer inherited from since the only external dependency is from that class.
163
163
  #
164
164
  # todo: It is worth investigating removal of the Generic[TState] since it will always be Optional[List[AirbyteStateMessage]]
165
- class ConcurrentDeclarativeSource(AbstractSource):
165
+ class ConcurrentDeclarativeSource(AbstractSource, Generic[TState]):
166
166
  # By default, we defer to a value of 2. A value lower than this could cause a PartitionEnqueuer to be stuck in a state of deadlock
167
167
  # because it has hit the limit of futures but no partition reader is consuming them.
168
168
  _LOWEST_SAFE_CONCURRENCY_LEVEL = 2
169
169
 
170
170
  def __init__(
171
171
  self,
172
- catalog: Optional[ConfiguredAirbyteCatalog] = None,
173
- config: Optional[Mapping[str, Any]] = None,
174
- state: Optional[List[AirbyteStateMessage]] = None,
175
- *,
172
+ catalog: Optional[ConfiguredAirbyteCatalog],
173
+ config: Optional[Mapping[str, Any]],
174
+ state: TState,
176
175
  source_config: ConnectionDefinition,
177
176
  debug: bool = False,
178
177
  emit_connector_builder_messages: bool = False,
@@ -704,7 +703,7 @@ class ConcurrentDeclarativeSource(AbstractSource):
704
703
  stream_slicer=declarative_stream.retriever.stream_slicer,
705
704
  slice_limit=self._limits.max_slices
706
705
  if self._limits
707
- else None, # technically not needed because create_declarative_stream() -> create_simple_retriever() will apply the decorator. But for consistency and depending how we build create_default_stream, this may be needed later
706
+ else None, # technically not needed because create_default_stream() -> create_simple_retriever() will apply the decorator. But for consistency and depending how we build create_default_stream, this may be needed later
708
707
  )
709
708
  else:
710
709
  if (
@@ -773,7 +772,7 @@ class ConcurrentDeclarativeSource(AbstractSource):
773
772
  declarative_stream.retriever.stream_slicer,
774
773
  slice_limit=self._limits.max_slices
775
774
  if self._limits
776
- else None, # technically not needed because create_declarative_stream() -> create_simple_retriever() will apply the decorator. But for consistency and depending how we build create_default_stream, this may be needed later
775
+ else None, # technically not needed because create_default_stream() -> create_simple_retriever() will apply the decorator. But for consistency and depending how we build create_default_stream, this may be needed later
777
776
  )
778
777
 
779
778
  final_state_cursor = FinalStateCursor(
@@ -11,6 +11,13 @@ from copy import deepcopy
11
11
  from datetime import timedelta
12
12
  from typing import Any, Callable, Iterable, List, Mapping, MutableMapping, Optional
13
13
 
14
+ from airbyte_cdk.models import (
15
+ AirbyteStateBlob,
16
+ AirbyteStateMessage,
17
+ AirbyteStateType,
18
+ AirbyteStreamState,
19
+ StreamDescriptor,
20
+ )
14
21
  from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
15
22
  from airbyte_cdk.sources.declarative.incremental.global_substream_cursor import (
16
23
  Timer,
@@ -48,7 +55,7 @@ class ConcurrentPerPartitionCursor(Cursor):
48
55
  Manages state per partition when a stream has many partitions, preventing data loss or duplication.
49
56
 
50
57
  Attributes:
51
- DEFAULT_MAX_PARTITIONS_NUMBER (int): Maximum number of partitions to retain in memory (default is 10,000).
58
+ DEFAULT_MAX_PARTITIONS_NUMBER (int): Maximum number of partitions to retain in memory (default is 10,000). This limit needs to be higher than the number of threads we might enqueue (which is represented by ThreadPoolManager.DEFAULT_MAX_QUEUE_SIZE). If not, we could have partitions that have been generated and submitted to the ThreadPool but got deleted from the ConcurrentPerPartitionCursor and when closing them, it will generate a KeyError.
52
59
 
53
60
  - **Partition Limitation Logic**
54
61
  Ensures the number of tracked partitions does not exceed the specified limit to prevent memory overuse. Oldest partitions are removed when the limit is reached.
@@ -128,6 +135,7 @@ class ConcurrentPerPartitionCursor(Cursor):
128
135
 
129
136
  # FIXME this is a temporary field the time of the migration from declarative cursors to concurrent ones
130
137
  self._attempt_to_create_cursor_if_not_provided = attempt_to_create_cursor_if_not_provided
138
+ self._synced_some_data = False
131
139
 
132
140
  @property
133
141
  def cursor_field(self) -> CursorField:
@@ -168,8 +176,8 @@ class ConcurrentPerPartitionCursor(Cursor):
168
176
  with self._lock:
169
177
  self._semaphore_per_partition[partition_key].acquire()
170
178
  if not self._use_global_cursor:
171
- self._cursor_per_partition[partition_key].close_partition(partition=partition)
172
179
  cursor = self._cursor_per_partition[partition_key]
180
+ cursor.close_partition(partition=partition)
173
181
  if (
174
182
  partition_key in self._partitions_done_generating_stream_slices
175
183
  and self._semaphore_per_partition[partition_key]._value == 0
@@ -213,8 +221,10 @@ class ConcurrentPerPartitionCursor(Cursor):
213
221
  if not any(
214
222
  semaphore_item[1]._value for semaphore_item in self._semaphore_per_partition.items()
215
223
  ):
216
- self._global_cursor = self._new_global_cursor
217
- self._lookback_window = self._timer.finish()
224
+ if self._synced_some_data:
225
+ # we only update those if we actually synced some data
226
+ self._global_cursor = self._new_global_cursor
227
+ self._lookback_window = self._timer.finish()
218
228
  self._parent_state = self._partition_router.get_stream_state()
219
229
  self._emit_state_message(throttle=False)
220
230
 
@@ -422,9 +432,6 @@ class ConcurrentPerPartitionCursor(Cursor):
422
432
  if stream_state.get("parent_state"):
423
433
  self._parent_state = stream_state["parent_state"]
424
434
 
425
- # Set parent state for partition routers based on parent streams
426
- self._partition_router.set_initial_state(stream_state)
427
-
428
435
  def _set_global_state(self, stream_state: Mapping[str, Any]) -> None:
429
436
  """
430
437
  Initializes the global cursor state from the provided stream state.
@@ -458,6 +465,7 @@ class ConcurrentPerPartitionCursor(Cursor):
458
465
  except ValueError:
459
466
  return
460
467
 
468
+ self._synced_some_data = True
461
469
  record_cursor = self._connector_state_converter.output_format(
462
470
  self._connector_state_converter.parse_value(record_cursor_value)
463
471
  )
@@ -541,3 +549,45 @@ class ConcurrentPerPartitionCursor(Cursor):
541
549
 
542
550
  def limit_reached(self) -> bool:
543
551
  return self._number_of_partitions > self.SWITCH_TO_GLOBAL_LIMIT
552
+
553
+ @staticmethod
554
+ def get_parent_state(
555
+ stream_state: Optional[StreamState], parent_stream_name: str
556
+ ) -> Optional[AirbyteStateMessage]:
557
+ if not stream_state:
558
+ return None
559
+
560
+ if "parent_state" not in stream_state:
561
+ logger.warning(
562
+ f"Trying to get_parent_state for stream `{parent_stream_name}` when there is no parent state in the state"
563
+ )
564
+ return None
565
+ elif parent_stream_name not in stream_state["parent_state"]:
566
+ logger.info(
567
+ f"Could not find parent state for stream `{parent_stream_name}`. The only parents available are {list(stream_state['parent_state'].keys())}"
568
+ )
569
+ return None
570
+
571
+ return AirbyteStateMessage(
572
+ type=AirbyteStateType.STREAM,
573
+ stream=AirbyteStreamState(
574
+ stream_descriptor=StreamDescriptor(parent_stream_name, None),
575
+ stream_state=AirbyteStateBlob(stream_state["parent_state"][parent_stream_name]),
576
+ ),
577
+ )
578
+
579
+ @staticmethod
580
+ def get_global_state(
581
+ stream_state: Optional[StreamState], parent_stream_name: str
582
+ ) -> Optional[AirbyteStateMessage]:
583
+ return (
584
+ AirbyteStateMessage(
585
+ type=AirbyteStateType.STREAM,
586
+ stream=AirbyteStreamState(
587
+ stream_descriptor=StreamDescriptor(parent_stream_name, None),
588
+ stream_state=AirbyteStateBlob(stream_state["state"]),
589
+ ),
590
+ )
591
+ if stream_state and "state" in stream_state
592
+ else None
593
+ )
@@ -192,8 +192,10 @@ class GlobalSubstreamCursor(DeclarativeCursor):
192
192
  # Example: {"global_state_format_key": "global_state_format_value"}
193
193
  self._stream_cursor.set_initial_state(stream_state)
194
194
 
195
- # Set parent state for partition routers based on parent streams
196
- self._partition_router.set_initial_state(stream_state)
195
+ # We used to set the parent state through this method but since moving the SubstreamPartitionRouter to the
196
+ # Concurrent CDK/AbstractStream, the state is passed at the __init__ stage and this does not need to be called.
197
+ # We are still keeping this line as a comment to be explicit about the past behavior.
198
+ # self._partition_router.set_initial_state(stream_state)
197
199
 
198
200
  def _inject_lookback_into_stream_cursor(self, lookback_window: int) -> None:
199
201
  """