airbyte-cdk 6.8.2.dev1__py3-none-any.whl → 6.8.3rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -20,9 +20,6 @@ from airbyte_cdk.sources.declarative.extractors.record_filter import (
20
20
  ClientSideIncrementalRecordFilterDecorator,
21
21
  )
22
22
  from airbyte_cdk.sources.declarative.incremental.datetime_based_cursor import DatetimeBasedCursor
23
- from airbyte_cdk.sources.declarative.incremental.per_partition_with_global import (
24
- PerPartitionWithGlobalCursor,
25
- )
26
23
  from airbyte_cdk.sources.declarative.interpolation import InterpolatedString
27
24
  from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource
28
25
  from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
@@ -309,59 +306,6 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
309
306
  cursor=final_state_cursor,
310
307
  )
311
308
  )
312
- elif (
313
- incremental_sync_component_definition
314
- and incremental_sync_component_definition.get("type", "")
315
- == DatetimeBasedCursorModel.__name__
316
- and self._stream_supports_concurrent_partition_processing(
317
- declarative_stream=declarative_stream
318
- )
319
- and hasattr(declarative_stream.retriever, "stream_slicer")
320
- and isinstance(declarative_stream.retriever.stream_slicer, PerPartitionWithGlobalCursor)
321
- ):
322
- stream_state = state_manager.get_stream_state(
323
- stream_name=declarative_stream.name, namespace=declarative_stream.namespace
324
- )
325
- partition_router = declarative_stream.retriever.stream_slicer._partition_router
326
-
327
- cursor = self._constructor.create_concurrent_cursor_from_perpartition_cursor(
328
- state_manager=state_manager,
329
- model_type=DatetimeBasedCursorModel,
330
- component_definition=incremental_sync_component_definition,
331
- stream_name=declarative_stream.name,
332
- stream_namespace=declarative_stream.namespace,
333
- config=config or {},
334
- stream_state=stream_state,
335
- partition_router=partition_router,
336
- )
337
-
338
-
339
- partition_generator = StreamSlicerPartitionGenerator(
340
- DeclarativePartitionFactory(
341
- declarative_stream.name,
342
- declarative_stream.get_json_schema(),
343
- self._retriever_factory(
344
- name_to_stream_mapping[declarative_stream.name],
345
- config,
346
- stream_state,
347
- ),
348
- self.message_repository,
349
- ),
350
- cursor,
351
- )
352
-
353
- concurrent_streams.append(
354
- DefaultStream(
355
- partition_generator=partition_generator,
356
- name=declarative_stream.name,
357
- json_schema=declarative_stream.get_json_schema(),
358
- availability_strategy=AlwaysAvailableAvailabilityStrategy(),
359
- primary_key=get_primary_key_from_stream(declarative_stream.primary_key),
360
- cursor_field=cursor.cursor_field.cursor_field_key,
361
- logger=self.logger,
362
- cursor=cursor,
363
- )
364
- )
365
309
  else:
366
310
  synchronous_streams.append(declarative_stream)
367
311
  else:
@@ -59,11 +59,13 @@ class ClientSideIncrementalRecordFilterDecorator(RecordFilter):
59
59
 
60
60
  def __init__(
61
61
  self,
62
- cursor: Union[DatetimeBasedCursor, PerPartitionWithGlobalCursor, GlobalSubstreamCursor],
62
+ date_time_based_cursor: DatetimeBasedCursor,
63
+ substream_cursor: Optional[Union[PerPartitionWithGlobalCursor, GlobalSubstreamCursor]],
63
64
  **kwargs: Any,
64
65
  ):
65
66
  super().__init__(**kwargs)
66
- self._cursor = cursor
67
+ self._date_time_based_cursor = date_time_based_cursor
68
+ self._substream_cursor = substream_cursor
67
69
 
68
70
  def filter_records(
69
71
  self,
@@ -75,7 +77,7 @@ class ClientSideIncrementalRecordFilterDecorator(RecordFilter):
75
77
  records = (
76
78
  record
77
79
  for record in records
78
- if self._cursor.should_be_synced(
80
+ if (self._substream_cursor or self._date_time_based_cursor).should_be_synced(
79
81
  # Record is created on the fly to align with cursors interface; stream name is ignored as we don't need it here
80
82
  # Record stream name is empty cause it is not used durig the filtering
81
83
  Record(data=record, associated_slice=stream_slice, stream_name="")
@@ -2,7 +2,6 @@
2
2
  # Copyright (c) 2022 Airbyte, Inc., all rights reserved.
3
3
  #
4
4
 
5
- from airbyte_cdk.sources.declarative.incremental.concurrent_partition_cursor import ConcurrentCursorFactory, ConcurrentPerPartitionCursor
6
5
  from airbyte_cdk.sources.declarative.incremental.datetime_based_cursor import DatetimeBasedCursor
7
6
  from airbyte_cdk.sources.declarative.incremental.declarative_cursor import DeclarativeCursor
8
7
  from airbyte_cdk.sources.declarative.incremental.global_substream_cursor import GlobalSubstreamCursor
@@ -15,8 +14,6 @@ from airbyte_cdk.sources.declarative.incremental.resumable_full_refresh_cursor i
15
14
 
16
15
  __all__ = [
17
16
  "CursorFactory",
18
- "ConcurrentCursorFactory"
19
- "ConcurrentPerPartitionCursor",
20
17
  "DatetimeBasedCursor",
21
18
  "DeclarativeCursor",
22
19
  "GlobalSubstreamCursor",
@@ -303,15 +303,6 @@ class PerPartitionCursor(DeclarativeCursor):
303
303
  raise ValueError("A partition needs to be provided in order to get request body json")
304
304
 
305
305
  def should_be_synced(self, record: Record) -> bool:
306
- if self._to_partition_key(record.associated_slice.partition) not in self._cursor_per_partition:
307
- partition_state = (
308
- self._state_to_migrate_from
309
- if self._state_to_migrate_from
310
- else self._NO_CURSOR_STATE
311
- )
312
- cursor = self._create_cursor(partition_state)
313
-
314
- self._cursor_per_partition[self._to_partition_key(record.associated_slice.partition)] = cursor
315
306
  return self._get_cursor(record).should_be_synced(
316
307
  self._convert_record_to_cursor_record(record)
317
308
  )
@@ -81,8 +81,6 @@ from airbyte_cdk.sources.declarative.extractors.record_selector import (
81
81
  )
82
82
  from airbyte_cdk.sources.declarative.incremental import (
83
83
  ChildPartitionResumableFullRefreshCursor,
84
- ConcurrentCursorFactory,
85
- ConcurrentPerPartitionCursor,
86
84
  CursorFactory,
87
85
  DatetimeBasedCursor,
88
86
  DeclarativeCursor,
@@ -907,62 +905,6 @@ class ModelToComponentFactory:
907
905
  cursor_granularity=cursor_granularity,
908
906
  )
909
907
 
910
- def create_concurrent_cursor_from_perpartition_cursor(
911
- self,
912
- state_manager: ConnectorStateManager,
913
- model_type: Type[BaseModel],
914
- component_definition: ComponentDefinition,
915
- stream_name: str,
916
- stream_namespace: Optional[str],
917
- config: Config,
918
- stream_state: MutableMapping[str, Any],
919
- partition_router,
920
- **kwargs: Any,
921
- ) -> ConcurrentPerPartitionCursor:
922
- component_type = component_definition.get("type")
923
- if component_definition.get("type") != model_type.__name__:
924
- raise ValueError(
925
- f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
926
- )
927
-
928
- datetime_based_cursor_model = model_type.parse_obj(component_definition)
929
-
930
- if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel):
931
- raise ValueError(
932
- f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}"
933
- )
934
-
935
- interpolated_cursor_field = InterpolatedString.create(
936
- datetime_based_cursor_model.cursor_field,
937
- parameters=datetime_based_cursor_model.parameters or {},
938
- )
939
- cursor_field = CursorField(interpolated_cursor_field.eval(config=config))
940
-
941
- # Create the cursor factory
942
- cursor_factory = ConcurrentCursorFactory(
943
- partial(
944
- self.create_concurrent_cursor_from_datetime_based_cursor,
945
- state_manager=state_manager,
946
- model_type=model_type,
947
- component_definition=component_definition,
948
- stream_name=stream_name,
949
- stream_namespace=stream_namespace,
950
- config=config,
951
- )
952
- )
953
-
954
- # Return the concurrent cursor and state converter
955
- return ConcurrentPerPartitionCursor(
956
- cursor_factory=cursor_factory,
957
- partition_router=partition_router,
958
- stream_name=stream_name,
959
- stream_namespace=stream_namespace,
960
- stream_state=stream_state,
961
- message_repository=self._message_repository, # type: ignore
962
- connector_state_manager=state_manager,
963
- cursor_field=cursor_field,
964
- )
965
-
966
908
  @staticmethod
967
909
  def create_constant_backoff_strategy(
968
910
  model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any
@@ -1245,14 +1187,17 @@ class ModelToComponentFactory:
1245
1187
  raise ValueError(
1246
1188
  "Unsupported Slicer is used. PerPartitionWithGlobalCursor should be used here instead"
1247
1189
  )
1248
- cursor = combined_slicers if isinstance(
1249
- combined_slicers, (PerPartitionWithGlobalCursor, GlobalSubstreamCursor)
1250
- ) else self._create_component_from_model(
1251
- model=model.incremental_sync, config=config
1252
- )
1253
-
1254
1190
  client_side_incremental_sync = {
1255
- "cursor": cursor
1191
+ "date_time_based_cursor": self._create_component_from_model(
1192
+ model=model.incremental_sync, config=config
1193
+ ),
1194
+ "substream_cursor": (
1195
+ combined_slicers
1196
+ if isinstance(
1197
+ combined_slicers, (PerPartitionWithGlobalCursor, GlobalSubstreamCursor)
1198
+ )
1199
+ else None
1200
+ ),
1256
1201
  }
1257
1202
 
1258
1203
  if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel):
@@ -1966,7 +1911,7 @@ class ModelToComponentFactory:
1966
1911
  if (
1967
1912
  not isinstance(stream_slicer, DatetimeBasedCursor)
1968
1913
  or type(stream_slicer) is not DatetimeBasedCursor
1969
- ) and not isinstance(stream_slicer, PerPartitionWithGlobalCursor):
1914
+ ):
1970
1915
  # Many of the custom component implementations of DatetimeBasedCursor override get_request_params() (or other methods).
1971
1916
  # Because we're decoupling RequestOptionsProvider from the Cursor, custom components will eventually need to reimplement
1972
1917
  # their own RequestOptionsProvider. However, right now the existing StreamSlicer/Cursor still can act as the SimpleRetriever's
@@ -178,7 +178,7 @@ class SimpleRetriever(Retriever):
178
178
  stream_slice,
179
179
  next_page_token,
180
180
  self._paginator.get_request_headers,
181
- self.request_option_provider.get_request_headers,
181
+ self.stream_slicer.get_request_headers,
182
182
  )
183
183
  if isinstance(headers, str):
184
184
  raise ValueError("Request headers cannot be a string")
@@ -38,6 +38,7 @@ class DeclarativePartitionFactory:
38
38
  stream_slice,
39
39
  )
40
40
 
41
+
41
42
  class DeclarativePartition(Partition):
42
43
  def __init__(
43
44
  self,
@@ -240,15 +240,6 @@ class ConcurrentCursor(Cursor):
240
240
  def _extract_cursor_value(self, record: Record) -> Any:
241
241
  return self._connector_state_converter.parse_value(self._cursor_field.extract_value(record))
242
242
 
243
- def close_partition_without_emit(self, partition: Partition) -> None:
244
- slice_count_before = len(self.state.get("slices", []))
245
- self._add_slice_to_state(partition)
246
- if slice_count_before < len(
247
- self.state["slices"]
248
- ): # only emit if at least one slice has been processed
249
- self._merge_partitions()
250
- self._has_closed_at_least_one_slice = True
251
-
252
243
  def close_partition(self, partition: Partition) -> None:
253
244
  slice_count_before = len(self.state.get("slices", []))
254
245
  self._add_slice_to_state(partition)
@@ -0,0 +1,55 @@
1
+ # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
2
+
3
+
4
+ import importlib.util
5
+ from pathlib import Path
6
+ from types import ModuleType
7
+ from typing import Optional
8
+
9
+ import pytest
10
+
11
+ # The following fixtures are used to load a manifest-only connector's components module and manifest file.
12
+ # They can be accessed from any test file in the connector's unit_tests directory by importing them as follows:
13
+
14
+ # from airbyte_cdk.test.utils.manifest_only_fixtures import components_module, connector_dir, manifest_path
15
+
16
+ # individual components can then be referenced as: components_module.<CustomComponentClass>
17
+
18
+
19
+ @pytest.fixture(scope="session")
20
+ def connector_dir(request: pytest.FixtureRequest) -> Path:
21
+ """Return the connector's root directory.
22
+
23
+ This assumes tests are being run from the unit_tests directory,
24
+ and that it is a direct child of the connector directory.
25
+ """
26
+ test_dir = Path(request.config.invocation_params.dir)
27
+ return test_dir.parent
28
+
29
+
30
+ @pytest.fixture(scope="session")
31
+ def components_module(connector_dir: Path) -> Optional[ModuleType]:
32
+ """Load and return the components module from the connector directory.
33
+
34
+ This assumes the components module is located at <connector_dir>/components.py.
35
+ """
36
+ components_path = connector_dir / "components.py"
37
+ if not components_path.exists():
38
+ return None
39
+
40
+ components_spec = importlib.util.spec_from_file_location("components", components_path)
41
+ if components_spec is None:
42
+ return None
43
+
44
+ components_module = importlib.util.module_from_spec(components_spec)
45
+ if components_spec.loader is None:
46
+ return None
47
+
48
+ components_spec.loader.exec_module(components_module)
49
+ return components_module
50
+
51
+
52
+ @pytest.fixture(scope="session")
53
+ def manifest_path(connector_dir: Path) -> Path:
54
+ """Return the path to the connector's manifest file."""
55
+ return connector_dir / "manifest.yaml"
@@ -0,0 +1,306 @@
1
+ Metadata-Version: 2.1
2
+ Name: airbyte-cdk
3
+ Version: 6.8.3rc1
4
+ Summary: A framework for writing Airbyte Connectors.
5
+ Home-page: https://airbyte.com
6
+ License: MIT
7
+ Keywords: airbyte,connector-development-kit,cdk
8
+ Author: Airbyte
9
+ Author-email: contact@airbyte.io
10
+ Requires-Python: >=3.10,<3.13
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Scientific/Engineering
19
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
+ Provides-Extra: file-based
21
+ Provides-Extra: sphinx-docs
22
+ Provides-Extra: sql
23
+ Provides-Extra: vector-db-based
24
+ Requires-Dist: Jinja2 (>=3.1.2,<3.2.0)
25
+ Requires-Dist: PyYAML (>=6.0.1,<7.0.0)
26
+ Requires-Dist: Sphinx (>=4.2,<4.3) ; extra == "sphinx-docs"
27
+ Requires-Dist: airbyte-protocol-models-dataclasses (>=0.14,<0.15)
28
+ Requires-Dist: avro (>=1.11.2,<1.12.0) ; extra == "file-based"
29
+ Requires-Dist: backoff
30
+ Requires-Dist: cachetools
31
+ Requires-Dist: cohere (==4.21) ; extra == "vector-db-based"
32
+ Requires-Dist: cryptography (>=42.0.5,<44.0.0)
33
+ Requires-Dist: dpath (>=2.1.6,<3.0.0)
34
+ Requires-Dist: dunamai (>=1.22.0,<2.0.0)
35
+ Requires-Dist: fastavro (>=1.8.0,<1.9.0) ; extra == "file-based"
36
+ Requires-Dist: genson (==1.3.0)
37
+ Requires-Dist: isodate (>=0.6.1,<0.7.0)
38
+ Requires-Dist: jsonref (>=0.2,<0.3)
39
+ Requires-Dist: jsonschema (>=4.17.3,<4.18.0)
40
+ Requires-Dist: langchain (==0.1.16) ; extra == "vector-db-based"
41
+ Requires-Dist: langchain_core (==0.1.42)
42
+ Requires-Dist: markdown ; extra == "file-based"
43
+ Requires-Dist: nltk (==3.9.1)
44
+ Requires-Dist: numpy (<2)
45
+ Requires-Dist: openai[embeddings] (==0.27.9) ; extra == "vector-db-based"
46
+ Requires-Dist: orjson (>=3.10.7,<4.0.0)
47
+ Requires-Dist: pandas (==2.2.2)
48
+ Requires-Dist: pdf2image (==1.16.3) ; extra == "file-based"
49
+ Requires-Dist: pdfminer.six (==20221105) ; extra == "file-based"
50
+ Requires-Dist: pendulum (<3.0.0)
51
+ Requires-Dist: psutil (==6.1.0)
52
+ Requires-Dist: pyarrow (>=15.0.0,<15.1.0) ; extra == "file-based"
53
+ Requires-Dist: pydantic (>=2.7,<3.0)
54
+ Requires-Dist: pyjwt (>=2.8.0,<3.0.0)
55
+ Requires-Dist: pyrate-limiter (>=3.1.0,<3.2.0)
56
+ Requires-Dist: pytesseract (==0.3.10) ; extra == "file-based"
57
+ Requires-Dist: python-calamine (==0.2.3) ; extra == "file-based"
58
+ Requires-Dist: python-dateutil
59
+ Requires-Dist: python-snappy (==0.7.3) ; extra == "file-based"
60
+ Requires-Dist: python-ulid (>=3.0.0,<4.0.0)
61
+ Requires-Dist: pytz (==2024.1)
62
+ Requires-Dist: rapidfuzz (>=3.10.1,<4.0.0)
63
+ Requires-Dist: requests
64
+ Requires-Dist: requests_cache
65
+ Requires-Dist: serpyco-rs (>=1.10.2,<2.0.0)
66
+ Requires-Dist: sphinx-rtd-theme (>=1.0,<1.1) ; extra == "sphinx-docs"
67
+ Requires-Dist: sqlalchemy (>=2.0,<3.0,!=2.0.36) ; extra == "sql"
68
+ Requires-Dist: tiktoken (==0.8.0) ; extra == "vector-db-based"
69
+ Requires-Dist: unstructured.pytesseract (>=0.3.12) ; extra == "file-based"
70
+ Requires-Dist: unstructured[docx,pptx] (==0.10.27) ; extra == "file-based"
71
+ Requires-Dist: wcmatch (==10.0)
72
+ Requires-Dist: xmltodict (>=0.13.0,<0.14.0)
73
+ Project-URL: Documentation, https://docs.airbyte.io/
74
+ Project-URL: Repository, https://github.com/airbytehq/airbyte-python-cdk
75
+ Description-Content-Type: text/markdown
76
+
77
+ # Airbyte Python CDK and Low-Code CDK
78
+
79
+ Airbyte Python CDK is a framework for building Airbyte API Source Connectors. It provides a set of
80
+ classes and helpers that make it easy to build a connector against an HTTP API (REST, GraphQL, etc),
81
+ or a generic Python source connector.
82
+
83
+ ## Usage
84
+
85
+ If you're looking to build a connector, we highly recommend that you
86
+ [start with the Connector Builder](https://docs.airbyte.com/connector-development/connector-builder-ui/overview).
87
+ It should be enough for 90% of connectors out there. For more flexible and complex connectors, use the
88
+ [low-code CDK and `SourceDeclarativeManifest`](https://docs.airbyte.com/connector-development/config-based/low-code-cdk-overview).
89
+
90
+ If that doesn't work, then consider building on top of the
91
+ [lower-level Python CDK itself](https://docs.airbyte.com/connector-development/cdk-python/).
92
+
93
+ ### Quick Start
94
+
95
+ To get started on a Python CDK based connector or a low-code connector, you can generate a connector
96
+ project from a template:
97
+
98
+ ```bash
99
+ # from the repo root
100
+ cd airbyte-integrations/connector-templates/generator
101
+ ./generate.sh
102
+ ```
103
+
104
+ ### Example Connectors
105
+
106
+ **HTTP Connectors**:
107
+
108
+ - [Stripe](https://github.com/airbytehq/airbyte/blob/master/airbyte-integrations/connectors/source-stripe/)
109
+ - [Salesforce](https://github.com/airbytehq/airbyte/blob/master/airbyte-integrations/connectors/source-salesforce/)
110
+
111
+ **Python connectors using the bare-bones `Source` abstraction**:
112
+
113
+ - [Google Sheets](https://github.com/airbytehq/airbyte/blob/master/airbyte-integrations/connectors/source-google-sheets/google_sheets_source/google_sheets_source.py)
114
+
115
+ This will generate a project with a type and a name of your choice and put it in
116
+ `airbyte-integrations/connectors`. Open the directory with your connector in an editor and follow
117
+ the `TODO` items.
118
+
119
+ ## Python CDK Overview
120
+
121
+ Airbyte CDK code is within `airbyte_cdk` directory. Here's a high level overview of what's inside:
122
+
123
+ - `connector_builder`. Internal wrapper that helps the Connector Builder platform run a declarative
124
+ manifest (low-code connector). You should not use this code directly. If you need to run a
125
+ `SourceDeclarativeManifest`, take a look at
126
+ [`source-declarative-manifest`](https://github.com/airbytehq/airbyte/tree/master/airbyte-integrations/connectors/source-declarative-manifest)
127
+ connector implementation instead.
128
+ - `destinations`. Basic Destination connector support! If you're building a Destination connector in
129
+ Python, try that. Some of our vector DB destinations like `destination-pinecone` are using that
130
+ code.
131
+ - `models` expose `airbyte_protocol.models` as a part of `airbyte_cdk` package.
132
+ - `sources/concurrent_source` is the Concurrent CDK implementation. It supports reading data from
133
+ streams concurrently per slice / partition, useful for connectors with high throughput and high
134
+ number of records.
135
+ - `sources/declarative` is the low-code CDK. It works on top of Airbyte Python CDK, but provides a
136
+ declarative manifest language to define streams, operations, etc. This makes it easier to build
137
+ connectors without writing Python code.
138
+ - `sources/file_based` is the CDK for file-based sources. Examples include S3, Azure, GCS, etc.
139
+
140
+ ## Contributing
141
+
142
+ Thank you for being interested in contributing to Airbyte Python CDK! Here are some guidelines to
143
+ get you started:
144
+
145
+ - We adhere to the [code of conduct](/CODE_OF_CONDUCT.md).
146
+ - You can contribute by reporting bugs, posting github discussions, opening issues, improving
147
+ [documentation](/docs/), and submitting pull requests with bugfixes and new features alike.
148
+ - If you're changing the code, please add unit tests for your change.
149
+ - When submitting issues or PRs, please add a small reproduction project. Using the changes in your
150
+ connector and providing that connector code as an example (or a satellite PR) helps!
151
+
152
+ ### First time setup
153
+
154
+ Install the project dependencies and development tools:
155
+
156
+ ```bash
157
+ poetry install --all-extras
158
+ ```
159
+
160
+ Installing all extras is required to run the full suite of unit tests.
161
+
162
+ #### Running tests locally
163
+
164
+ - Iterate on the CDK code locally
165
+ - Run tests via `poetry run poe unit-test-with-cov`, or `python -m pytest -s unit_tests` if you want
166
+ to pass pytest options.
167
+ - Run `poetry run poe check-local` to lint all code, type-check modified code, and run unit tests
168
+ with coverage in one command.
169
+
170
+ To see all available scripts, run `poetry run poe`.
171
+
172
+ #### Formatting the code
173
+
174
+ - Iterate on the CDK code locally
175
+ - Run `poetry run ruff format` to format your changes.
176
+
177
+ To see all available `ruff` options, run `poetry run ruff`.
178
+
179
+ ##### Autogenerated files
180
+
181
+ Low-code CDK models are generated from `sources/declarative/declarative_component_schema.yaml`. If
182
+ the iteration you are working on includes changes to the models or the connector generator, you
183
+ might want to regenerate them. In order to do that, you can run:
184
+
185
+ ```bash
186
+ poetry run poe build
187
+ ```
188
+
189
+ This will generate the code generator docker image and the component manifest files based on the
190
+ schemas and templates.
191
+
192
+ #### Testing
193
+
194
+ All tests are located in the `unit_tests` directory. Run `poetry run poe unit-test-with-cov` to run
195
+ them. This also presents a test coverage report. For faster iteration with no coverage report and
196
+ more options, `python -m pytest -s unit_tests` is a good place to start.
197
+
198
+ #### Building and testing a connector with your local CDK
199
+
200
+ When developing a new feature in the CDK, you may find it helpful to run a connector that uses that
201
+ new feature. You can test this in one of two ways:
202
+
203
+ - Running a connector locally
204
+ - Building and running a source via Docker
205
+
206
+ ##### Installing your local CDK into a local Python connector
207
+
208
+ Open the connector's `pyproject.toml` file and replace the line with `airbyte_cdk` with the
209
+ following:
210
+
211
+ ```toml
212
+ airbyte_cdk = { path = "../../../airbyte-cdk/python/airbyte_cdk", develop = true }
213
+ ```
214
+
215
+ Then, running `poetry update` should reinstall `airbyte_cdk` from your local working directory.
216
+
217
+ ##### Building a Python connector in Docker with your local CDK installed
218
+
219
+ _Pre-requisite: Install the
220
+ [`airbyte-ci` CLI](https://github.com/airbytehq/airbyte/blob/master/airbyte-ci/connectors/pipelines/README.md)_
221
+
222
+ You can build your connector image with the local CDK using
223
+
224
+ ```bash
225
+ # from the airbytehq/airbyte base directory
226
+ airbyte-ci connectors --use-local-cdk --name=<CONNECTOR> build
227
+ ```
228
+
229
+ Note that the local CDK is injected at build time, so if you make changes, you will have to run the
230
+ build command again to see them reflected.
231
+
232
+ ##### Running Connector Acceptance Tests for a single connector in Docker with your local CDK installed
233
+
234
+ _Pre-requisite: Install the
235
+ [`airbyte-ci` CLI](https://github.com/airbytehq/airbyte/blob/master/airbyte-ci/connectors/pipelines/README.md)_
236
+
237
+ To run acceptance tests for a single connector using the local CDK, from the connector directory,
238
+ run
239
+
240
+ ```bash
241
+ airbyte-ci connectors --use-local-cdk --name=<CONNECTOR> test
242
+ ```
243
+
244
+ #### When you don't have access to the API
245
+
246
+ There may be a time when you do not have access to the API (either because you don't have the
247
+ credentials, network access, etc.). You will probably still want to do end-to-end testing at least
248
+ once. In order to do so, you can emulate the server you would be reaching using a server stubbing
249
+ tool.
250
+
251
+ For example, using [mockserver](https://www.mock-server.com/), you can set up an expectation file
252
+ like this:
253
+
254
+ ```json
255
+ {
256
+ "httpRequest": {
257
+ "method": "GET",
258
+ "path": "/data"
259
+ },
260
+ "httpResponse": {
261
+ "body": "{\"data\": [{\"record_key\": 1}, {\"record_key\": 2}]}"
262
+ }
263
+ }
264
+ ```
265
+
266
+ Assuming this file has been created at `secrets/mock_server_config/expectations.json`, running the
267
+ following command will make any request on path `/data` return the response defined in
268
+ the expectation file:
269
+
270
+ ```bash
271
+ docker run -d --rm -v $(pwd)/secrets/mock_server_config:/config -p 8113:8113 --env MOCKSERVER_LOG_LEVEL=TRACE --env MOCKSERVER_SERVER_PORT=8113 --env MOCKSERVER_WATCH_INITIALIZATION_JSON=true --env MOCKSERVER_PERSISTED_EXPECTATIONS_PATH=/config/expectations.json --env MOCKSERVER_INITIALIZATION_JSON_PATH=/config/expectations.json mockserver/mockserver:5.15.0
272
+ ```
273
+
274
+ HTTP requests to `localhost:8113/data` should now return the body defined in the expectations file.
275
+ To test this, the implementer either has to change the code which defines the base URL for Python
276
+ source or update the `url_base` from low-code. With the Connector Builder running in docker, you
277
+ will have to use domain `host.docker.internal` instead of `localhost` as the requests are executed
278
+ within docker.
279
+
280
+ #### Publishing a new version to PyPi
281
+
282
+ Python CDK has a
283
+ [GitHub workflow](https://github.com/airbytehq/airbyte/actions/workflows/publish-cdk-command-manually.yml)
284
+ that manages the CDK changelog, making a new release for `airbyte_cdk`, publishing it to PyPI, and
285
+ then making a commit to update (and subsequently auto-release)
286
+ [`source-declarative-manifest`](https://github.com/airbytehq/airbyte/tree/master/airbyte-integrations/connectors/source-declarative-manifest)
287
+ and Connector Builder (in the platform repository).
288
+
289
+ > [!Note]: The workflow will handle the `CHANGELOG.md` entry for you. You should not add changelog
290
+ > lines in your PRs to the CDK itself.
291
+
292
+ > [!Warning]: The workflow bumps the version on its own, please don't change the CDK version in
293
+ > `pyproject.toml` manually.
294
+
295
+ 1. You only trigger the release workflow once all the PRs that you want to be included are already
296
+ merged into the `master` branch.
297
+ 2. Run the
298
+ [`Publish CDK Manually`](https://github.com/airbytehq/airbyte/actions/workflows/publish-cdk-command-manually.yml)
299
+ workflow from master using `release-type=major|minor|patch` and setting the changelog message.
300
+ 3. When the workflow runs, it will commit a new version directly to master branch.
301
+ 4. The workflow will bump the version of `source-declarative-manifest` according to the
302
+ `release-type` of the CDK, then commit these changes back to master. The commit to master will
303
+ kick off a publish of the new version of `source-declarative-manifest`.
304
+ 5. The workflow will also add a pull request to `airbyte-platform-internal` repo to bump the
305
+ dependency in Connector Builder.
306
+
@@ -62,7 +62,7 @@ airbyte_cdk/sources/declarative/checks/check_stream.py,sha256=dAA-UhmMj0WLXCkRQr
62
62
  airbyte_cdk/sources/declarative/checks/connection_checker.py,sha256=MBRJo6WJlZQHpIfOGaNOkkHUmgUl_4wDM6VPo41z5Ss,1383
63
63
  airbyte_cdk/sources/declarative/concurrency_level/__init__.py,sha256=5XUqrmlstYlMM0j6crktlKQwALek0uiz2D3WdM46MyA,191
64
64
  airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py,sha256=YIwCTCpOr_QSNW4ltQK0yUGWInI8PKNY216HOOegYLk,2101
65
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py,sha256=PWqtQ6xzRZiM0XrMO_zCJjl9tvbFMJMwSec0nN2ZekA,26434
65
+ airbyte_cdk/sources/declarative/concurrent_declarative_source.py,sha256=F2X2ZS9eDfrohNbxG2TgPW-f4YP8IAkMjO1XHtD6NIg,23464
66
66
  airbyte_cdk/sources/declarative/datetime/__init__.py,sha256=l9LG7Qm6e5r_qgqfVKnx3mXYtg1I9MmMjomVIPfU4XA,177
67
67
  airbyte_cdk/sources/declarative/datetime/datetime_parser.py,sha256=SX9JjdesN1edN2WVUVMzU_ptqp2QB1OnsnjZ4mwcX7w,2579
68
68
  airbyte_cdk/sources/declarative/datetime/min_max_datetime.py,sha256=0BHBtDNQZfvwM45-tY5pNlTcKAFSGGNxemoi0Jic-0E,5785
@@ -80,15 +80,14 @@ airbyte_cdk/sources/declarative/extractors/__init__.py,sha256=YFuL4D4RuuB8E1DNSb
80
80
  airbyte_cdk/sources/declarative/extractors/dpath_extractor.py,sha256=wR4Ol4MG2lt5UlqXF5EU_k7qa5cN4_-luu3PJ1PlO3A,3131
81
81
  airbyte_cdk/sources/declarative/extractors/http_selector.py,sha256=2zWZ4ewTqQC8VwkjS0xD_u350Km3SiYP7hpOOgiLg5o,1169
82
82
  airbyte_cdk/sources/declarative/extractors/record_extractor.py,sha256=XJELMjahAsaomlvQgN2zrNO0DJX0G0fr9r682gUz7Pg,691
83
- airbyte_cdk/sources/declarative/extractors/record_filter.py,sha256=yTdEkyDUSW2KbFkEwJJMlS963C955LgCCOVfTmmScpQ,3367
83
+ airbyte_cdk/sources/declarative/extractors/record_filter.py,sha256=OJ9xmhNWNwwzxYOeIrDy1GINb1zH9MBy6suC5tm2LSk,3545
84
84
  airbyte_cdk/sources/declarative/extractors/record_selector.py,sha256=AkXPOWyp741cpYLBl9AbmVmOQmQ2BzZ2XjgsMEB6gGc,6583
85
85
  airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py,sha256=LhqGDfX06_dDYLKsIVnwQ_nAWCln-v8PV7Wgt_QVeTI,6533
86
- airbyte_cdk/sources/declarative/incremental/__init__.py,sha256=zEERPIXz1WxCJypqlSXZCFIpT4-mIsjzRdmFlX2-nMg,1210
87
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py,sha256=-ECXZbDh3nw7G4mBncsTT_68LWQvS8TySIgckBTZZQQ,11899
86
+ airbyte_cdk/sources/declarative/incremental/__init__.py,sha256=CmZl9ddwMZFo8L7mEl_OFHN3ahIFRSYrJjMbR_cJaFA,1006
88
87
  airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py,sha256=_UzUnSIUsDbRgbFTXgSyZEFb4ws-KdhdQPWO8mFbV7U,22028
89
88
  airbyte_cdk/sources/declarative/incremental/declarative_cursor.py,sha256=5Bhw9VRPyIuCaD0wmmq_L3DZsa-rJgtKSEUzSd8YYD0,536
90
89
  airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py,sha256=3_EEZop94bMitZaJd2PF5Q2Xt9v94tYg7p7YJz8tAFc,15869
91
- airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py,sha256=cdk4gSuYQmqcxxIOclhms6cnI1qm-FrSu7lmZULxOPM,16199
90
+ airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py,sha256=hElcYijbOHjdLKOMA7W7aizEbf22r7OSApXALP875uI,15749
92
91
  airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py,sha256=2YBOA2NnwAeIKlIhSwUB_W-FaGnPcmrG_liY7b4mV2Y,8365
93
92
  airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py,sha256=10LFv1QPM-agVKl6eaANmEBOfd7gZgBrkoTcMggsieQ,4809
94
93
  airbyte_cdk/sources/declarative/interpolation/__init__.py,sha256=tjUJkn3B-iZ-p7RP2c3dVZejrGiQeooGmS5ibWTuUL4,437
@@ -110,7 +109,7 @@ airbyte_cdk/sources/declarative/parsers/__init__.py,sha256=ZnqYNxHsKCgO38IwB34RQ
110
109
  airbyte_cdk/sources/declarative/parsers/custom_exceptions.py,sha256=Rir9_z3Kcd5Es0-LChrzk-0qubAsiK_RSEnLmK2OXm8,553
111
110
  airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py,sha256=jVZ3ZV5YZrmDNIX5cM2mugXmnbH27zHRcD22_3oatpo,8454
112
111
  airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py,sha256=IWUOdF03o-aQn0Occo1BJCxU0Pz-QILk5L67nzw2thw,6803
113
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py,sha256=mOO0HahnHP0yv5LHFCayIx98R-yYHw6qkY9T5BxSlBg,98683
112
+ airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py,sha256=tO7xkv4y5iH6wGkj5As1T5ItUQxlw6cLflHAH48PKwc,96355
114
113
  airbyte_cdk/sources/declarative/partition_routers/__init__.py,sha256=8uGos2u7TFTx_EJBdcjdUGn3Eyx6jUuEa1_VB8UP_dI,631
115
114
  airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py,sha256=c5cuVFM6NFkuQqG8Z5IwkBuwDrvXZN1CunUOM_L0ezg,6892
116
115
  airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py,sha256=t7pRdFWfFWJtQQG19c9PVeMODyO2BknRTakpM5U9N-8,4844
@@ -156,7 +155,7 @@ airbyte_cdk/sources/declarative/requesters/requester.py,sha256=iVVpXQ4KEd9OyZNwm
156
155
  airbyte_cdk/sources/declarative/retrievers/__init__.py,sha256=FVQpUGVwp2Gibk4gp07VmLKX5AafUlsZWFSrDpUDuJM,443
157
156
  airbyte_cdk/sources/declarative/retrievers/async_retriever.py,sha256=WDFnjrXLz3-YEjFhmlMkWAn9AJvnZ0mk9FyC8DAhEYk,4976
158
157
  airbyte_cdk/sources/declarative/retrievers/retriever.py,sha256=XPLs593Xv8c5cKMc37XzUAYmzlXd1a7eSsspM-CMuWA,1696
159
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py,sha256=6IP6e9cjGEU2y77lcOKj1bqn3bYGBAsP8vJU4Skzp30,24182
158
+ airbyte_cdk/sources/declarative/retrievers/simple_retriever.py,sha256=N4swGw5mfuTXJ2R7AKX18CHzizsr69pXwt5uSHLPi48,24172
160
159
  airbyte_cdk/sources/declarative/schema/__init__.py,sha256=ul8L9S0-__AMEdbCLHBq-PMEeA928NVp8BB83BMotfU,517
161
160
  airbyte_cdk/sources/declarative/schema/default_schema_loader.py,sha256=KTACrIE23a83wsm3Rd9Eb4K6-20lrGqYxTHNp9yxsso,1820
162
161
  airbyte_cdk/sources/declarative/schema/inline_schema_loader.py,sha256=bVETE10hRsatRJq3R3BeyRR0wIoK3gcP1gcpVRQ_P5U,464
@@ -165,7 +164,7 @@ airbyte_cdk/sources/declarative/schema/schema_loader.py,sha256=kjt8v0N5wWKA5zyLn
165
164
  airbyte_cdk/sources/declarative/spec/__init__.py,sha256=H0UwoRhgucbKBIzg85AXrifybVmfpwWpPdy22vZKVuo,141
166
165
  airbyte_cdk/sources/declarative/spec/spec.py,sha256=ODSNUgkDOhnLQnwLjgSaME6R3kNeywjROvbNrWEnsgU,1876
167
166
  airbyte_cdk/sources/declarative/stream_slicers/__init__.py,sha256=sI9vhc95RwJYOnA0VKjcbtKgFcmAbWjhdWBXFbAijOs,176
168
- airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py,sha256=7KE_qBBP3QYA7qQdOE42u3fwUM5S1FD5rowf7gtu3qk,3462
167
+ airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py,sha256=E7feZ5xkHwFHODq8FSjwdGe291RZoCMCRHT1rWnQ1lI,3463
169
168
  airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py,sha256=SOkIPBi2Wu7yxIvA15yFzUAB95a3IzA8LPq5DEqHQQc,725
170
169
  airbyte_cdk/sources/declarative/transformations/__init__.py,sha256=CPJ8TlMpiUmvG3624VYu_NfTzxwKcfBjM2Q2wJ7fkSA,919
171
170
  airbyte_cdk/sources/declarative/transformations/add_fields.py,sha256=r4YdAuAk2bQtNWJMztIIy2CC-NglD9NeK1s1TeO9wkw,5027
@@ -246,7 +245,7 @@ airbyte_cdk/sources/streams/concurrent/abstract_stream.py,sha256=3OB5VsvOkJmCxIM
246
245
  airbyte_cdk/sources/streams/concurrent/abstract_stream_facade.py,sha256=QTry1QCBUwJDw1QSCEvz23s7zIEx_7QMxkPq9j-oPIQ,1358
247
246
  airbyte_cdk/sources/streams/concurrent/adapters.py,sha256=QP_64kQo-b3sRNHZA5aqrgCJqAhIVegRM3vJ8jGyuSY,15213
248
247
  airbyte_cdk/sources/streams/concurrent/availability_strategy.py,sha256=4La5v2UffSjGnhmF4kwNIKt_g3RXk2ux1mSHA1ejgYM,2898
249
- airbyte_cdk/sources/streams/concurrent/cursor.py,sha256=SbkWn2t5uxVhT6W657zrENWnxC74oyp_WU9ol-_w5so,21215
248
+ airbyte_cdk/sources/streams/concurrent/cursor.py,sha256=Hke6CpD8Sq1FS4g1Xuht39UN7hKkGy1mvOxvQrm1lLM,20810
250
249
  airbyte_cdk/sources/streams/concurrent/default_stream.py,sha256=K3rLMpYhS7nnmvwQ52lqBy7DQdFMJpvvT7sgBg_ckA8,3207
251
250
  airbyte_cdk/sources/streams/concurrent/exceptions.py,sha256=JOZ446MCLpmF26r9KfS6OO_6rGjcjgJNZdcw6jccjEI,468
252
251
  airbyte_cdk/sources/streams/concurrent/helpers.py,sha256=S6AW8TgIASCZ2UuUcQLE8OzgYUHWt2-KPOvNPwnQf-Q,1596
@@ -314,6 +313,7 @@ airbyte_cdk/test/state_builder.py,sha256=kLPql9lNzUJaBg5YYRLJlY_Hy5JLHJDVyKPMZMo
314
313
  airbyte_cdk/test/utils/__init__.py,sha256=Hu-1XT2KDoYjDF7-_ziDwv5bY3PueGjANOCbzeOegDg,57
315
314
  airbyte_cdk/test/utils/data.py,sha256=CkCR1_-rujWNmPXFR1IXTMwx1rAl06wAyIKWpDcN02w,820
316
315
  airbyte_cdk/test/utils/http_mocking.py,sha256=F2hpm2q4ijojQN5u2XtgTAp8aNgHgJ64eZNkZ9BW0ig,550
316
+ airbyte_cdk/test/utils/manifest_only_fixtures.py,sha256=kGg8kSmEouHPDCJf8GKkKqEAQaCLYfgdPEvRTb64dCI,1898
317
317
  airbyte_cdk/test/utils/reading.py,sha256=SOTDYlps6Te9KumfTJ3vVDSm9EUXhvKtE8aD7gvdPlg,965
318
318
  airbyte_cdk/utils/__init__.py,sha256=gHjOCoUkolS_nKtgFSudXUY-ObK2vUo6aNQLvW7o8q8,347
319
319
  airbyte_cdk/utils/airbyte_secrets_utils.py,sha256=wEtRnl5KRhN6eLJwrDrC4FJjyqt_4vkA1F65mdl8c24,3142
@@ -331,8 +331,8 @@ airbyte_cdk/utils/slice_hasher.py,sha256=-pHexlNYoWYPnXNH-M7HEbjmeJe9Zk7SJijdQ7d
331
331
  airbyte_cdk/utils/spec_schema_transformations.py,sha256=-5HTuNsnDBAhj-oLeQXwpTGA0HdcjFOf2zTEMUTTg_Y,816
332
332
  airbyte_cdk/utils/stream_status_utils.py,sha256=ZmBoiy5HVbUEHAMrUONxZvxnvfV9CesmQJLDTAIWnWw,1171
333
333
  airbyte_cdk/utils/traced_exception.py,sha256=C8uIBuCL_E4WnBAOPSxBicD06JAldoN9fGsQDp463OY,6292
334
- airbyte_cdk-6.8.2.dev1.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
335
- airbyte_cdk-6.8.2.dev1.dist-info/METADATA,sha256=yATvM83Zo6tZfb5wnnP-1YGmBjL1ZR2zWzZFr09J1R8,6112
336
- airbyte_cdk-6.8.2.dev1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
337
- airbyte_cdk-6.8.2.dev1.dist-info/entry_points.txt,sha256=fj-e3PAQvsxsQzyyq8UkG1k8spunWnD4BAH2AwlR6NM,95
338
- airbyte_cdk-6.8.2.dev1.dist-info/RECORD,,
334
+ airbyte_cdk-6.8.3rc1.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
335
+ airbyte_cdk-6.8.3rc1.dist-info/METADATA,sha256=BJ498EOIPCD0I5hhKWn_RkGxSLEu_ewOVqDo75QpFAs,13483
336
+ airbyte_cdk-6.8.3rc1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
337
+ airbyte_cdk-6.8.3rc1.dist-info/entry_points.txt,sha256=fj-e3PAQvsxsQzyyq8UkG1k8spunWnD4BAH2AwlR6NM,95
338
+ airbyte_cdk-6.8.3rc1.dist-info/RECORD,,
@@ -1,270 +0,0 @@
1
- import copy
2
-
3
- #
4
- # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
5
- #
6
- import logging
7
- from collections import OrderedDict
8
- from typing import Any, Callable, Iterable, Mapping, MutableMapping, Optional
9
-
10
- from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
11
- from airbyte_cdk.sources.declarative.incremental.declarative_cursor import DeclarativeCursor
12
- from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter
13
- from airbyte_cdk.sources.message import MessageRepository
14
- from airbyte_cdk.sources.streams.checkpoint.per_partition_key_serializer import (
15
- PerPartitionKeySerializer,
16
- )
17
- from airbyte_cdk.sources.streams.concurrent.cursor import Cursor, CursorField
18
- from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
19
- from airbyte_cdk.sources.types import Record, StreamSlice, StreamState
20
-
21
- logger = logging.getLogger("airbyte")
22
-
23
-
24
class ConcurrentCursorFactory:
    """Wraps a callable that builds a ``Cursor`` from a stream state mapping."""

    def __init__(self, create_function: Callable[..., Cursor]):
        # Kept as-is; `create` invokes it with a single `stream_state` keyword argument.
        self._create_function = create_function

    def create(self, stream_state: Mapping[str, Any]) -> Cursor:
        """Return the cursor produced by the wrapped callable for ``stream_state``."""
        build_cursor = self._create_function
        return build_cursor(stream_state=stream_state)
30
-
31
-
32
class ConcurrentPerPartitionCursor(Cursor):
    """
    Manages state per partition when a stream has many partitions, to prevent data loss or duplication.

    **Partition Limitation and Limit Reached Logic**

    - **DEFAULT_MAX_PARTITIONS_NUMBER**: The maximum number of partitions to keep in memory (default is 10,000).
    - **_cursor_per_partition**: An ordered dictionary that stores cursors for each partition.
    - **_over_limit**: A counter that increments each time an oldest partition is removed when the limit is exceeded.

    The class ensures that the number of partitions tracked does not exceed the `DEFAULT_MAX_PARTITIONS_NUMBER` to prevent excessive memory usage.

    - When the number of partitions exceeds the limit, the oldest partitions are removed from `_cursor_per_partition`, and `_over_limit` is incremented accordingly.
    - The `limit_reached` method returns `True` when `_over_limit` exceeds `DEFAULT_MAX_PARTITIONS_NUMBER`, indicating that the global cursor should be used instead of per-partition cursors.

    This approach avoids unnecessary switching to a global cursor due to temporary spikes in partition counts, ensuring that switching is only done when a sustained high number of partitions is observed.
    """

    DEFAULT_MAX_PARTITIONS_NUMBER = 10000
    _NO_STATE: Mapping[str, Any] = {}
    _NO_CURSOR_STATE: Mapping[str, Any] = {}
    _KEY = 0
    _VALUE = 1
    # Fallback state handed to cursors created for partitions that have no saved state.
    # Rebound per instance by `_set_initial_state`; the shared default is never mutated.
    _state_to_migrate_from: Mapping[str, Any] = {}

    def __init__(
        self,
        cursor_factory: ConcurrentCursorFactory,
        partition_router: PartitionRouter,
        stream_name: str,
        stream_namespace: Optional[str],
        stream_state: Any,
        message_repository: MessageRepository,
        connector_state_manager: ConnectorStateManager,
        cursor_field: CursorField,
    ) -> None:
        """
        Store the collaborators and seed the per-partition cursors from ``stream_state``.

        Args:
            cursor_factory: Builds one cursor per partition from that partition's state.
            partition_router: Source of the partitions to iterate over.
            stream_name: Name used when updating and emitting the stream state.
            stream_namespace: Namespace used when updating and emitting the stream state.
            stream_state: Incoming state; see `_set_initial_state` for the accepted formats.
            message_repository: Receives the state messages emitted by this cursor.
            connector_state_manager: Holds the stream state and creates state messages.
            cursor_field: Exposed through the `cursor_field` property.
        """
        self._stream_name = stream_name
        self._stream_namespace = stream_namespace
        self._message_repository = message_repository
        self._connector_state_manager = connector_state_manager
        self._cursor_field = cursor_field

        self._cursor_factory = cursor_factory
        self._partition_router = partition_router

        # The dict is ordered to ensure that once the maximum number of partitions is reached,
        # the oldest partitions can be efficiently removed, maintaining the most recent partitions.
        self._cursor_per_partition: OrderedDict[str, Cursor] = OrderedDict()
        self._over_limit = 0
        self._partition_serializer = PerPartitionKeySerializer()

        self._set_initial_state(stream_state)

    @property
    def cursor_field(self) -> CursorField:
        """The cursor field this cursor was constructed with."""
        return self._cursor_field

    @property
    def state(self) -> MutableMapping[str, Any]:
        """
        Build the per-partition state as ``{"states": [{"partition": ..., "cursor": ...}, ...]}``.

        Partitions whose converted cursor state is empty are omitted.
        """
        states = []
        for partition_key, cursor in self._cursor_per_partition.items():
            # NOTE(review): this reaches into private attributes of the partition cursor, so it
            # presumably only works with cursors exposing `_connector_state_converter` and
            # `_cursor_field` -- confirm against the cursor type produced by the factory.
            cursor_state = cursor._connector_state_converter.convert_to_state_message(
                cursor._cursor_field, cursor.state
            )
            if cursor_state:
                states.append(
                    {
                        "partition": self._to_dict(partition_key),
                        # Deep copy so later cursor updates cannot alter the returned snapshot.
                        "cursor": copy.deepcopy(cursor_state),
                    }
                )
        state: dict[str, Any] = {"states": states}
        return state

    def close_partition(self, partition: Partition) -> None:
        """
        Close ``partition`` on the cursor tracking it, without emitting a state message here.

        Raises:
            KeyError: If no cursor is tracked for the partition.
        """
        # NOTE(review): reads the private `_stream_slice` attribute of the partition; confirm
        # that every Partition implementation passed here provides it.
        partition_key = self._to_partition_key(partition._stream_slice.partition)
        self._cursor_per_partition[partition_key].close_partition_without_emit(partition=partition)

    def ensure_at_least_one_state_emitted(self) -> None:
        """
        The platform expects to have at least one state message on successful syncs. Hence, whatever happens, we expect this method to be
        called.
        """
        self._emit_state_message()

    def _emit_state_message(self) -> None:
        """Push the current state to the state manager and emit the resulting state message."""
        self._connector_state_manager.update_state_for_stream(
            self._stream_name,
            self._stream_namespace,
            self.state,
        )
        state_message = self._connector_state_manager.create_state_message(
            self._stream_name, self._stream_namespace
        )
        self._message_repository.emit_message(state_message)

    def stream_slices(self) -> Iterable[StreamSlice]:
        """Yield the slices of every partition produced by the partition router."""
        slices = self._partition_router.stream_slices()
        for partition in slices:
            yield from self.generate_slices_from_partition(partition)

    def generate_slices_from_partition(self, partition: StreamSlice) -> Iterable[StreamSlice]:
        """
        Yield one slice per cursor slice of ``partition``, creating its cursor on first use.

        A cursor created here starts from the migrated/global fallback state when one was
        provided, otherwise from an empty state.
        """
        partition_key = self._to_partition_key(partition.partition)
        cursor = self._cursor_per_partition.get(partition_key)
        if cursor is None:
            # Only make room when a new entry is about to be added. Enforcing the limit for an
            # already-tracked partition could evict a live cursor (even the one requested here)
            # and would inflate `_over_limit` without any new partition being seen.
            self._ensure_partition_limit()
            partition_state = self._state_to_migrate_from or self._NO_CURSOR_STATE
            cursor = self._create_cursor(partition_state)
            self._cursor_per_partition[partition_key] = cursor

        for cursor_slice in cursor.stream_slices():
            yield StreamSlice(
                partition=partition, cursor_slice=cursor_slice, extra_fields=partition.extra_fields
            )

    def _ensure_partition_limit(self) -> None:
        """
        Ensure the maximum number of partitions is not exceeded. If so, the oldest added partition will be dropped.
        """
        # Trim down to MAX - 1 entries so that the caller can insert one more.
        while len(self._cursor_per_partition) > self.DEFAULT_MAX_PARTITIONS_NUMBER - 1:
            self._over_limit += 1
            # popitem(last=False) removes the entry that was inserted first.
            oldest_partition = self._cursor_per_partition.popitem(last=False)[0]
            logger.warning(
                "The maximum number of partitions has been reached. Dropping the oldest partition: %s. Over limit: %s.",
                oldest_partition,
                self._over_limit,
            )

    def limit_reached(self) -> bool:
        """Return True once more than `DEFAULT_MAX_PARTITIONS_NUMBER` partitions have been dropped."""
        return self._over_limit > self.DEFAULT_MAX_PARTITIONS_NUMBER

    def _set_initial_state(self, stream_state: StreamState) -> None:
        """
        Set the initial state for the cursors.

        This method initializes the state for each partition cursor using the provided stream state.
        If a partition state is provided in the stream state, it will update the corresponding partition cursor with this state.

        Additionally, it sets the parent state for partition routers that are based on parent streams. If a partition router
        does not have parent streams, this step will be skipped due to the default PartitionRouter implementation.

        Args:
            stream_state (StreamState): The state of the streams to be set. The format of the stream state should be:
                {
                    "states": [
                        {
                            "partition": {
                                "partition_key": "value"
                            },
                            "cursor": {
                                "last_updated": "2023-05-27T00:00:00Z"
                            }
                        }
                    ],
                    "parent_state": {
                        "parent_stream_name": {
                            "last_updated": "2023-05-27T00:00:00Z"
                        }
                    }
                }
        """
        if not stream_state:
            return

        if "states" not in stream_state:
            # We assume that `stream_state` is in a global format that can be applied to all partitions.
            # Example: {"global_state_format_key": "global_state_format_value"}
            self._state_to_migrate_from = stream_state

        else:
            for state in stream_state["states"]:
                self._cursor_per_partition[self._to_partition_key(state["partition"])] = (
                    self._create_cursor(state["cursor"])
                )

            # set default state for missing partitions if it is per partition with fallback to global
            if "state" in stream_state:
                self._state_to_migrate_from = stream_state["state"]

        # Set parent state for partition routers based on parent streams
        self._partition_router.set_initial_state(stream_state)

    def observe(self, record: Record) -> None:
        """
        Forward ``record`` to the cursor of the partition it belongs to.

        Raises:
            ValueError: If the record has no associated slice or its partition is not tracked.
        """
        # Resolve through `_get_cursor` so a missing slice or unknown partition produces the same
        # explicit error as `should_be_synced` instead of an AttributeError/KeyError.
        self._get_cursor(record).observe(record)

    def _to_partition_key(self, partition: Mapping[str, Any]) -> str:
        """Serialize a partition mapping into the string key used by `_cursor_per_partition`."""
        return self._partition_serializer.to_partition_key(partition)

    def _to_dict(self, partition_key: str) -> Mapping[str, Any]:
        """Turn a serialized partition key back into its partition mapping."""
        return self._partition_serializer.to_partition(partition_key)

    def _create_cursor(self, cursor_state: Any) -> Cursor:
        """Create a partition cursor from ``cursor_state`` using the configured factory."""
        return self._cursor_factory.create(stream_state=cursor_state)

    def should_be_synced(self, record: Record) -> bool:
        """Delegate the decision to the cursor of the record's partition."""
        return self._get_cursor(record).should_be_synced(record)

    def is_greater_than_or_equal(self, first: Record, second: Record) -> bool:
        """
        Compare two records of the same partition using that partition's cursor.

        Raises:
            ValueError: If either record lacks an associated slice, or the partitions differ.
        """
        if not first.associated_slice or not second.associated_slice:
            raise ValueError(
                f"Both records should have an associated slice but got {first.associated_slice} and {second.associated_slice}"
            )
        if first.associated_slice.partition != second.associated_slice.partition:
            raise ValueError(
                f"To compare records, partition should be the same but got {first.associated_slice.partition} and {second.associated_slice.partition}"
            )

        return self._get_cursor(first).is_greater_than_or_equal(
            self._convert_record_to_cursor_record(first),
            self._convert_record_to_cursor_record(second),
        )

    @staticmethod
    def _convert_record_to_cursor_record(record: Record) -> Record:
        """Copy ``record`` with the partition removed from its slice, keeping only the cursor slice."""
        # NOTE(review): Record is built positionally as (data, associated_slice); confirm this
        # matches the Record constructor signature in this version of the CDK.
        return Record(
            record.data,
            StreamSlice(partition={}, cursor_slice=record.associated_slice.cursor_slice)
            if record.associated_slice
            else None,
        )

    def _get_cursor(self, record: Record) -> Cursor:
        """
        Return the cursor tracking the partition of ``record``.

        Raises:
            ValueError: If the record has no associated slice or its partition is not tracked.
        """
        if not record.associated_slice:
            raise ValueError(
                "Invalid state as stream slices that are emitted should refer to an existing cursor"
            )
        partition_key = self._to_partition_key(record.associated_slice.partition)
        if partition_key not in self._cursor_per_partition:
            raise ValueError(
                "Invalid state as stream slices that are emitted should refer to an existing cursor"
            )
        return self._cursor_per_partition[partition_key]
@@ -1,111 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: airbyte-cdk
3
- Version: 6.8.2.dev1
4
- Summary: A framework for writing Airbyte Connectors.
5
- Home-page: https://airbyte.com
6
- License: MIT
7
- Keywords: airbyte,connector-development-kit,cdk
8
- Author: Airbyte
9
- Author-email: contact@airbyte.io
10
- Requires-Python: >=3.10,<3.13
11
- Classifier: Development Status :: 3 - Alpha
12
- Classifier: Intended Audience :: Developers
13
- Classifier: License :: OSI Approved :: MIT License
14
- Classifier: Programming Language :: Python :: 3
15
- Classifier: Programming Language :: Python :: 3.10
16
- Classifier: Programming Language :: Python :: 3.11
17
- Classifier: Programming Language :: Python :: 3.12
18
- Classifier: Topic :: Scientific/Engineering
19
- Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
- Provides-Extra: file-based
21
- Provides-Extra: sphinx-docs
22
- Provides-Extra: sql
23
- Provides-Extra: vector-db-based
24
- Requires-Dist: Jinja2 (>=3.1.2,<3.2.0)
25
- Requires-Dist: PyYAML (>=6.0.1,<7.0.0)
26
- Requires-Dist: Sphinx (>=4.2,<4.3) ; extra == "sphinx-docs"
27
- Requires-Dist: airbyte-protocol-models-dataclasses (>=0.14,<0.15)
28
- Requires-Dist: avro (>=1.11.2,<1.12.0) ; extra == "file-based"
29
- Requires-Dist: backoff
30
- Requires-Dist: cachetools
31
- Requires-Dist: cohere (==4.21) ; extra == "vector-db-based"
32
- Requires-Dist: cryptography (>=42.0.5,<44.0.0)
33
- Requires-Dist: dpath (>=2.1.6,<3.0.0)
34
- Requires-Dist: dunamai (>=1.22.0,<2.0.0)
35
- Requires-Dist: fastavro (>=1.8.0,<1.9.0) ; extra == "file-based"
36
- Requires-Dist: genson (==1.3.0)
37
- Requires-Dist: isodate (>=0.6.1,<0.7.0)
38
- Requires-Dist: jsonref (>=0.2,<0.3)
39
- Requires-Dist: jsonschema (>=4.17.3,<4.18.0)
40
- Requires-Dist: langchain (==0.1.16) ; extra == "vector-db-based"
41
- Requires-Dist: langchain_core (==0.1.42)
42
- Requires-Dist: markdown ; extra == "file-based"
43
- Requires-Dist: nltk (==3.9.1)
44
- Requires-Dist: numpy (<2)
45
- Requires-Dist: openai[embeddings] (==0.27.9) ; extra == "vector-db-based"
46
- Requires-Dist: orjson (>=3.10.7,<4.0.0)
47
- Requires-Dist: pandas (==2.2.2)
48
- Requires-Dist: pdf2image (==1.16.3) ; extra == "file-based"
49
- Requires-Dist: pdfminer.six (==20221105) ; extra == "file-based"
50
- Requires-Dist: pendulum (<3.0.0)
51
- Requires-Dist: psutil (==6.1.0)
52
- Requires-Dist: pyarrow (>=15.0.0,<15.1.0) ; extra == "file-based"
53
- Requires-Dist: pydantic (>=2.7,<3.0)
54
- Requires-Dist: pyjwt (>=2.8.0,<3.0.0)
55
- Requires-Dist: pyrate-limiter (>=3.1.0,<3.2.0)
56
- Requires-Dist: pytesseract (==0.3.10) ; extra == "file-based"
57
- Requires-Dist: python-calamine (==0.2.3) ; extra == "file-based"
58
- Requires-Dist: python-dateutil
59
- Requires-Dist: python-snappy (==0.7.3) ; extra == "file-based"
60
- Requires-Dist: python-ulid (>=3.0.0,<4.0.0)
61
- Requires-Dist: pytz (==2024.1)
62
- Requires-Dist: rapidfuzz (>=3.10.1,<4.0.0)
63
- Requires-Dist: requests
64
- Requires-Dist: requests_cache
65
- Requires-Dist: serpyco-rs (>=1.10.2,<2.0.0)
66
- Requires-Dist: sphinx-rtd-theme (>=1.0,<1.1) ; extra == "sphinx-docs"
67
- Requires-Dist: sqlalchemy (>=2.0,<3.0,!=2.0.36) ; extra == "sql"
68
- Requires-Dist: tiktoken (==0.8.0) ; extra == "vector-db-based"
69
- Requires-Dist: unstructured.pytesseract (>=0.3.12) ; extra == "file-based"
70
- Requires-Dist: unstructured[docx,pptx] (==0.10.27) ; extra == "file-based"
71
- Requires-Dist: wcmatch (==10.0)
72
- Requires-Dist: xmltodict (>=0.13.0,<0.14.0)
73
- Project-URL: Documentation, https://docs.airbyte.io/
74
- Project-URL: Repository, https://github.com/airbytehq/airbyte-python-cdk
75
- Description-Content-Type: text/markdown
76
-
77
- # Airbyte Python CDK and Low-Code CDK
78
-
79
- Airbyte Python CDK is a framework for building Airbyte API Source Connectors. It provides a set of
80
- classes and helpers that make it easy to build a connector against an HTTP API (REST, GraphQL, etc),
81
- or a generic Python source connector.
82
-
83
- ## Building Connectors with the CDK
84
-
85
- If you're looking to build a connector, we highly recommend that you first
86
- [start with the Connector Builder](https://docs.airbyte.com/connector-development/connector-builder-ui/overview).
87
- It should be enough for 90% of connectors out there. For more flexible and complex connectors, use the
88
- [low-code CDK and `SourceDeclarativeManifest`](https://docs.airbyte.com/connector-development/config-based/low-code-cdk-overview).
89
-
90
- For more information on building connectors, please see the [Connector Development](https://docs.airbyte.com/connector-development/) guide on [docs.airbyte.com](https://docs.airbyte.com).
91
-
92
- ## Python CDK Overview
93
-
94
- The Airbyte CDK code lives in the `airbyte_cdk` directory. Here's a high-level overview of what's inside:
95
-
96
- - `airbyte_cdk/connector_builder`. Internal wrapper that helps the Connector Builder platform run a declarative manifest (low-code connector). You should not use this code directly. If you need to run a `SourceDeclarativeManifest`, take a look at [`source-declarative-manifest`](https://github.com/airbytehq/airbyte/tree/master/airbyte-integrations/connectors/source-declarative-manifest) connector implementation instead.
97
- - `airbyte_cdk/cli/source_declarative_manifest`. This module defines the `source-declarative-manifest` (aka "SDM") connector execution logic and associated CLI.
98
- - `airbyte_cdk/destinations`. Basic Destination connector support. If you're building a Destination connector in Python, start here; some of our vector DB destinations, such as `destination-pinecone`, use this code.
99
- - `airbyte_cdk/models` exposes `airbyte_protocol.models` as part of the `airbyte_cdk` package.
100
- - `airbyte_cdk/sources/concurrent_source` is the Concurrent CDK implementation. It supports reading data from streams concurrently per slice / partition, which is useful for connectors with high throughput and a high number of records.
101
- - `airbyte_cdk/sources/declarative` is the low-code CDK. It works on top of Airbyte Python CDK, but provides a declarative manifest language to define streams, operations, etc. This makes it easier to build connectors without writing Python code.
102
- - `airbyte_cdk/sources/file_based` is the CDK for file-based sources. Examples include S3, Azure, GCS, etc.
103
-
104
- ## Contributing
105
-
106
- For instructions on how to contribute, please see our [Contributing Guide](docs/CONTRIBUTING.md).
107
-
108
- ## Release Management
109
-
110
- Please see the [Release Management](docs/RELEASES.md) guide for information on how to perform releases and pre-releases.
111
-