airbyte-cdk 6.9.1__py3-none-any.whl → 6.9.1.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +35 -30
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +3 -101
- airbyte_cdk/sources/declarative/interpolation/jinja.py +35 -36
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +2 -53
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +2 -95
- airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +0 -6
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +21 -95
- airbyte_cdk/sources/declarative/partition_routers/__init__.py +1 -2
- airbyte_cdk/sources/streams/http/http_client.py +5 -15
- airbyte_cdk/test/utils/manifest_only_fixtures.py +80 -0
- airbyte_cdk-6.9.1.dev1.dist-info/METADATA +306 -0
- {airbyte_cdk-6.9.1.dist-info → airbyte_cdk-6.9.1.dev1.dist-info}/RECORD +15 -17
- airbyte_cdk/sources/declarative/resolvers/__init__.py +0 -13
- airbyte_cdk/sources/declarative/resolvers/components_resolver.py +0 -55
- airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +0 -106
- airbyte_cdk-6.9.1.dist-info/METADATA +0 -108
- {airbyte_cdk-6.9.1.dist-info → airbyte_cdk-6.9.1.dev1.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.9.1.dist-info → airbyte_cdk-6.9.1.dev1.dist-info}/WHEEL +0 -0
- {airbyte_cdk-6.9.1.dist-info → airbyte_cdk-6.9.1.dev1.dist-info}/entry_points.txt +0 -0
@@ -119,9 +119,6 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import
|
|
119
119
|
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
|
120
120
|
CheckStream as CheckStreamModel,
|
121
121
|
)
|
122
|
-
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
|
123
|
-
ComponentMappingDefinition as ComponentMappingDefinitionModel,
|
124
|
-
)
|
125
122
|
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
|
126
123
|
CompositeErrorHandler as CompositeErrorHandlerModel,
|
127
124
|
)
|
@@ -194,9 +191,6 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import
|
|
194
191
|
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
|
195
192
|
GzipJsonDecoder as GzipJsonDecoderModel,
|
196
193
|
)
|
197
|
-
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
|
198
|
-
HttpComponentsResolver as HttpComponentsResolverModel,
|
199
|
-
)
|
200
194
|
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
|
201
195
|
HttpRequester as HttpRequesterModel,
|
202
196
|
)
|
@@ -304,7 +298,6 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import
|
|
304
298
|
from airbyte_cdk.sources.declarative.partition_routers import (
|
305
299
|
CartesianProductStreamSlicer,
|
306
300
|
ListPartitionRouter,
|
307
|
-
PartitionRouter,
|
308
301
|
SinglePartitionRouter,
|
309
302
|
SubstreamPartitionRouter,
|
310
303
|
)
|
@@ -345,10 +338,6 @@ from airbyte_cdk.sources.declarative.requesters.request_options import (
|
|
345
338
|
)
|
346
339
|
from airbyte_cdk.sources.declarative.requesters.request_path import RequestPath
|
347
340
|
from airbyte_cdk.sources.declarative.requesters.requester import HttpMethod
|
348
|
-
from airbyte_cdk.sources.declarative.resolvers import (
|
349
|
-
ComponentMappingDefinition,
|
350
|
-
HttpComponentsResolver,
|
351
|
-
)
|
352
341
|
from airbyte_cdk.sources.declarative.retrievers import (
|
353
342
|
AsyncRetriever,
|
354
343
|
SimpleRetriever,
|
@@ -478,8 +467,6 @@ class ModelToComponentFactory:
|
|
478
467
|
WaitTimeFromHeaderModel: self.create_wait_time_from_header,
|
479
468
|
WaitUntilTimeFromHeaderModel: self.create_wait_until_time_from_header,
|
480
469
|
AsyncRetrieverModel: self.create_async_retriever,
|
481
|
-
HttpComponentsResolverModel: self.create_http_components_resolver,
|
482
|
-
ComponentMappingDefinitionModel: self.create_components_mapping_definition,
|
483
470
|
}
|
484
471
|
|
485
472
|
# Needed for the case where we need to perform a second parse on the fields of a custom component
|
@@ -1294,20 +1281,19 @@ class ModelToComponentFactory:
|
|
1294
1281
|
parameters=model.parameters or {},
|
1295
1282
|
)
|
1296
1283
|
|
1297
|
-
def
|
1298
|
-
self,
|
1299
|
-
|
1300
|
-
|
1301
|
-
) -> Optional[PartitionRouter]:
|
1284
|
+
def _merge_stream_slicers(
|
1285
|
+
self, model: DeclarativeStreamModel, config: Config
|
1286
|
+
) -> Optional[StreamSlicer]:
|
1287
|
+
stream_slicer = None
|
1302
1288
|
if (
|
1303
|
-
hasattr(model, "partition_router")
|
1304
|
-
and isinstance(model, SimpleRetrieverModel)
|
1305
|
-
and model.partition_router
|
1289
|
+
hasattr(model.retriever, "partition_router")
|
1290
|
+
and isinstance(model.retriever, SimpleRetrieverModel)
|
1291
|
+
and model.retriever.partition_router
|
1306
1292
|
):
|
1307
|
-
stream_slicer_model = model.partition_router
|
1293
|
+
stream_slicer_model = model.retriever.partition_router
|
1308
1294
|
|
1309
1295
|
if isinstance(stream_slicer_model, list):
|
1310
|
-
|
1296
|
+
stream_slicer = CartesianProductStreamSlicer(
|
1311
1297
|
[
|
1312
1298
|
self._create_component_from_model(model=slicer, config=config)
|
1313
1299
|
for slicer in stream_slicer_model
|
@@ -1315,24 +1301,9 @@ class ModelToComponentFactory:
|
|
1315
1301
|
parameters={},
|
1316
1302
|
)
|
1317
1303
|
else:
|
1318
|
-
|
1319
|
-
|
1320
|
-
|
1321
|
-
|
1322
|
-
def _build_resumable_cursor_from_paginator(
|
1323
|
-
self,
|
1324
|
-
model: Union[AsyncRetrieverModel, CustomRetrieverModel, SimpleRetrieverModel],
|
1325
|
-
stream_slicer: Optional[StreamSlicer],
|
1326
|
-
) -> Optional[StreamSlicer]:
|
1327
|
-
if hasattr(model, "paginator") and model.paginator and not stream_slicer:
|
1328
|
-
# For the regular Full-Refresh streams, we use the high level `ResumableFullRefreshCursor`
|
1329
|
-
return ResumableFullRefreshCursor(parameters={})
|
1330
|
-
return None
|
1331
|
-
|
1332
|
-
def _merge_stream_slicers(
|
1333
|
-
self, model: DeclarativeStreamModel, config: Config
|
1334
|
-
) -> Optional[StreamSlicer]:
|
1335
|
-
stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config)
|
1304
|
+
stream_slicer = self._create_component_from_model(
|
1305
|
+
model=stream_slicer_model, config=config
|
1306
|
+
)
|
1336
1307
|
|
1337
1308
|
if model.incremental_sync and stream_slicer:
|
1338
1309
|
incremental_sync_model = model.incremental_sync
|
@@ -1375,7 +1346,15 @@ class ModelToComponentFactory:
|
|
1375
1346
|
),
|
1376
1347
|
partition_router=stream_slicer,
|
1377
1348
|
)
|
1378
|
-
|
1349
|
+
elif (
|
1350
|
+
hasattr(model.retriever, "paginator")
|
1351
|
+
and model.retriever.paginator
|
1352
|
+
and not stream_slicer
|
1353
|
+
):
|
1354
|
+
# For the regular Full-Refresh streams, we use the high level `ResumableFullRefreshCursor`
|
1355
|
+
return ResumableFullRefreshCursor(parameters={})
|
1356
|
+
else:
|
1357
|
+
return None
|
1379
1358
|
|
1380
1359
|
def create_default_error_handler(
|
1381
1360
|
self, model: DefaultErrorHandlerModel, config: Config, **kwargs: Any
|
@@ -2239,56 +2218,3 @@ class ModelToComponentFactory:
|
|
2239
2218
|
|
2240
2219
|
def _evaluate_log_level(self, emit_connector_builder_messages: bool) -> Level:
|
2241
2220
|
return Level.DEBUG if emit_connector_builder_messages else Level.INFO
|
2242
|
-
|
2243
|
-
@staticmethod
|
2244
|
-
def create_components_mapping_definition(
|
2245
|
-
model: ComponentMappingDefinitionModel, config: Config, **kwargs: Any
|
2246
|
-
) -> ComponentMappingDefinition:
|
2247
|
-
interpolated_value = InterpolatedString.create(
|
2248
|
-
model.value, parameters=model.parameters or {}
|
2249
|
-
)
|
2250
|
-
field_path = [
|
2251
|
-
InterpolatedString.create(path, parameters=model.parameters or {})
|
2252
|
-
for path in model.field_path
|
2253
|
-
]
|
2254
|
-
return ComponentMappingDefinition(
|
2255
|
-
field_path=field_path, # type: ignore[arg-type] # field_path can be str and InterpolatedString
|
2256
|
-
value=interpolated_value,
|
2257
|
-
value_type=ModelToComponentFactory._json_schema_type_name_to_type(model.value_type),
|
2258
|
-
parameters=model.parameters or {},
|
2259
|
-
)
|
2260
|
-
|
2261
|
-
def create_http_components_resolver(
|
2262
|
-
self, model: HttpComponentsResolverModel, config: Config
|
2263
|
-
) -> Any:
|
2264
|
-
stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config)
|
2265
|
-
combined_slicers = self._build_resumable_cursor_from_paginator(
|
2266
|
-
model.retriever, stream_slicer
|
2267
|
-
)
|
2268
|
-
|
2269
|
-
retriever = self._create_component_from_model(
|
2270
|
-
model=model.retriever,
|
2271
|
-
config=config,
|
2272
|
-
name="",
|
2273
|
-
primary_key=None,
|
2274
|
-
stream_slicer=combined_slicers,
|
2275
|
-
transformations=[],
|
2276
|
-
)
|
2277
|
-
|
2278
|
-
components_mapping = [
|
2279
|
-
self._create_component_from_model(
|
2280
|
-
model=components_mapping_definition_model,
|
2281
|
-
value_type=ModelToComponentFactory._json_schema_type_name_to_type(
|
2282
|
-
components_mapping_definition_model.value_type
|
2283
|
-
),
|
2284
|
-
config=config,
|
2285
|
-
)
|
2286
|
-
for components_mapping_definition_model in model.components_mapping
|
2287
|
-
]
|
2288
|
-
|
2289
|
-
return HttpComponentsResolver(
|
2290
|
-
retriever=retriever,
|
2291
|
-
config=config,
|
2292
|
-
components_mapping=components_mapping,
|
2293
|
-
parameters=model.parameters or {},
|
2294
|
-
)
|
@@ -6,6 +6,5 @@ from airbyte_cdk.sources.declarative.partition_routers.cartesian_product_stream_
|
|
6
6
|
from airbyte_cdk.sources.declarative.partition_routers.list_partition_router import ListPartitionRouter
|
7
7
|
from airbyte_cdk.sources.declarative.partition_routers.single_partition_router import SinglePartitionRouter
|
8
8
|
from airbyte_cdk.sources.declarative.partition_routers.substream_partition_router import SubstreamPartitionRouter
|
9
|
-
from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter
|
10
9
|
|
11
|
-
__all__ = ["CartesianProductStreamSlicer", "ListPartitionRouter", "SinglePartitionRouter", "SubstreamPartitionRouter"
|
10
|
+
__all__ = ["CartesianProductStreamSlicer", "ListPartitionRouter", "SinglePartitionRouter", "SubstreamPartitionRouter"]
|
@@ -138,22 +138,12 @@ class HttpClient:
|
|
138
138
|
cache_dir = os.getenv(ENV_REQUEST_CACHE_PATH)
|
139
139
|
# Use in-memory cache if cache_dir is not set
|
140
140
|
# This is a non-obvious interface, but it ensures we don't write sql files when running unit tests
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
if cache_dir
|
146
|
-
else "file::memory:?cache=shared"
|
147
|
-
)
|
148
|
-
# By using `PRAGMA synchronous=OFF` and `PRAGMA journal_mode=WAL`, we reduce the possible occurrences of `database table is locked` errors.
|
149
|
-
# Note that those were blindly added at the same time and one or the other might be sufficient to prevent the issues but we have seen good results with both. Feel free to revisit given more information.
|
150
|
-
# There are strong signals that `fast_save` might create problems but if the sync crashes, we start back from the beginning in terms of sqlite anyway so the impact should be minimal. Signals are:
|
151
|
-
# * https://github.com/requests-cache/requests-cache/commit/7fa89ffda300331c37d8fad7f773348a3b5b0236#diff-f43db4a5edf931647c32dec28ea7557aae4cae8444af4b26c8ecbe88d8c925aaR238
|
152
|
-
# * https://github.com/requests-cache/requests-cache/commit/7fa89ffda300331c37d8fad7f773348a3b5b0236#diff-2e7f95b7d7be270ff1a8118f817ea3e6663cdad273592e536a116c24e6d23c18R164-R168
|
153
|
-
# * `If the application running SQLite crashes, the data will be safe, but the database [might become corrupted](https://www.sqlite.org/howtocorrupt.html#cfgerr) if the operating system crashes or the computer loses power before that data has been written to the disk surface.` in [this description](https://www.sqlite.org/pragma.html#pragma_synchronous).
|
154
|
-
backend = requests_cache.SQLiteCache(sqlite_path, fast_save=True, wal=True)
|
141
|
+
if cache_dir:
|
142
|
+
sqlite_path = str(Path(cache_dir) / self.cache_filename)
|
143
|
+
else:
|
144
|
+
sqlite_path = "file::memory:?cache=shared"
|
155
145
|
return CachedLimiterSession(
|
156
|
-
sqlite_path, backend=
|
146
|
+
sqlite_path, backend="sqlite", api_budget=self._api_budget, match_headers=True
|
157
147
|
)
|
158
148
|
else:
|
159
149
|
return LimiterSession(api_budget=self._api_budget)
|
@@ -0,0 +1,80 @@
|
|
1
|
+
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
|
2
|
+
|
3
|
+
|
4
|
+
import importlib.util
|
5
|
+
from pathlib import Path
|
6
|
+
from types import ModuleType
|
7
|
+
from typing import Optional
|
8
|
+
|
9
|
+
import pytest
|
10
|
+
|
11
|
+
# The following fixtures are used to load a manifest-only connector's components module and manifest file.
|
12
|
+
# They can be accessed from any test file in the connector's unit_tests directory by importing them as follows:
|
13
|
+
|
14
|
+
# from airbyte_cdk.test.utils.manifest_only_fixtures import components_module, connector_dir, manifest_path
|
15
|
+
|
16
|
+
# individual components can then be referenced as: components_module.<CustomComponentClass>
|
17
|
+
|
18
|
+
|
19
|
+
import os
|
20
|
+
from typing import Any, Optional
|
21
|
+
import pytest
|
22
|
+
from pathlib import Path
|
23
|
+
import importlib.util
|
24
|
+
from types import ModuleType
|
25
|
+
|
26
|
+
|
27
|
+
@pytest.fixture(scope="session")
|
28
|
+
def connector_dir(request: pytest.FixtureRequest) -> Path:
|
29
|
+
"""Return the connector's root directory."""
|
30
|
+
print("\n=== CDK Path Resolution Debug ===")
|
31
|
+
print(f"Config root path: {request.config.rootpath}")
|
32
|
+
print(f"Invocation dir: {request.config.invocation_params.dir}")
|
33
|
+
print(f"Current working dir: {os.getcwd()}")
|
34
|
+
print(f"Test file dir: {getattr(request.module, '__file__', 'No file attribute')}")
|
35
|
+
print(f"Environment variables: {dict(os.environ)}")
|
36
|
+
print(f"Directory contents: {os.listdir(os.getcwd())}")
|
37
|
+
print("==============================\n")
|
38
|
+
|
39
|
+
path = Path(request.config.invocation_params.dir)
|
40
|
+
resolved_path = path.parent
|
41
|
+
print(f"Resolved connector dir: {resolved_path}")
|
42
|
+
print(f"Resolved dir contents: {os.listdir(resolved_path) if resolved_path.exists() else 'Directory not found'}")
|
43
|
+
|
44
|
+
return resolved_path
|
45
|
+
|
46
|
+
|
47
|
+
@pytest.fixture(scope="session")
|
48
|
+
def components_module(connector_dir: Path) -> Optional[ModuleType]:
|
49
|
+
print("\n=== Components Module Debug ===")
|
50
|
+
components_path = connector_dir / "components.py"
|
51
|
+
print(f"Looking for components.py at: {components_path}")
|
52
|
+
print(f"File exists: {components_path.exists()}")
|
53
|
+
|
54
|
+
if not components_path.exists():
|
55
|
+
print("components.py not found")
|
56
|
+
return None
|
57
|
+
|
58
|
+
spec = importlib.util.spec_from_file_location("components", components_path)
|
59
|
+
print(f"Import spec created: {spec is not None}")
|
60
|
+
|
61
|
+
if spec is None:
|
62
|
+
return None
|
63
|
+
|
64
|
+
module = importlib.util.module_from_spec(spec)
|
65
|
+
print(f"Module created: {module is not None}")
|
66
|
+
|
67
|
+
if spec.loader is None:
|
68
|
+
return None
|
69
|
+
|
70
|
+
spec.loader.exec_module(module)
|
71
|
+
print("Module loaded successfully")
|
72
|
+
print("===========================\n")
|
73
|
+
|
74
|
+
return module
|
75
|
+
|
76
|
+
|
77
|
+
@pytest.fixture(scope="session")
|
78
|
+
def manifest_path(connector_dir: Path) -> Path:
|
79
|
+
"""Return the path to the connector's manifest file."""
|
80
|
+
return connector_dir / "manifest.yaml"
|
@@ -0,0 +1,306 @@
|
|
1
|
+
Metadata-Version: 2.1
|
2
|
+
Name: airbyte-cdk
|
3
|
+
Version: 6.9.1.dev1
|
4
|
+
Summary: A framework for writing Airbyte Connectors.
|
5
|
+
Home-page: https://airbyte.com
|
6
|
+
License: MIT
|
7
|
+
Keywords: airbyte,connector-development-kit,cdk
|
8
|
+
Author: Airbyte
|
9
|
+
Author-email: contact@airbyte.io
|
10
|
+
Requires-Python: >=3.10,<3.13
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
12
|
+
Classifier: Intended Audience :: Developers
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
18
|
+
Classifier: Topic :: Scientific/Engineering
|
19
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
20
|
+
Provides-Extra: file-based
|
21
|
+
Provides-Extra: sphinx-docs
|
22
|
+
Provides-Extra: sql
|
23
|
+
Provides-Extra: vector-db-based
|
24
|
+
Requires-Dist: Jinja2 (>=3.1.2,<3.2.0)
|
25
|
+
Requires-Dist: PyYAML (>=6.0.1,<7.0.0)
|
26
|
+
Requires-Dist: Sphinx (>=4.2,<4.3) ; extra == "sphinx-docs"
|
27
|
+
Requires-Dist: airbyte-protocol-models-dataclasses (>=0.14,<0.15)
|
28
|
+
Requires-Dist: avro (>=1.11.2,<1.12.0) ; extra == "file-based"
|
29
|
+
Requires-Dist: backoff
|
30
|
+
Requires-Dist: cachetools
|
31
|
+
Requires-Dist: cohere (==4.21) ; extra == "vector-db-based"
|
32
|
+
Requires-Dist: cryptography (>=42.0.5,<44.0.0)
|
33
|
+
Requires-Dist: dpath (>=2.1.6,<3.0.0)
|
34
|
+
Requires-Dist: dunamai (>=1.22.0,<2.0.0)
|
35
|
+
Requires-Dist: fastavro (>=1.8.0,<1.9.0) ; extra == "file-based"
|
36
|
+
Requires-Dist: genson (==1.3.0)
|
37
|
+
Requires-Dist: isodate (>=0.6.1,<0.7.0)
|
38
|
+
Requires-Dist: jsonref (>=0.2,<0.3)
|
39
|
+
Requires-Dist: jsonschema (>=4.17.3,<4.18.0)
|
40
|
+
Requires-Dist: langchain (==0.1.16) ; extra == "vector-db-based"
|
41
|
+
Requires-Dist: langchain_core (==0.1.42)
|
42
|
+
Requires-Dist: markdown ; extra == "file-based"
|
43
|
+
Requires-Dist: nltk (==3.9.1)
|
44
|
+
Requires-Dist: numpy (<2)
|
45
|
+
Requires-Dist: openai[embeddings] (==0.27.9) ; extra == "vector-db-based"
|
46
|
+
Requires-Dist: orjson (>=3.10.7,<4.0.0)
|
47
|
+
Requires-Dist: pandas (==2.2.2)
|
48
|
+
Requires-Dist: pdf2image (==1.16.3) ; extra == "file-based"
|
49
|
+
Requires-Dist: pdfminer.six (==20221105) ; extra == "file-based"
|
50
|
+
Requires-Dist: pendulum (<3.0.0)
|
51
|
+
Requires-Dist: psutil (==6.1.0)
|
52
|
+
Requires-Dist: pyarrow (>=15.0.0,<15.1.0) ; extra == "file-based"
|
53
|
+
Requires-Dist: pydantic (>=2.7,<3.0)
|
54
|
+
Requires-Dist: pyjwt (>=2.8.0,<3.0.0)
|
55
|
+
Requires-Dist: pyrate-limiter (>=3.1.0,<3.2.0)
|
56
|
+
Requires-Dist: pytesseract (==0.3.10) ; extra == "file-based"
|
57
|
+
Requires-Dist: python-calamine (==0.2.3) ; extra == "file-based"
|
58
|
+
Requires-Dist: python-dateutil
|
59
|
+
Requires-Dist: python-snappy (==0.7.3) ; extra == "file-based"
|
60
|
+
Requires-Dist: python-ulid (>=3.0.0,<4.0.0)
|
61
|
+
Requires-Dist: pytz (==2024.1)
|
62
|
+
Requires-Dist: rapidfuzz (>=3.10.1,<4.0.0)
|
63
|
+
Requires-Dist: requests
|
64
|
+
Requires-Dist: requests_cache
|
65
|
+
Requires-Dist: serpyco-rs (>=1.10.2,<2.0.0)
|
66
|
+
Requires-Dist: sphinx-rtd-theme (>=1.0,<1.1) ; extra == "sphinx-docs"
|
67
|
+
Requires-Dist: sqlalchemy (>=2.0,<3.0,!=2.0.36) ; extra == "sql"
|
68
|
+
Requires-Dist: tiktoken (==0.8.0) ; extra == "vector-db-based"
|
69
|
+
Requires-Dist: unstructured.pytesseract (>=0.3.12) ; extra == "file-based"
|
70
|
+
Requires-Dist: unstructured[docx,pptx] (==0.10.27) ; extra == "file-based"
|
71
|
+
Requires-Dist: wcmatch (==10.0)
|
72
|
+
Requires-Dist: xmltodict (>=0.13.0,<0.14.0)
|
73
|
+
Project-URL: Documentation, https://docs.airbyte.io/
|
74
|
+
Project-URL: Repository, https://github.com/airbytehq/airbyte-python-cdk
|
75
|
+
Description-Content-Type: text/markdown
|
76
|
+
|
77
|
+
# Airbyte Python CDK and Low-Code CDK
|
78
|
+
|
79
|
+
Airbyte Python CDK is a framework for building Airbyte API Source Connectors. It provides a set of
|
80
|
+
classes and helpers that make it easy to build a connector against an HTTP API (REST, GraphQL, etc),
|
81
|
+
or a generic Python source connector.
|
82
|
+
|
83
|
+
## Usage
|
84
|
+
|
85
|
+
If you're looking to build a connector, we highly recommend that you
|
86
|
+
[start with the Connector Builder](https://docs.airbyte.com/connector-development/connector-builder-ui/overview).
|
87
|
+
It should be enough for 90% of connectors out there. For more flexible and complex connectors, use the
|
88
|
+
[low-code CDK and `SourceDeclarativeManifest`](https://docs.airbyte.com/connector-development/config-based/low-code-cdk-overview).
|
89
|
+
|
90
|
+
If that doesn't work, then consider building on top of the
|
91
|
+
[lower-level Python CDK itself](https://docs.airbyte.com/connector-development/cdk-python/).
|
92
|
+
|
93
|
+
### Quick Start
|
94
|
+
|
95
|
+
To get started on a Python CDK based connector or a low-code connector, you can generate a connector
|
96
|
+
project from a template:
|
97
|
+
|
98
|
+
```bash
|
99
|
+
# from the repo root
|
100
|
+
cd airbyte-integrations/connector-templates/generator
|
101
|
+
./generate.sh
|
102
|
+
```
|
103
|
+
|
104
|
+
### Example Connectors
|
105
|
+
|
106
|
+
**HTTP Connectors**:
|
107
|
+
|
108
|
+
- [Stripe](https://github.com/airbytehq/airbyte/blob/master/airbyte-integrations/connectors/source-stripe/)
|
109
|
+
- [Salesforce](https://github.com/airbytehq/airbyte/blob/master/airbyte-integrations/connectors/source-salesforce/)
|
110
|
+
|
111
|
+
**Python connectors using the bare-bones `Source` abstraction**:
|
112
|
+
|
113
|
+
- [Google Sheets](https://github.com/airbytehq/airbyte/blob/master/airbyte-integrations/connectors/source-google-sheets/google_sheets_source/google_sheets_source.py)
|
114
|
+
|
115
|
+
This will generate a project with a type and a name of your choice and put it in
|
116
|
+
`airbyte-integrations/connectors`. Open the directory with your connector in an editor and follow
|
117
|
+
the `TODO` items.
|
118
|
+
|
119
|
+
## Python CDK Overview
|
120
|
+
|
121
|
+
Airbyte CDK code is within `airbyte_cdk` directory. Here's a high level overview of what's inside:
|
122
|
+
|
123
|
+
- `connector_builder`. Internal wrapper that helps the Connector Builder platform run a declarative
|
124
|
+
manifest (low-code connector). You should not use this code directly. If you need to run a
|
125
|
+
`SourceDeclarativeManifest`, take a look at
|
126
|
+
[`source-declarative-manifest`](https://github.com/airbytehq/airbyte/tree/master/airbyte-integrations/connectors/source-declarative-manifest)
|
127
|
+
connector implementation instead.
|
128
|
+
- `destinations`. Basic Destination connector support! If you're building a Destination connector in
|
129
|
+
Python, try that. Some of our vector DB destinations like `destination-pinecone` are using that
|
130
|
+
code.
|
131
|
+
- `models` expose `airbyte_protocol.models` as a part of `airbyte_cdk` package.
|
132
|
+
- `sources/concurrent_source` is the Concurrent CDK implementation. It supports reading data from
|
133
|
+
streams concurrently per slice / partition, useful for connectors with high throughput and high
|
134
|
+
number of records.
|
135
|
+
- `sources/declarative` is the low-code CDK. It works on top of Airbyte Python CDK, but provides a
|
136
|
+
declarative manifest language to define streams, operations, etc. This makes it easier to build
|
137
|
+
connectors without writing Python code.
|
138
|
+
- `sources/file_based` is the CDK for file-based sources. Examples include S3, Azure, GCS, etc.
|
139
|
+
|
140
|
+
## Contributing
|
141
|
+
|
142
|
+
Thank you for being interested in contributing to Airbyte Python CDK! Here are some guidelines to
|
143
|
+
get you started:
|
144
|
+
|
145
|
+
- We adhere to the [code of conduct](/CODE_OF_CONDUCT.md).
|
146
|
+
- You can contribute by reporting bugs, posting github discussions, opening issues, improving
|
147
|
+
[documentation](/docs/), and submitting pull requests with bugfixes and new features alike.
|
148
|
+
- If you're changing the code, please add unit tests for your change.
|
149
|
+
- When submitting issues or PRs, please add a small reproduction project. Using the changes in your
|
150
|
+
connector and providing that connector code as an example (or a satellite PR) helps!
|
151
|
+
|
152
|
+
### First time setup
|
153
|
+
|
154
|
+
Install the project dependencies and development tools:
|
155
|
+
|
156
|
+
```bash
|
157
|
+
poetry install --all-extras
|
158
|
+
```
|
159
|
+
|
160
|
+
Installing all extras is required to run the full suite of unit tests.
|
161
|
+
|
162
|
+
#### Running tests locally
|
163
|
+
|
164
|
+
- Iterate on the CDK code locally
|
165
|
+
- Run tests via `poetry run poe unit-test-with-cov`, or `python -m pytest -s unit_tests` if you want
|
166
|
+
to pass pytest options.
|
167
|
+
- Run `poetry run poe check-local` to lint all code, type-check modified code, and run unit tests
|
168
|
+
with coverage in one command.
|
169
|
+
|
170
|
+
To see all available scripts, run `poetry run poe`.
|
171
|
+
|
172
|
+
#### Formatting the code
|
173
|
+
|
174
|
+
- Iterate on the CDK code locally
|
175
|
+
- Run `poetry run ruff format` to format your changes.
|
176
|
+
|
177
|
+
To see all available `ruff` options, run `poetry run ruff`.
|
178
|
+
|
179
|
+
##### Autogenerated files
|
180
|
+
|
181
|
+
Low-code CDK models are generated from `sources/declarative/declarative_component_schema.yaml`. If
|
182
|
+
the iteration you are working on includes changes to the models or the connector generator, you
|
183
|
+
might want to regenerate them. In order to do that, you can run:
|
184
|
+
|
185
|
+
```bash
|
186
|
+
poetry run poe build
|
187
|
+
```
|
188
|
+
|
189
|
+
This will generate the code generator docker image and the component manifest files based on the
|
190
|
+
schemas and templates.
|
191
|
+
|
192
|
+
#### Testing
|
193
|
+
|
194
|
+
All tests are located in the `unit_tests` directory. Run `poetry run poe unit-test-with-cov` to run
|
195
|
+
them. This also presents a test coverage report. For faster iteration with no coverage report and
|
196
|
+
more options, `python -m pytest -s unit_tests` is a good place to start.
|
197
|
+
|
198
|
+
#### Building and testing a connector with your local CDK
|
199
|
+
|
200
|
+
When developing a new feature in the CDK, you may find it helpful to run a connector that uses that
|
201
|
+
new feature. You can test this in one of two ways:
|
202
|
+
|
203
|
+
- Running a connector locally
|
204
|
+
- Building and running a source via Docker
|
205
|
+
|
206
|
+
##### Installing your local CDK into a local Python connector
|
207
|
+
|
208
|
+
Open the connector's `pyproject.toml` file and replace the line with `airbyte_cdk` with the
|
209
|
+
following:
|
210
|
+
|
211
|
+
```toml
|
212
|
+
airbyte_cdk = { path = "../../../airbyte-cdk/python/airbyte_cdk", develop = true }
|
213
|
+
```
|
214
|
+
|
215
|
+
Then, running `poetry update` should reinstall `airbyte_cdk` from your local working directory.
|
216
|
+
|
217
|
+
##### Building a Python connector in Docker with your local CDK installed
|
218
|
+
|
219
|
+
_Pre-requisite: Install the
|
220
|
+
[`airbyte-ci` CLI](https://github.com/airbytehq/airbyte/blob/master/airbyte-ci/connectors/pipelines/README.md)_
|
221
|
+
|
222
|
+
You can build your connector image with the local CDK using
|
223
|
+
|
224
|
+
```bash
|
225
|
+
# from the airbytehq/airbyte base directory
|
226
|
+
airbyte-ci connectors --use-local-cdk --name=<CONNECTOR> build
|
227
|
+
```
|
228
|
+
|
229
|
+
Note that the local CDK is injected at build time, so if you make changes, you will have to run the
|
230
|
+
build command again to see them reflected.
|
231
|
+
|
232
|
+
##### Running Connector Acceptance Tests for a single connector in Docker with your local CDK installed
|
233
|
+
|
234
|
+
_Pre-requisite: Install the
|
235
|
+
[`airbyte-ci` CLI](https://github.com/airbytehq/airbyte/blob/master/airbyte-ci/connectors/pipelines/README.md)_
|
236
|
+
|
237
|
+
To run acceptance tests for a single connector using the local CDK, from the connector directory,
|
238
|
+
run
|
239
|
+
|
240
|
+
```bash
|
241
|
+
airbyte-ci connectors --use-local-cdk --name=<CONNECTOR> test
|
242
|
+
```
|
243
|
+
|
244
|
+
#### When you don't have access to the API
|
245
|
+
|
246
|
+
There may be a time when you do not have access to the API (either because you don't have the
|
247
|
+
credentials, network access, etc.). You will probably still want to do end-to-end testing at least
|
248
|
+
once. In order to do so, you can emulate the server you would be reaching using a server stubbing
|
249
|
+
tool.
|
250
|
+
|
251
|
+
For example, using [mockserver](https://www.mock-server.com/), you can set up an expectation file
|
252
|
+
like this:
|
253
|
+
|
254
|
+
```json
|
255
|
+
{
|
256
|
+
"httpRequest": {
|
257
|
+
"method": "GET",
|
258
|
+
"path": "/data"
|
259
|
+
},
|
260
|
+
"httpResponse": {
|
261
|
+
"body": "{\"data\": [{\"record_key\": 1}, {\"record_key\": 2}]}"
|
262
|
+
}
|
263
|
+
}
|
264
|
+
```
|
265
|
+
|
266
|
+
Assuming this file has been created at `secrets/mock_server_config/expectations.json`, running the
|
267
|
+
following command will match any request on path `/data` and return the response defined in
|
268
|
+
the expectation file:
|
269
|
+
|
270
|
+
```bash
|
271
|
+
docker run -d --rm -v $(pwd)/secrets/mock_server_config:/config -p 8113:8113 --env MOCKSERVER_LOG_LEVEL=TRACE --env MOCKSERVER_SERVER_PORT=8113 --env MOCKSERVER_WATCH_INITIALIZATION_JSON=true --env MOCKSERVER_PERSISTED_EXPECTATIONS_PATH=/config/expectations.json --env MOCKSERVER_INITIALIZATION_JSON_PATH=/config/expectations.json mockserver/mockserver:5.15.0
|
272
|
+
```
|
273
|
+
|
274
|
+
HTTP requests to `localhost:8113/data` should now return the body defined in the expectations file.
|
275
|
+
To test this, the implementer either has to change the code which defines the base URL for Python
|
276
|
+
source or update the `url_base` from low-code. With the Connector Builder running in docker, you
|
277
|
+
will have to use domain `host.docker.internal` instead of `localhost` as the requests are executed
|
278
|
+
within docker.
|
279
|
+
|
280
|
+
#### Publishing a new version to PyPI
|
281
|
+
|
282
|
+
Python CDK has a
|
283
|
+
[GitHub workflow](https://github.com/airbytehq/airbyte/actions/workflows/publish-cdk-command-manually.yml)
|
284
|
+
that manages the CDK changelog, making a new release for `airbyte_cdk`, publishing it to PyPI, and
|
285
|
+
then making a commit to update (and subsequently auto-release)
|
286
|
+
[`source-declarative-manifest`](https://github.com/airbytehq/airbyte/tree/master/airbyte-integrations/connectors/source-declarative-manifest)
|
287
|
+
and Connector Builder (in the platform repository).
|
288
|
+
|
289
|
+
> [!Note]: The workflow will handle the `CHANGELOG.md` entry for you. You should not add changelog
|
290
|
+
> lines in your PRs to the CDK itself.
|
291
|
+
|
292
|
+
> [!Warning]: The workflow bumps the version on its own; please don't change the CDK version in
|
293
|
+
> `pyproject.toml` manually.
|
294
|
+
|
295
|
+
1. You only trigger the release workflow once all the PRs that you want to be included are already
|
296
|
+
merged into the `master` branch.
|
297
|
+
2. Run the
|
298
|
+
[`Publish CDK Manually`](https://github.com/airbytehq/airbyte/actions/workflows/publish-cdk-command-manually.yml)
|
299
|
+
workflow from master using `release-type=major|minor|patch` and setting the changelog message.
|
300
|
+
3. When the workflow runs, it will commit a new version directly to master branch.
|
301
|
+
4. The workflow will bump the version of `source-declarative-manifest` according to the
|
302
|
+
`release-type` of the CDK, then commit these changes back to master. The commit to master will
|
303
|
+
kick off a publish of the new version of `source-declarative-manifest`.
|
304
|
+
5. The workflow will also add a pull request to `airbyte-platform-internal` repo to bump the
|
305
|
+
dependency in Connector Builder.
|
306
|
+
|