dagster-airbyte 0.26.18__py3-none-any.whl → 0.28.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dagster_airbyte/__init__.py +12 -3
- dagster_airbyte/asset_decorator.py +5 -4
- dagster_airbyte/asset_defs.py +78 -31
- dagster_airbyte/components/__init__.py +0 -0
- dagster_airbyte/components/workspace_component/__init__.py +0 -0
- dagster_airbyte/components/workspace_component/component.py +433 -0
- dagster_airbyte/components/workspace_component/scaffolder.py +30 -0
- dagster_airbyte/legacy_resources.py +826 -0
- dagster_airbyte/managed/reconciliation.py +9 -7
- dagster_airbyte/ops.py +2 -1
- dagster_airbyte/resources.py +591 -937
- dagster_airbyte/translator.py +20 -1
- dagster_airbyte/utils.py +1 -1
- dagster_airbyte/version.py +1 -1
- {dagster_airbyte-0.26.18.dist-info → dagster_airbyte-0.28.3.dist-info}/METADATA +5 -5
- dagster_airbyte-0.28.3.dist-info/RECORD +28 -0
- {dagster_airbyte-0.26.18.dist-info → dagster_airbyte-0.28.3.dist-info}/entry_points.txt +3 -0
- dagster_airbyte-0.26.18.dist-info/RECORD +0 -23
- {dagster_airbyte-0.26.18.dist-info → dagster_airbyte-0.28.3.dist-info}/WHEEL +0 -0
- {dagster_airbyte-0.26.18.dist-info → dagster_airbyte-0.28.3.dist-info}/licenses/LICENSE +0 -0
- {dagster_airbyte-0.26.18.dist-info → dagster_airbyte-0.28.3.dist-info}/top_level.txt +0 -0
dagster_airbyte/resources.py
CHANGED
|
@@ -1,37 +1,30 @@
|
|
|
1
|
-
import hashlib
|
|
2
|
-
import json
|
|
3
1
|
import logging
|
|
4
|
-
import sys
|
|
5
2
|
import time
|
|
6
|
-
from abc import
|
|
7
|
-
from collections.abc import Mapping, Sequence
|
|
3
|
+
from collections.abc import Callable, Iterator, Mapping, Sequence
|
|
8
4
|
from contextlib import contextmanager
|
|
9
5
|
from datetime import datetime, timedelta
|
|
10
|
-
from typing import Any, Optional,
|
|
6
|
+
from typing import Any, ClassVar, Optional, Union
|
|
7
|
+
from urllib.parse import parse_qsl, urlparse
|
|
11
8
|
|
|
12
9
|
import requests
|
|
13
10
|
from dagster import (
|
|
14
11
|
AssetExecutionContext,
|
|
15
12
|
AssetMaterialization,
|
|
13
|
+
AssetSpec,
|
|
16
14
|
ConfigurableResource,
|
|
17
15
|
Definitions,
|
|
18
16
|
Failure,
|
|
19
|
-
InitResourceContext,
|
|
20
17
|
MaterializeResult,
|
|
21
18
|
_check as check,
|
|
22
19
|
get_dagster_logger,
|
|
23
|
-
resource,
|
|
24
20
|
)
|
|
25
|
-
from dagster._annotations import
|
|
26
|
-
from dagster._config.pythonic_config import infer_schema_from_config_class
|
|
27
|
-
from dagster._core.definitions.asset_spec import AssetSpec
|
|
21
|
+
from dagster._annotations import superseded
|
|
28
22
|
from dagster._core.definitions.definitions_load_context import StateBackedDefinitionsLoader
|
|
29
|
-
from dagster.
|
|
30
|
-
from dagster._record import record
|
|
31
|
-
from dagster._utils.cached_method import cached_method
|
|
32
|
-
from dagster._utils.merger import deep_merge_dicts
|
|
23
|
+
from dagster._symbol_annotations import beta, public
|
|
33
24
|
from dagster_shared.dagster_model import DagsterModel
|
|
34
|
-
from
|
|
25
|
+
from dagster_shared.record import record
|
|
26
|
+
from dagster_shared.utils.cached_method import cached_method
|
|
27
|
+
from pydantic import Field, PrivateAttr, model_validator
|
|
35
28
|
from requests.exceptions import RequestException
|
|
36
29
|
|
|
37
30
|
from dagster_airbyte.translator import (
|
|
@@ -51,810 +44,55 @@ from dagster_airbyte.utils import (
|
|
|
51
44
|
get_translator_from_airbyte_assets,
|
|
52
45
|
)
|
|
53
46
|
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
47
|
+
AIRBYTE_CLOUD_REST_API_BASE = "https://api.airbyte.com"
|
|
48
|
+
AIRBYTE_CLOUD_REST_API_VERSION = "v1"
|
|
49
|
+
AIRBYTE_CLOUD_REST_API_BASE_URL = f"{AIRBYTE_CLOUD_REST_API_BASE}/{AIRBYTE_CLOUD_REST_API_VERSION}"
|
|
50
|
+
AIRBYTE_CLOUD_CONFIGURATION_API_BASE = "https://cloud.airbyte.com/api"
|
|
51
|
+
AIRBYTE_CLOUD_CONFIGURATION_API_VERSION = "v1"
|
|
52
|
+
AIRBYTE_CLOUD_CONFIGURATION_API_BASE_URL = (
|
|
53
|
+
f"{AIRBYTE_CLOUD_CONFIGURATION_API_BASE}/{AIRBYTE_CLOUD_CONFIGURATION_API_VERSION}"
|
|
54
|
+
)
|
|
59
55
|
|
|
60
56
|
DEFAULT_POLL_INTERVAL_SECONDS = 10
|
|
61
57
|
|
|
62
58
|
# The access token expires every 3 minutes in Airbyte Cloud.
|
|
63
59
|
# Refresh is needed after 2.5 minutes to avoid the "token expired" error message.
|
|
64
|
-
|
|
60
|
+
AIRBYTE_REFRESH_TIMEDELTA_SECONDS = 150
|
|
65
61
|
|
|
66
62
|
AIRBYTE_RECONSTRUCTION_METADATA_KEY_PREFIX = "dagster-airbyte/reconstruction_metadata"
|
|
67
63
|
|
|
68
64
|
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
# Int in case we nest contexts
|
|
73
|
-
self.cache_enabled = 0
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
class BaseAirbyteResource(ConfigurableResource):
|
|
77
|
-
request_max_retries: int = Field(
|
|
78
|
-
default=3,
|
|
79
|
-
description=(
|
|
80
|
-
"The maximum number of times requests to the Airbyte API should be retried "
|
|
81
|
-
"before failing."
|
|
82
|
-
),
|
|
83
|
-
)
|
|
84
|
-
request_retry_delay: float = Field(
|
|
85
|
-
default=0.25,
|
|
86
|
-
description="Time (in seconds) to wait between each request retry.",
|
|
87
|
-
)
|
|
88
|
-
request_timeout: int = Field(
|
|
89
|
-
default=15,
|
|
90
|
-
description="Time (in seconds) after which the requests to Airbyte are declared timed out.",
|
|
91
|
-
)
|
|
92
|
-
cancel_sync_on_run_termination: bool = Field(
|
|
93
|
-
default=True,
|
|
94
|
-
description=(
|
|
95
|
-
"Whether to cancel a sync in Airbyte if the Dagster runner is terminated. This may"
|
|
96
|
-
" be useful to disable if using Airbyte sources that cannot be cancelled and"
|
|
97
|
-
" resumed easily, or if your Dagster deployment may experience runner interruptions"
|
|
98
|
-
" that do not impact your Airbyte deployment."
|
|
99
|
-
),
|
|
100
|
-
)
|
|
101
|
-
poll_interval: float = Field(
|
|
102
|
-
default=DEFAULT_POLL_INTERVAL_SECONDS,
|
|
103
|
-
description="Time (in seconds) to wait between checking a sync's status.",
|
|
104
|
-
)
|
|
105
|
-
|
|
106
|
-
@classmethod
|
|
107
|
-
def _is_dagster_maintained(cls) -> bool:
|
|
108
|
-
return True
|
|
109
|
-
|
|
110
|
-
@property
|
|
111
|
-
@cached_method
|
|
112
|
-
def _log(self) -> logging.Logger:
|
|
113
|
-
return get_dagster_logger()
|
|
114
|
-
|
|
115
|
-
@property
|
|
116
|
-
@abstractmethod
|
|
117
|
-
def api_base_url(self) -> str:
|
|
118
|
-
raise NotImplementedError()
|
|
119
|
-
|
|
120
|
-
@property
|
|
121
|
-
@abstractmethod
|
|
122
|
-
def all_additional_request_params(self) -> Mapping[str, Any]:
|
|
123
|
-
raise NotImplementedError()
|
|
124
|
-
|
|
125
|
-
def make_request(
|
|
126
|
-
self,
|
|
127
|
-
endpoint: str,
|
|
128
|
-
data: Optional[Mapping[str, object]] = None,
|
|
129
|
-
method: str = "POST",
|
|
130
|
-
include_additional_request_params: bool = True,
|
|
131
|
-
) -> Optional[Mapping[str, object]]:
|
|
132
|
-
"""Creates and sends a request to the desired Airbyte REST API endpoint.
|
|
133
|
-
|
|
134
|
-
Args:
|
|
135
|
-
endpoint (str): The Airbyte API endpoint to send this request to.
|
|
136
|
-
data (Optional[str]): JSON-formatted data string to be included in the request.
|
|
137
|
-
|
|
138
|
-
Returns:
|
|
139
|
-
Optional[Dict[str, Any]]: Parsed json data from the response to this request
|
|
140
|
-
"""
|
|
141
|
-
url = self.api_base_url + endpoint
|
|
142
|
-
headers = {"accept": "application/json"}
|
|
143
|
-
|
|
144
|
-
num_retries = 0
|
|
145
|
-
while True:
|
|
146
|
-
try:
|
|
147
|
-
request_args: dict[str, Any] = dict(
|
|
148
|
-
method=method,
|
|
149
|
-
url=url,
|
|
150
|
-
headers=headers,
|
|
151
|
-
timeout=self.request_timeout,
|
|
152
|
-
)
|
|
153
|
-
if data:
|
|
154
|
-
request_args["json"] = data
|
|
155
|
-
|
|
156
|
-
if include_additional_request_params:
|
|
157
|
-
request_args = deep_merge_dicts(
|
|
158
|
-
request_args,
|
|
159
|
-
self.all_additional_request_params,
|
|
160
|
-
)
|
|
161
|
-
|
|
162
|
-
response = requests.request(
|
|
163
|
-
**request_args,
|
|
164
|
-
)
|
|
165
|
-
response.raise_for_status()
|
|
166
|
-
if response.status_code == 204:
|
|
167
|
-
return None
|
|
168
|
-
return response.json()
|
|
169
|
-
except RequestException as e:
|
|
170
|
-
self._log.error("Request to Airbyte API failed: %s", e)
|
|
171
|
-
if num_retries == self.request_max_retries:
|
|
172
|
-
break
|
|
173
|
-
num_retries += 1
|
|
174
|
-
time.sleep(self.request_retry_delay)
|
|
175
|
-
|
|
176
|
-
raise Failure(f"Max retries ({self.request_max_retries}) exceeded with url: {url}.")
|
|
177
|
-
|
|
178
|
-
@abstractmethod
|
|
179
|
-
def start_sync(self, connection_id: str) -> Mapping[str, object]:
|
|
180
|
-
raise NotImplementedError()
|
|
181
|
-
|
|
182
|
-
@abstractmethod
|
|
183
|
-
def get_connection_details(self, connection_id: str) -> Mapping[str, object]:
|
|
184
|
-
raise NotImplementedError()
|
|
185
|
-
|
|
186
|
-
@abstractmethod
|
|
187
|
-
def get_job_status(self, connection_id: str, job_id: int) -> Mapping[str, object]:
|
|
188
|
-
raise NotImplementedError()
|
|
189
|
-
|
|
190
|
-
@abstractmethod
|
|
191
|
-
def cancel_job(self, job_id: int):
|
|
192
|
-
raise NotImplementedError()
|
|
193
|
-
|
|
194
|
-
@property
|
|
195
|
-
@abstractmethod
|
|
196
|
-
def _should_forward_logs(self) -> bool:
|
|
197
|
-
raise NotImplementedError()
|
|
198
|
-
|
|
199
|
-
def sync_and_poll(
|
|
200
|
-
self,
|
|
201
|
-
connection_id: str,
|
|
202
|
-
poll_interval: Optional[float] = None,
|
|
203
|
-
poll_timeout: Optional[float] = None,
|
|
204
|
-
) -> AirbyteOutput:
|
|
205
|
-
"""Initializes a sync operation for the given connector, and polls until it completes.
|
|
206
|
-
|
|
207
|
-
Args:
|
|
208
|
-
connection_id (str): The Airbyte Connector ID. You can retrieve this value from the
|
|
209
|
-
"Connection" tab of a given connection in the Arbyte UI.
|
|
210
|
-
poll_interval (float): The time (in seconds) that will be waited between successive polls.
|
|
211
|
-
poll_timeout (float): The maximum time that will waited before this operation is timed
|
|
212
|
-
out. By default, this will never time out.
|
|
213
|
-
|
|
214
|
-
Returns:
|
|
215
|
-
:py:class:`~AirbyteOutput`:
|
|
216
|
-
Details of the sync job.
|
|
217
|
-
"""
|
|
218
|
-
connection_details = self.get_connection_details(connection_id)
|
|
219
|
-
job_details = self.start_sync(connection_id)
|
|
220
|
-
job_info = cast("dict[str, object]", job_details.get("job", {}))
|
|
221
|
-
job_id = cast("int", job_info.get("id"))
|
|
222
|
-
|
|
223
|
-
self._log.info(f"Job {job_id} initialized for connection_id={connection_id}.")
|
|
224
|
-
start = time.monotonic()
|
|
225
|
-
logged_attempts = 0
|
|
226
|
-
logged_lines = 0
|
|
227
|
-
state = None
|
|
228
|
-
|
|
229
|
-
try:
|
|
230
|
-
while True:
|
|
231
|
-
if poll_timeout and start + poll_timeout < time.monotonic():
|
|
232
|
-
raise Failure(
|
|
233
|
-
f"Timeout: Airbyte job {job_id} is not ready after the timeout"
|
|
234
|
-
f" {poll_timeout} seconds"
|
|
235
|
-
)
|
|
236
|
-
time.sleep(poll_interval or self.poll_interval)
|
|
237
|
-
job_details = self.get_job_status(connection_id, job_id)
|
|
238
|
-
attempts = cast("list", job_details.get("attempts", []))
|
|
239
|
-
cur_attempt = len(attempts)
|
|
240
|
-
# spit out the available Airbyte log info
|
|
241
|
-
if cur_attempt:
|
|
242
|
-
if self._should_forward_logs:
|
|
243
|
-
log_lines = attempts[logged_attempts].get("logs", {}).get("logLines", [])
|
|
244
|
-
|
|
245
|
-
for line in log_lines[logged_lines:]:
|
|
246
|
-
sys.stdout.write(line + "\n")
|
|
247
|
-
sys.stdout.flush()
|
|
248
|
-
logged_lines = len(log_lines)
|
|
249
|
-
|
|
250
|
-
# if there's a next attempt, this one will have no more log messages
|
|
251
|
-
if logged_attempts < cur_attempt - 1:
|
|
252
|
-
logged_lines = 0
|
|
253
|
-
logged_attempts += 1
|
|
254
|
-
|
|
255
|
-
job_info = cast("dict[str, object]", job_details.get("job", {}))
|
|
256
|
-
state = job_info.get("status")
|
|
257
|
-
|
|
258
|
-
if state in (
|
|
259
|
-
AirbyteJobStatusType.RUNNING,
|
|
260
|
-
AirbyteJobStatusType.PENDING,
|
|
261
|
-
AirbyteJobStatusType.INCOMPLETE,
|
|
262
|
-
):
|
|
263
|
-
continue
|
|
264
|
-
elif state == AirbyteJobStatusType.SUCCEEDED:
|
|
265
|
-
break
|
|
266
|
-
elif state == AirbyteJobStatusType.ERROR:
|
|
267
|
-
raise Failure(f"Job failed: {job_id}")
|
|
268
|
-
elif state == AirbyteJobStatusType.CANCELLED:
|
|
269
|
-
raise Failure(f"Job was cancelled: {job_id}")
|
|
270
|
-
else:
|
|
271
|
-
raise Failure(f"Encountered unexpected state `{state}` for job_id {job_id}")
|
|
272
|
-
finally:
|
|
273
|
-
# if Airbyte sync has not completed, make sure to cancel it so that it doesn't outlive
|
|
274
|
-
# the python process
|
|
275
|
-
if (
|
|
276
|
-
state
|
|
277
|
-
not in (
|
|
278
|
-
AirbyteJobStatusType.SUCCEEDED,
|
|
279
|
-
AirbyteJobStatusType.ERROR,
|
|
280
|
-
AirbyteJobStatusType.CANCELLED,
|
|
281
|
-
)
|
|
282
|
-
and self.cancel_sync_on_run_termination
|
|
283
|
-
):
|
|
284
|
-
self.cancel_job(job_id)
|
|
285
|
-
|
|
286
|
-
return AirbyteOutput(job_details=job_details, connection_details=connection_details)
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
@superseded(
|
|
290
|
-
additional_warn_text=(
|
|
291
|
-
"Using `AirbyteCloudResource` with `build_airbyte_assets`is no longer best practice. "
|
|
292
|
-
"Use `AirbyteCloudWorkspace` with `build_airbyte_assets_definitions` instead."
|
|
293
|
-
)
|
|
294
|
-
)
|
|
295
|
-
class AirbyteCloudResource(BaseAirbyteResource):
|
|
296
|
-
"""This resource allows users to programmatically interface with the Airbyte Cloud API to launch
|
|
297
|
-
syncs and monitor their progress.
|
|
298
|
-
|
|
299
|
-
**Examples:**
|
|
300
|
-
|
|
301
|
-
.. code-block:: python
|
|
302
|
-
|
|
303
|
-
from dagster import job, EnvVar
|
|
304
|
-
from dagster_airbyte import AirbyteResource
|
|
305
|
-
|
|
306
|
-
my_airbyte_resource = AirbyteCloudResource(
|
|
307
|
-
client_id=EnvVar("AIRBYTE_CLIENT_ID"),
|
|
308
|
-
client_secret=EnvVar("AIRBYTE_CLIENT_SECRET"),
|
|
309
|
-
)
|
|
310
|
-
|
|
311
|
-
airbyte_assets = build_airbyte_assets(
|
|
312
|
-
connection_id="87b7fe85-a22c-420e-8d74-b30e7ede77df",
|
|
313
|
-
destination_tables=["releases", "tags", "teams"],
|
|
314
|
-
)
|
|
315
|
-
|
|
316
|
-
defs = Definitions(
|
|
317
|
-
assets=[airbyte_assets],
|
|
318
|
-
resources={"airbyte": my_airbyte_resource},
|
|
319
|
-
)
|
|
320
|
-
"""
|
|
321
|
-
|
|
322
|
-
client_id: str = Field(..., description="The Airbyte Cloud client ID.")
|
|
323
|
-
client_secret: str = Field(..., description="The Airbyte Cloud client secret.")
|
|
324
|
-
|
|
325
|
-
_access_token_value: Optional[str] = PrivateAttr(default=None)
|
|
326
|
-
_access_token_timestamp: Optional[float] = PrivateAttr(default=None)
|
|
327
|
-
|
|
328
|
-
def setup_for_execution(self, context: InitResourceContext) -> None:
|
|
329
|
-
# Refresh access token when the resource is initialized
|
|
330
|
-
self._refresh_access_token()
|
|
331
|
-
|
|
332
|
-
@property
|
|
333
|
-
def api_base_url(self) -> str:
|
|
334
|
-
return "https://api.airbyte.com/v1"
|
|
335
|
-
|
|
336
|
-
@property
|
|
337
|
-
def all_additional_request_params(self) -> Mapping[str, Any]:
|
|
338
|
-
# Make sure the access token is refreshed before using it when calling the API.
|
|
339
|
-
if self._needs_refreshed_access_token():
|
|
340
|
-
self._refresh_access_token()
|
|
341
|
-
return {
|
|
342
|
-
"headers": {
|
|
343
|
-
"Authorization": f"Bearer {self._access_token_value}",
|
|
344
|
-
"User-Agent": "dagster",
|
|
345
|
-
}
|
|
346
|
-
}
|
|
347
|
-
|
|
348
|
-
def make_request(
|
|
349
|
-
self,
|
|
350
|
-
endpoint: str,
|
|
351
|
-
data: Optional[Mapping[str, object]] = None,
|
|
352
|
-
method: str = "POST",
|
|
353
|
-
include_additional_request_params: bool = True,
|
|
354
|
-
) -> Optional[Mapping[str, object]]:
|
|
355
|
-
# Make sure the access token is refreshed before using it when calling the API.
|
|
356
|
-
if include_additional_request_params and self._needs_refreshed_access_token():
|
|
357
|
-
self._refresh_access_token()
|
|
358
|
-
return super().make_request(
|
|
359
|
-
endpoint=endpoint,
|
|
360
|
-
data=data,
|
|
361
|
-
method=method,
|
|
362
|
-
include_additional_request_params=include_additional_request_params,
|
|
363
|
-
)
|
|
364
|
-
|
|
365
|
-
def start_sync(self, connection_id: str) -> Mapping[str, object]:
|
|
366
|
-
job_sync = check.not_none(
|
|
367
|
-
self.make_request(
|
|
368
|
-
endpoint="/jobs",
|
|
369
|
-
data={
|
|
370
|
-
"connectionId": connection_id,
|
|
371
|
-
"jobType": "sync",
|
|
372
|
-
},
|
|
373
|
-
)
|
|
374
|
-
)
|
|
375
|
-
return {"job": {"id": job_sync["jobId"], "status": job_sync["status"]}}
|
|
376
|
-
|
|
377
|
-
def get_connection_details(self, connection_id: str) -> Mapping[str, object]:
|
|
378
|
-
return {}
|
|
379
|
-
|
|
380
|
-
def get_job_status(self, connection_id: str, job_id: int) -> Mapping[str, object]:
|
|
381
|
-
job_status = check.not_none(self.make_request(endpoint=f"/jobs/{job_id}", method="GET"))
|
|
382
|
-
return {"job": {"id": job_status["jobId"], "status": job_status["status"]}}
|
|
383
|
-
|
|
384
|
-
def cancel_job(self, job_id: int):
|
|
385
|
-
self.make_request(endpoint=f"/jobs/{job_id}", method="DELETE")
|
|
386
|
-
|
|
387
|
-
@property
|
|
388
|
-
def _should_forward_logs(self) -> bool:
|
|
389
|
-
# Airbyte Cloud does not support streaming logs yet
|
|
390
|
-
return False
|
|
391
|
-
|
|
392
|
-
def _refresh_access_token(self) -> None:
|
|
393
|
-
response = check.not_none(
|
|
394
|
-
self.make_request(
|
|
395
|
-
endpoint="/applications/token",
|
|
396
|
-
data={
|
|
397
|
-
"client_id": self.client_id,
|
|
398
|
-
"client_secret": self.client_secret,
|
|
399
|
-
},
|
|
400
|
-
# Must not pass the bearer access token when refreshing it.
|
|
401
|
-
include_additional_request_params=False,
|
|
402
|
-
)
|
|
403
|
-
)
|
|
404
|
-
self._access_token_value = str(response["access_token"])
|
|
405
|
-
self._access_token_timestamp = datetime.now().timestamp()
|
|
406
|
-
|
|
407
|
-
def _needs_refreshed_access_token(self) -> bool:
|
|
408
|
-
return (
|
|
409
|
-
not self._access_token_value
|
|
410
|
-
or not self._access_token_timestamp
|
|
411
|
-
or self._access_token_timestamp
|
|
412
|
-
<= datetime.timestamp(
|
|
413
|
-
datetime.now() - timedelta(seconds=AIRBYTE_CLOUD_REFRESH_TIMEDELTA_SECONDS)
|
|
414
|
-
)
|
|
415
|
-
)
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
class AirbyteResource(BaseAirbyteResource):
|
|
419
|
-
"""This resource allows users to programatically interface with the Airbyte REST API to launch
|
|
420
|
-
syncs and monitor their progress.
|
|
421
|
-
|
|
422
|
-
**Examples:**
|
|
423
|
-
|
|
424
|
-
.. code-block:: python
|
|
425
|
-
|
|
426
|
-
from dagster import job, EnvVar
|
|
427
|
-
from dagster_airbyte import AirbyteResource
|
|
428
|
-
|
|
429
|
-
my_airbyte_resource = AirbyteResource(
|
|
430
|
-
host=EnvVar("AIRBYTE_HOST"),
|
|
431
|
-
port=EnvVar("AIRBYTE_PORT"),
|
|
432
|
-
# If using basic auth
|
|
433
|
-
username=EnvVar("AIRBYTE_USERNAME"),
|
|
434
|
-
password=EnvVar("AIRBYTE_PASSWORD"),
|
|
435
|
-
)
|
|
436
|
-
|
|
437
|
-
airbyte_assets = build_airbyte_assets(
|
|
438
|
-
connection_id="87b7fe85-a22c-420e-8d74-b30e7ede77df",
|
|
439
|
-
destination_tables=["releases", "tags", "teams"],
|
|
440
|
-
)
|
|
441
|
-
|
|
442
|
-
defs = Definitions(
|
|
443
|
-
assets=[airbyte_assets],
|
|
444
|
-
resources={"airbyte": my_airbyte_resource},
|
|
445
|
-
)
|
|
446
|
-
"""
|
|
65
|
+
@beta
|
|
66
|
+
class AirbyteClient(DagsterModel):
|
|
67
|
+
"""This class exposes methods on top of the Airbyte APIs for Airbyte."""
|
|
447
68
|
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
username: Optional[str] = Field(default=None, description="Username if using basic auth.")
|
|
451
|
-
password: Optional[str] = Field(default=None, description="Password if using basic auth.")
|
|
452
|
-
use_https: bool = Field(
|
|
453
|
-
default=False, description="Whether to use HTTPS to connect to the Airbyte server."
|
|
454
|
-
)
|
|
455
|
-
forward_logs: bool = Field(
|
|
456
|
-
default=True,
|
|
69
|
+
rest_api_base_url: str = Field(
|
|
70
|
+
default=AIRBYTE_CLOUD_REST_API_BASE_URL,
|
|
457
71
|
description=(
|
|
458
|
-
"
|
|
459
|
-
"
|
|
72
|
+
"The base URL for the Airbyte REST API. "
|
|
73
|
+
"For Airbyte Cloud, leave this as the default. "
|
|
74
|
+
"For self-managed Airbyte, this is usually <your Airbyte host>/api/public/v1."
|
|
75
|
+
),
|
|
76
|
+
)
|
|
77
|
+
configuration_api_base_url: str = Field(
|
|
78
|
+
default=AIRBYTE_CLOUD_CONFIGURATION_API_BASE_URL,
|
|
79
|
+
description=(
|
|
80
|
+
"The base URL for the Airbyte Configuration API. "
|
|
81
|
+
"For Airbyte Cloud, leave this as the default. "
|
|
82
|
+
"For self-managed Airbyte, this is usually <your Airbyte host>/api/v1."
|
|
460
83
|
),
|
|
461
84
|
)
|
|
462
|
-
request_additional_params: Mapping[str, Any] = Field(
|
|
463
|
-
default=dict(),
|
|
464
|
-
description=(
|
|
465
|
-
"Any additional kwargs to pass to the requests library when making requests to Airbyte."
|
|
466
|
-
),
|
|
467
|
-
)
|
|
468
|
-
|
|
469
|
-
@property
|
|
470
|
-
@cached_method
|
|
471
|
-
def _state(self) -> AirbyteResourceState:
|
|
472
|
-
return AirbyteResourceState()
|
|
473
|
-
|
|
474
|
-
@property
|
|
475
|
-
@cached_method
|
|
476
|
-
def _log(self) -> logging.Logger:
|
|
477
|
-
return get_dagster_logger()
|
|
478
|
-
|
|
479
|
-
@property
|
|
480
|
-
def api_base_url(self) -> str:
|
|
481
|
-
return (
|
|
482
|
-
("https://" if self.use_https else "http://")
|
|
483
|
-
+ (f"{self.host}:{self.port}" if self.port else self.host)
|
|
484
|
-
+ "/api/v1"
|
|
485
|
-
)
|
|
486
|
-
|
|
487
|
-
@property
|
|
488
|
-
def _should_forward_logs(self) -> bool:
|
|
489
|
-
return self.forward_logs
|
|
490
|
-
|
|
491
|
-
@contextmanager
|
|
492
|
-
def cache_requests(self):
|
|
493
|
-
"""Context manager that enables caching certain requests to the Airbyte API,
|
|
494
|
-
cleared when the context is exited.
|
|
495
|
-
"""
|
|
496
|
-
self.clear_request_cache()
|
|
497
|
-
self._state.cache_enabled += 1
|
|
498
|
-
try:
|
|
499
|
-
yield
|
|
500
|
-
finally:
|
|
501
|
-
self.clear_request_cache()
|
|
502
|
-
self._state.cache_enabled -= 1
|
|
503
|
-
|
|
504
|
-
def clear_request_cache(self) -> None:
|
|
505
|
-
self._state.request_cache = {}
|
|
506
|
-
|
|
507
|
-
def make_request_cached(self, endpoint: str, data: Optional[Mapping[str, object]]):
|
|
508
|
-
if not self._state.cache_enabled > 0:
|
|
509
|
-
return self.make_request(endpoint, data)
|
|
510
|
-
data_json = json.dumps(data, sort_keys=True)
|
|
511
|
-
sha = hashlib.sha1()
|
|
512
|
-
sha.update(endpoint.encode("utf-8"))
|
|
513
|
-
sha.update(data_json.encode("utf-8"))
|
|
514
|
-
digest = sha.hexdigest()
|
|
515
|
-
|
|
516
|
-
if digest not in self._state.request_cache:
|
|
517
|
-
self._state.request_cache[digest] = self.make_request(endpoint, data)
|
|
518
|
-
return self._state.request_cache[digest]
|
|
519
|
-
|
|
520
|
-
@property
|
|
521
|
-
def all_additional_request_params(self) -> Mapping[str, Any]:
|
|
522
|
-
auth_param = (
|
|
523
|
-
{"auth": (self.username, self.password)} if self.username and self.password else {}
|
|
524
|
-
)
|
|
525
|
-
return {**auth_param, **self.request_additional_params}
|
|
526
|
-
|
|
527
|
-
def make_request( # pyright: ignore[reportIncompatibleMethodOverride]
|
|
528
|
-
self, endpoint: str, data: Optional[Mapping[str, object]]
|
|
529
|
-
) -> Optional[Mapping[str, object]]:
|
|
530
|
-
"""Creates and sends a request to the desired Airbyte REST API endpoint.
|
|
531
|
-
|
|
532
|
-
Args:
|
|
533
|
-
endpoint (str): The Airbyte API endpoint to send this request to.
|
|
534
|
-
data (Optional[str]): JSON-formatted data string to be included in the request.
|
|
535
|
-
|
|
536
|
-
Returns:
|
|
537
|
-
Optional[Dict[str, Any]]: Parsed json data from the response to this request
|
|
538
|
-
"""
|
|
539
|
-
url = self.api_base_url + endpoint
|
|
540
|
-
headers = {"accept": "application/json"}
|
|
541
|
-
|
|
542
|
-
num_retries = 0
|
|
543
|
-
while True:
|
|
544
|
-
try:
|
|
545
|
-
response = requests.request(
|
|
546
|
-
**deep_merge_dicts( # type: ignore
|
|
547
|
-
dict(
|
|
548
|
-
method="POST",
|
|
549
|
-
url=url,
|
|
550
|
-
headers=headers,
|
|
551
|
-
json=data,
|
|
552
|
-
timeout=self.request_timeout,
|
|
553
|
-
auth=(
|
|
554
|
-
(self.username, self.password)
|
|
555
|
-
if self.username and self.password
|
|
556
|
-
else None
|
|
557
|
-
),
|
|
558
|
-
),
|
|
559
|
-
self.request_additional_params,
|
|
560
|
-
),
|
|
561
|
-
)
|
|
562
|
-
response.raise_for_status()
|
|
563
|
-
if response.status_code == 204:
|
|
564
|
-
return None
|
|
565
|
-
return response.json()
|
|
566
|
-
except RequestException as e:
|
|
567
|
-
self._log.error("Request to Airbyte API failed: %s", e)
|
|
568
|
-
if num_retries == self.request_max_retries:
|
|
569
|
-
break
|
|
570
|
-
num_retries += 1
|
|
571
|
-
time.sleep(self.request_retry_delay)
|
|
572
|
-
|
|
573
|
-
raise Failure(f"Max retries ({self.request_max_retries}) exceeded with url: {url}.")
|
|
574
|
-
|
|
575
|
-
def cancel_job(self, job_id: int):
|
|
576
|
-
self.make_request(endpoint="/jobs/cancel", data={"id": job_id})
|
|
577
|
-
|
|
578
|
-
def get_default_workspace(self) -> str:
|
|
579
|
-
workspaces = cast(
|
|
580
|
-
"list[dict[str, Any]]",
|
|
581
|
-
check.not_none(self.make_request_cached(endpoint="/workspaces/list", data={})).get(
|
|
582
|
-
"workspaces", []
|
|
583
|
-
),
|
|
584
|
-
)
|
|
585
|
-
return workspaces[0]["workspaceId"]
|
|
586
|
-
|
|
587
|
-
def get_source_definition_by_name(self, name: str) -> Optional[str]:
|
|
588
|
-
name_lower = name.lower()
|
|
589
|
-
definitions = check.not_none(
|
|
590
|
-
self.make_request_cached(endpoint="/source_definitions/list", data={})
|
|
591
|
-
)
|
|
592
|
-
source_definitions = cast("list[dict[str, Any]]", definitions["sourceDefinitions"])
|
|
593
|
-
|
|
594
|
-
return next(
|
|
595
|
-
(
|
|
596
|
-
definition["sourceDefinitionId"]
|
|
597
|
-
for definition in source_definitions
|
|
598
|
-
if definition["name"].lower() == name_lower
|
|
599
|
-
),
|
|
600
|
-
None,
|
|
601
|
-
)
|
|
602
|
-
|
|
603
|
-
def get_destination_definition_by_name(self, name: str):
|
|
604
|
-
name_lower = name.lower()
|
|
605
|
-
definitions = cast(
|
|
606
|
-
"dict[str, list[dict[str, str]]]",
|
|
607
|
-
check.not_none(
|
|
608
|
-
self.make_request_cached(endpoint="/destination_definitions/list", data={})
|
|
609
|
-
),
|
|
610
|
-
)
|
|
611
|
-
return next(
|
|
612
|
-
(
|
|
613
|
-
definition["destinationDefinitionId"]
|
|
614
|
-
for definition in definitions["destinationDefinitions"]
|
|
615
|
-
if definition["name"].lower() == name_lower
|
|
616
|
-
),
|
|
617
|
-
None,
|
|
618
|
-
)
|
|
619
|
-
|
|
620
|
-
def get_source_catalog_id(self, source_id: str):
|
|
621
|
-
result = cast(
|
|
622
|
-
"dict[str, Any]",
|
|
623
|
-
check.not_none(
|
|
624
|
-
self.make_request(endpoint="/sources/discover_schema", data={"sourceId": source_id})
|
|
625
|
-
),
|
|
626
|
-
)
|
|
627
|
-
return result["catalogId"]
|
|
628
|
-
|
|
629
|
-
def get_source_schema(self, source_id: str) -> Mapping[str, Any]:
|
|
630
|
-
return cast(
|
|
631
|
-
"dict[str, Any]",
|
|
632
|
-
check.not_none(
|
|
633
|
-
self.make_request(endpoint="/sources/discover_schema", data={"sourceId": source_id})
|
|
634
|
-
),
|
|
635
|
-
)
|
|
636
|
-
|
|
637
|
-
def does_dest_support_normalization(
|
|
638
|
-
self, destination_definition_id: str, workspace_id: str
|
|
639
|
-
) -> bool:
|
|
640
|
-
# Airbyte API changed source of truth for normalization in PR
|
|
641
|
-
# https://github.com/airbytehq/airbyte/pull/21005
|
|
642
|
-
norm_dest_def_spec: bool = cast(
|
|
643
|
-
"dict[str, Any]",
|
|
644
|
-
check.not_none(
|
|
645
|
-
self.make_request_cached(
|
|
646
|
-
endpoint="/destination_definition_specifications/get",
|
|
647
|
-
data={
|
|
648
|
-
"destinationDefinitionId": destination_definition_id,
|
|
649
|
-
"workspaceId": workspace_id,
|
|
650
|
-
},
|
|
651
|
-
)
|
|
652
|
-
),
|
|
653
|
-
).get("supportsNormalization", False)
|
|
654
|
-
|
|
655
|
-
norm_dest_def: bool = (
|
|
656
|
-
cast(
|
|
657
|
-
"dict[str, Any]",
|
|
658
|
-
check.not_none(
|
|
659
|
-
self.make_request_cached(
|
|
660
|
-
endpoint="/destination_definitions/get",
|
|
661
|
-
data={
|
|
662
|
-
"destinationDefinitionId": destination_definition_id,
|
|
663
|
-
},
|
|
664
|
-
)
|
|
665
|
-
),
|
|
666
|
-
)
|
|
667
|
-
.get("normalizationConfig", {})
|
|
668
|
-
.get("supported", False)
|
|
669
|
-
)
|
|
670
|
-
|
|
671
|
-
return any([norm_dest_def_spec, norm_dest_def])
|
|
672
|
-
|
|
673
|
-
def get_job_status(self, connection_id: str, job_id: int) -> Mapping[str, object]:
|
|
674
|
-
if self.forward_logs:
|
|
675
|
-
return check.not_none(self.make_request(endpoint="/jobs/get", data={"id": job_id}))
|
|
676
|
-
else:
|
|
677
|
-
# the "list all jobs" endpoint doesn't return logs, which actually makes it much more
|
|
678
|
-
# lightweight for long-running syncs with many logs
|
|
679
|
-
out = check.not_none(
|
|
680
|
-
self.make_request(
|
|
681
|
-
endpoint="/jobs/list",
|
|
682
|
-
data={
|
|
683
|
-
"configTypes": ["sync"],
|
|
684
|
-
"configId": connection_id,
|
|
685
|
-
# sync should be the most recent, so pageSize 5 is sufficient
|
|
686
|
-
"pagination": {"pageSize": 5},
|
|
687
|
-
},
|
|
688
|
-
)
|
|
689
|
-
)
|
|
690
|
-
job = next(
|
|
691
|
-
(job for job in cast("list", out["jobs"]) if job["job"]["id"] == job_id), None
|
|
692
|
-
)
|
|
693
|
-
|
|
694
|
-
return check.not_none(job)
|
|
695
|
-
|
|
696
|
-
def start_sync(self, connection_id: str) -> Mapping[str, object]:
|
|
697
|
-
return check.not_none(
|
|
698
|
-
self.make_request(endpoint="/connections/sync", data={"connectionId": connection_id})
|
|
699
|
-
)
|
|
700
|
-
|
|
701
|
-
def get_connection_details(self, connection_id: str) -> Mapping[str, object]:
|
|
702
|
-
return check.not_none(
|
|
703
|
-
self.make_request(endpoint="/connections/get", data={"connectionId": connection_id})
|
|
704
|
-
)
|
|
705
|
-
|
|
706
|
-
def sync_and_poll(
|
|
707
|
-
self,
|
|
708
|
-
connection_id: str,
|
|
709
|
-
poll_interval: Optional[float] = None,
|
|
710
|
-
poll_timeout: Optional[float] = None,
|
|
711
|
-
) -> AirbyteOutput:
|
|
712
|
-
"""Initializes a sync operation for the given connector, and polls until it completes.
|
|
713
|
-
|
|
714
|
-
Args:
|
|
715
|
-
connection_id (str): The Airbyte Connector ID. You can retrieve this value from the
|
|
716
|
-
"Connection" tab of a given connection in the Arbyte UI.
|
|
717
|
-
poll_interval (float): The time (in seconds) that will be waited between successive polls.
|
|
718
|
-
poll_timeout (float): The maximum time that will waited before this operation is timed
|
|
719
|
-
out. By default, this will never time out.
|
|
720
|
-
|
|
721
|
-
Returns:
|
|
722
|
-
:py:class:`~AirbyteOutput`:
|
|
723
|
-
Details of the sync job.
|
|
724
|
-
"""
|
|
725
|
-
connection_details = self.get_connection_details(connection_id)
|
|
726
|
-
job_details = self.start_sync(connection_id)
|
|
727
|
-
job_info = cast("dict[str, object]", job_details.get("job", {}))
|
|
728
|
-
job_id = cast("int", job_info.get("id"))
|
|
729
|
-
|
|
730
|
-
self._log.info(f"Job {job_id} initialized for connection_id={connection_id}.")
|
|
731
|
-
start = time.monotonic()
|
|
732
|
-
logged_attempts = 0
|
|
733
|
-
logged_lines = 0
|
|
734
|
-
state = None
|
|
735
|
-
|
|
736
|
-
try:
|
|
737
|
-
while True:
|
|
738
|
-
if poll_timeout and start + poll_timeout < time.monotonic():
|
|
739
|
-
raise Failure(
|
|
740
|
-
f"Timeout: Airbyte job {job_id} is not ready after the timeout"
|
|
741
|
-
f" {poll_timeout} seconds"
|
|
742
|
-
)
|
|
743
|
-
time.sleep(poll_interval or self.poll_interval)
|
|
744
|
-
job_details = self.get_job_status(connection_id, job_id)
|
|
745
|
-
attempts = cast("list", job_details.get("attempts", []))
|
|
746
|
-
cur_attempt = len(attempts)
|
|
747
|
-
# spit out the available Airbyte log info
|
|
748
|
-
if cur_attempt:
|
|
749
|
-
if self.forward_logs:
|
|
750
|
-
log_lines = attempts[logged_attempts].get("logs", {}).get("logLines", [])
|
|
751
|
-
|
|
752
|
-
for line in log_lines[logged_lines:]:
|
|
753
|
-
sys.stdout.write(line + "\n")
|
|
754
|
-
sys.stdout.flush()
|
|
755
|
-
logged_lines = len(log_lines)
|
|
756
|
-
|
|
757
|
-
# if there's a next attempt, this one will have no more log messages
|
|
758
|
-
if logged_attempts < cur_attempt - 1:
|
|
759
|
-
logged_lines = 0
|
|
760
|
-
logged_attempts += 1
|
|
761
|
-
|
|
762
|
-
job_info = cast("dict[str, object]", job_details.get("job", {}))
|
|
763
|
-
state = job_info.get("status")
|
|
764
|
-
|
|
765
|
-
if state in (
|
|
766
|
-
AirbyteJobStatusType.RUNNING,
|
|
767
|
-
AirbyteJobStatusType.PENDING,
|
|
768
|
-
AirbyteJobStatusType.INCOMPLETE,
|
|
769
|
-
):
|
|
770
|
-
continue
|
|
771
|
-
elif state == AirbyteJobStatusType.SUCCEEDED:
|
|
772
|
-
break
|
|
773
|
-
elif state == AirbyteJobStatusType.ERROR:
|
|
774
|
-
raise Failure(f"Job failed: {job_id}")
|
|
775
|
-
elif state == AirbyteJobStatusType.CANCELLED:
|
|
776
|
-
raise Failure(f"Job was cancelled: {job_id}")
|
|
777
|
-
else:
|
|
778
|
-
raise Failure(f"Encountered unexpected state `{state}` for job_id {job_id}")
|
|
779
|
-
finally:
|
|
780
|
-
# if Airbyte sync has not completed, make sure to cancel it so that it doesn't outlive
|
|
781
|
-
# the python process
|
|
782
|
-
if (
|
|
783
|
-
state
|
|
784
|
-
not in (
|
|
785
|
-
AirbyteJobStatusType.SUCCEEDED,
|
|
786
|
-
AirbyteJobStatusType.ERROR,
|
|
787
|
-
AirbyteJobStatusType.CANCELLED,
|
|
788
|
-
)
|
|
789
|
-
and self.cancel_sync_on_run_termination
|
|
790
|
-
):
|
|
791
|
-
self.cancel_job(job_id)
|
|
792
|
-
|
|
793
|
-
return AirbyteOutput(job_details=job_details, connection_details=connection_details)
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
@dagster_maintained_resource
|
|
797
|
-
@resource(config_schema=AirbyteResource.to_config_schema())
|
|
798
|
-
def airbyte_resource(context) -> AirbyteResource:
|
|
799
|
-
"""This resource allows users to programatically interface with the Airbyte REST API to launch
|
|
800
|
-
syncs and monitor their progress. This currently implements only a subset of the functionality
|
|
801
|
-
exposed by the API.
|
|
802
|
-
|
|
803
|
-
For a complete set of documentation on the Airbyte REST API, including expected response JSON
|
|
804
|
-
schema, see the `Airbyte API Docs <https://airbyte-public-api-docs.s3.us-east-2.amazonaws.com/rapidoc-api-docs.html#overview>`_.
|
|
805
|
-
|
|
806
|
-
To configure this resource, we recommend using the `configured
|
|
807
|
-
<https://legacy-docs.dagster.io/concepts/configuration/configured>`_ method.
|
|
808
|
-
|
|
809
|
-
**Examples:**
|
|
810
|
-
|
|
811
|
-
.. code-block:: python
|
|
812
|
-
|
|
813
|
-
from dagster import job
|
|
814
|
-
from dagster_airbyte import airbyte_resource
|
|
815
|
-
|
|
816
|
-
my_airbyte_resource = airbyte_resource.configured(
|
|
817
|
-
{
|
|
818
|
-
"host": {"env": "AIRBYTE_HOST"},
|
|
819
|
-
"port": {"env": "AIRBYTE_PORT"},
|
|
820
|
-
# If using basic auth
|
|
821
|
-
"username": {"env": "AIRBYTE_USERNAME"},
|
|
822
|
-
"password": {"env": "AIRBYTE_PASSWORD"},
|
|
823
|
-
}
|
|
824
|
-
)
|
|
825
|
-
|
|
826
|
-
@job(resource_defs={"airbyte":my_airbyte_resource})
|
|
827
|
-
def my_airbyte_job():
|
|
828
|
-
...
|
|
829
|
-
|
|
830
|
-
"""
|
|
831
|
-
return AirbyteResource.from_resource_context(context)
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
@superseded(additional_warn_text=("Use `AirbyteCloudWorkspace` instead."))
|
|
835
|
-
@dagster_maintained_resource
|
|
836
|
-
@resource(config_schema=infer_schema_from_config_class(AirbyteCloudResource))
|
|
837
|
-
def airbyte_cloud_resource(context) -> AirbyteCloudResource:
|
|
838
|
-
"""This resource allows users to programatically interface with the Airbyte Cloud REST API to launch
|
|
839
|
-
syncs and monitor their progress. Currently, this resource may only be used with the more basic
|
|
840
|
-
`dagster-airbyte` APIs, including the ops and assets.
|
|
841
|
-
|
|
842
|
-
"""
|
|
843
|
-
return AirbyteCloudResource.from_resource_context(context)
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
# -------------
|
|
847
|
-
# Resources v2
|
|
848
|
-
# -------------
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
@beta
|
|
852
|
-
class AirbyteCloudClient(DagsterModel):
|
|
853
|
-
"""This class exposes methods on top of the Airbyte APIs for Airbyte Cloud."""
|
|
854
|
-
|
|
855
85
|
workspace_id: str = Field(..., description="The Airbyte workspace ID")
|
|
856
|
-
client_id: str = Field(
|
|
857
|
-
client_secret: str = Field(
|
|
86
|
+
client_id: Optional[str] = Field(default=None, description="The Airbyte client ID.")
|
|
87
|
+
client_secret: Optional[str] = Field(default=None, description="The Airbyte client secret.")
|
|
88
|
+
username: Optional[str] = Field(
|
|
89
|
+
default=None,
|
|
90
|
+
description="The Airbyte username for authentication. Used for self-managed Airbyte with basic auth.",
|
|
91
|
+
)
|
|
92
|
+
password: Optional[str] = Field(
|
|
93
|
+
default=None,
|
|
94
|
+
description="The Airbyte password for authentication. Used for self-managed Airbyte with basic auth.",
|
|
95
|
+
)
|
|
858
96
|
request_max_retries: int = Field(
|
|
859
97
|
...,
|
|
860
98
|
description=(
|
|
@@ -870,30 +108,82 @@ class AirbyteCloudClient(DagsterModel):
|
|
|
870
108
|
...,
|
|
871
109
|
description="Time (in seconds) after which the requests to Airbyte are declared timed out.",
|
|
872
110
|
)
|
|
111
|
+
max_items_per_page: int = Field(
|
|
112
|
+
default=100,
|
|
113
|
+
description=(
|
|
114
|
+
"The maximum number of items per page. "
|
|
115
|
+
"Used for paginated resources like connections, destinations, etc. "
|
|
116
|
+
),
|
|
117
|
+
)
|
|
118
|
+
poll_interval: float = Field(
|
|
119
|
+
default=DEFAULT_POLL_INTERVAL_SECONDS,
|
|
120
|
+
description="The time (in seconds) that will be waited between successive polls.",
|
|
121
|
+
)
|
|
122
|
+
poll_timeout: Optional[float] = Field(
|
|
123
|
+
default=None,
|
|
124
|
+
description=(
|
|
125
|
+
"The maximum time that will wait before this operation is timed "
|
|
126
|
+
"out. By default, this will never time out."
|
|
127
|
+
),
|
|
128
|
+
)
|
|
129
|
+
cancel_on_termination: bool = Field(
|
|
130
|
+
default=True,
|
|
131
|
+
description=(
|
|
132
|
+
"Whether to cancel a sync in Airbyte if the Dagster runner is terminated. "
|
|
133
|
+
"This may be useful to disable if using Airbyte sources that cannot be cancelled and "
|
|
134
|
+
"resumed easily, or if your Dagster deployment may experience runner interruptions "
|
|
135
|
+
"that do not impact your Airbyte deployment."
|
|
136
|
+
),
|
|
137
|
+
)
|
|
138
|
+
poll_previous_running_sync: bool = Field(
|
|
139
|
+
default=False,
|
|
140
|
+
description=(
|
|
141
|
+
"If set to True, Dagster will check for previous running sync for the same connection "
|
|
142
|
+
"and begin polling it instead of starting a new sync."
|
|
143
|
+
),
|
|
144
|
+
)
|
|
873
145
|
|
|
874
146
|
_access_token_value: Optional[str] = PrivateAttr(default=None)
|
|
875
147
|
_access_token_timestamp: Optional[float] = PrivateAttr(default=None)
|
|
876
148
|
|
|
149
|
+
@model_validator(mode="before")
|
|
150
|
+
def validate_authentication(cls, values):
|
|
151
|
+
has_client_id = values.get("client_id") is not None
|
|
152
|
+
has_client_secret = values.get("client_secret") is not None
|
|
153
|
+
has_username = values.get("username") is not None
|
|
154
|
+
has_password = values.get("password") is not None
|
|
155
|
+
|
|
156
|
+
check.invariant(
|
|
157
|
+
has_username == has_password,
|
|
158
|
+
"Missing config: both username and password are required for Airbyte authentication.",
|
|
159
|
+
)
|
|
160
|
+
|
|
161
|
+
check.invariant(
|
|
162
|
+
has_client_id == has_client_secret,
|
|
163
|
+
"Missing config: both client_id and client_secret are required for Airbyte authentication.",
|
|
164
|
+
)
|
|
165
|
+
|
|
166
|
+
check.invariant(
|
|
167
|
+
not ((has_client_id or has_client_secret) and (has_username or has_password)),
|
|
168
|
+
"Invalid config: cannot provide both client_id/client_secret and username/password for Airbyte authentication.",
|
|
169
|
+
)
|
|
170
|
+
return values
|
|
171
|
+
|
|
877
172
|
@property
|
|
878
173
|
@cached_method
|
|
879
174
|
def _log(self) -> logging.Logger:
|
|
880
175
|
return get_dagster_logger()
|
|
881
176
|
|
|
882
177
|
@property
|
|
883
|
-
def
|
|
884
|
-
return
|
|
885
|
-
|
|
886
|
-
@property
|
|
887
|
-
def configuration_api_base_url(self) -> str:
|
|
888
|
-
return f"{AIRBYTE_CONFIGURATION_API_BASE}/{AIRBYTE_CONFIGURATION_API_VERSION}"
|
|
178
|
+
def all_additional_request_headers(self) -> Mapping[str, Any]:
|
|
179
|
+
return {**self.authorization_request_headers, **self.user_agent_request_headers}
|
|
889
180
|
|
|
890
181
|
@property
|
|
891
|
-
def
|
|
892
|
-
return {**self.authorization_request_params, **self.user_agent_request_params}
|
|
893
|
-
|
|
894
|
-
@property
|
|
895
|
-
def authorization_request_params(self) -> Mapping[str, Any]:
|
|
182
|
+
def authorization_request_headers(self) -> Mapping[str, Any]:
|
|
896
183
|
# Make sure the access token is refreshed before using it when calling the API.
|
|
184
|
+
if not (self.client_id and self.client_secret):
|
|
185
|
+
return {}
|
|
186
|
+
|
|
897
187
|
if self._needs_refreshed_access_token():
|
|
898
188
|
self._refresh_access_token()
|
|
899
189
|
return {
|
|
@@ -901,23 +191,22 @@ class AirbyteCloudClient(DagsterModel):
|
|
|
901
191
|
}
|
|
902
192
|
|
|
903
193
|
@property
|
|
904
|
-
def
|
|
194
|
+
def user_agent_request_headers(self) -> Mapping[str, Any]:
|
|
905
195
|
return {
|
|
906
196
|
"User-Agent": "dagster",
|
|
907
197
|
}
|
|
908
198
|
|
|
909
199
|
def _refresh_access_token(self) -> None:
|
|
910
200
|
response = check.not_none(
|
|
911
|
-
self.
|
|
201
|
+
self._single_request(
|
|
912
202
|
method="POST",
|
|
913
|
-
|
|
914
|
-
base_url=self.rest_api_base_url,
|
|
203
|
+
url=f"{self.rest_api_base_url}/applications/token",
|
|
915
204
|
data={
|
|
916
205
|
"client_id": self.client_id,
|
|
917
206
|
"client_secret": self.client_secret,
|
|
918
207
|
},
|
|
919
208
|
# Must not pass the bearer access token when refreshing it.
|
|
920
|
-
|
|
209
|
+
include_additional_request_headers=False,
|
|
921
210
|
)
|
|
922
211
|
)
|
|
923
212
|
self._access_token_value = str(response["access_token"])
|
|
@@ -928,52 +217,38 @@ class AirbyteCloudClient(DagsterModel):
|
|
|
928
217
|
not self._access_token_value
|
|
929
218
|
or not self._access_token_timestamp
|
|
930
219
|
or self._access_token_timestamp
|
|
931
|
-
<= (
|
|
932
|
-
datetime.now() - timedelta(seconds=AIRBYTE_CLOUD_REFRESH_TIMEDELTA_SECONDS)
|
|
933
|
-
).timestamp()
|
|
220
|
+
<= (datetime.now() - timedelta(seconds=AIRBYTE_REFRESH_TIMEDELTA_SECONDS)).timestamp()
|
|
934
221
|
)
|
|
935
222
|
|
|
936
|
-
def _get_session(self,
|
|
223
|
+
def _get_session(self, include_additional_request_headers: bool) -> requests.Session:
|
|
937
224
|
headers = {"accept": "application/json"}
|
|
938
|
-
if
|
|
225
|
+
if include_additional_request_headers:
|
|
939
226
|
headers = {
|
|
940
227
|
**headers,
|
|
941
|
-
**self.
|
|
228
|
+
**self.all_additional_request_headers,
|
|
942
229
|
}
|
|
943
230
|
session = requests.Session()
|
|
944
231
|
session.headers.update(headers)
|
|
232
|
+
|
|
233
|
+
if self.username and self.password:
|
|
234
|
+
session.auth = (self.username, self.password)
|
|
235
|
+
|
|
945
236
|
return session
|
|
946
237
|
|
|
947
|
-
def
|
|
238
|
+
def _single_request(
|
|
948
239
|
self,
|
|
949
240
|
method: str,
|
|
950
|
-
|
|
951
|
-
base_url: str,
|
|
241
|
+
url: str,
|
|
952
242
|
data: Optional[Mapping[str, Any]] = None,
|
|
953
243
|
params: Optional[Mapping[str, Any]] = None,
|
|
954
|
-
|
|
244
|
+
include_additional_request_headers: bool = True,
|
|
955
245
|
) -> Mapping[str, Any]:
|
|
956
|
-
"""
|
|
957
|
-
|
|
958
|
-
Args:
|
|
959
|
-
method (str): The http method to use for this request (e.g. "POST", "GET", "PATCH").
|
|
960
|
-
endpoint (str): The Airbyte API endpoint to send this request to.
|
|
961
|
-
base_url (str): The base url to the Airbyte API to use.
|
|
962
|
-
data (Optional[Dict[str, Any]]): JSON-formatted data string to be included in the request.
|
|
963
|
-
params (Optional[Dict[str, Any]]): JSON-formatted query params to be included in the request.
|
|
964
|
-
include_additional_request_params (bool): Whether to include authorization and user-agent headers
|
|
965
|
-
to the request parameters. Defaults to True.
|
|
966
|
-
|
|
967
|
-
Returns:
|
|
968
|
-
Dict[str, Any]: Parsed json data from the response to this request
|
|
969
|
-
"""
|
|
970
|
-
url = f"{base_url}/{endpoint}"
|
|
971
|
-
|
|
246
|
+
"""Execute a single HTTP request with retry logic."""
|
|
972
247
|
num_retries = 0
|
|
973
248
|
while True:
|
|
974
249
|
try:
|
|
975
250
|
session = self._get_session(
|
|
976
|
-
|
|
251
|
+
include_additional_request_headers=include_additional_request_headers
|
|
977
252
|
)
|
|
978
253
|
response = session.request(
|
|
979
254
|
method=method, url=url, json=data, params=params, timeout=self.request_timeout
|
|
@@ -989,17 +264,75 @@ class AirbyteCloudClient(DagsterModel):
|
|
|
989
264
|
num_retries += 1
|
|
990
265
|
time.sleep(self.request_retry_delay)
|
|
991
266
|
|
|
992
|
-
|
|
267
|
+
raise Failure(f"Max retries ({self.request_max_retries}) exceeded with url: {url}.")
|
|
268
|
+
|
|
269
|
+
return {}
|
|
270
|
+
|
|
271
|
+
def _paginated_request(
|
|
272
|
+
self,
|
|
273
|
+
method: str,
|
|
274
|
+
url: str,
|
|
275
|
+
params: dict[str, Any],
|
|
276
|
+
data: Optional[Mapping[str, Any]] = None,
|
|
277
|
+
include_additional_request_params: bool = True,
|
|
278
|
+
) -> Sequence[Mapping[str, Any]]:
|
|
279
|
+
"""Execute paginated requests and yield all items."""
|
|
280
|
+
result_data = []
|
|
281
|
+
params = {"limit": self.max_items_per_page, **params}
|
|
282
|
+
while True:
|
|
283
|
+
response = self._single_request(
|
|
284
|
+
method=method,
|
|
285
|
+
url=url,
|
|
286
|
+
data=data,
|
|
287
|
+
params=params,
|
|
288
|
+
include_additional_request_headers=include_additional_request_params,
|
|
289
|
+
)
|
|
290
|
+
|
|
291
|
+
# Handle different response structures
|
|
292
|
+
result_data.extend(response.get("data", []))
|
|
293
|
+
next_url = response.get("next", "")
|
|
294
|
+
if not next_url:
|
|
295
|
+
break
|
|
296
|
+
|
|
297
|
+
# Parse the query string for the next page
|
|
298
|
+
next_params = parse_qsl(urlparse(next_url).query)
|
|
299
|
+
# Overwrite the pagination params with the ones for the next page
|
|
300
|
+
params.update(dict(next_params))
|
|
301
|
+
|
|
302
|
+
return result_data
|
|
993
303
|
|
|
994
|
-
def
|
|
304
|
+
def validate_workspace_id(self) -> None:
|
|
305
|
+
"""Fetches workspace details. This is used to validate that the workspace exists."""
|
|
306
|
+
self._single_request(
|
|
307
|
+
method="GET",
|
|
308
|
+
url=f"{self.rest_api_base_url}/workspaces/{self.workspace_id}",
|
|
309
|
+
)
|
|
310
|
+
|
|
311
|
+
def get_connections(self) -> Sequence[Mapping[str, Any]]:
|
|
995
312
|
"""Fetches all connections of an Airbyte workspace from the Airbyte REST API."""
|
|
996
|
-
return self.
|
|
313
|
+
return self._paginated_request(
|
|
997
314
|
method="GET",
|
|
998
|
-
|
|
999
|
-
base_url=self.rest_api_base_url,
|
|
315
|
+
url=f"{self.rest_api_base_url}/connections",
|
|
1000
316
|
params={"workspaceIds": self.workspace_id},
|
|
1001
317
|
)
|
|
1002
318
|
|
|
319
|
+
def get_jobs_for_connection(
|
|
320
|
+
self, connection_id: str, created_after: datetime | None = None
|
|
321
|
+
) -> Sequence[AirbyteJob]:
|
|
322
|
+
"""Fetches all jobs for a specific connection of an Airbyte workspace from the Airbyte REST API."""
|
|
323
|
+
params = {"workspaceIds": self.workspace_id, "connectionId": connection_id}
|
|
324
|
+
if created_after:
|
|
325
|
+
params["createdAtStart"] = created_after.strftime("%Y-%m-%dT%H:%M:%SZ")
|
|
326
|
+
|
|
327
|
+
return [
|
|
328
|
+
AirbyteJob.from_job_details(job_details=job_details)
|
|
329
|
+
for job_details in self._paginated_request(
|
|
330
|
+
method="GET",
|
|
331
|
+
url=f"{self.rest_api_base_url}/jobs",
|
|
332
|
+
params=params,
|
|
333
|
+
)
|
|
334
|
+
]
|
|
335
|
+
|
|
1003
336
|
def get_connection_details(self, connection_id) -> Mapping[str, Any]:
|
|
1004
337
|
"""Fetches details about a given connection from the Airbyte Configuration API.
|
|
1005
338
|
The Airbyte Configuration API is an internal and may change in the future.
|
|
@@ -1007,26 +340,23 @@ class AirbyteCloudClient(DagsterModel):
|
|
|
1007
340
|
# Using the Airbyte Configuration API to get the connection details, including streams and their configs.
|
|
1008
341
|
# https://airbyte-public-api-docs.s3.us-east-2.amazonaws.com/rapidoc-api-docs.html#post-/v1/connections/get
|
|
1009
342
|
# https://github.com/airbytehq/airbyte-platform/blob/v1.0.0/airbyte-api/server-api/src/main/openapi/config.yaml
|
|
1010
|
-
return self.
|
|
343
|
+
return self._single_request(
|
|
1011
344
|
method="POST",
|
|
1012
|
-
|
|
1013
|
-
base_url=self.configuration_api_base_url,
|
|
345
|
+
url=f"{self.configuration_api_base_url}/connections/get",
|
|
1014
346
|
data={"connectionId": connection_id},
|
|
1015
347
|
)
|
|
1016
348
|
|
|
1017
349
|
def get_destination_details(self, destination_id: str) -> Mapping[str, Any]:
|
|
1018
350
|
"""Fetches details about a given destination from the Airbyte REST API."""
|
|
1019
|
-
return self.
|
|
351
|
+
return self._single_request(
|
|
1020
352
|
method="GET",
|
|
1021
|
-
|
|
1022
|
-
base_url=self.rest_api_base_url,
|
|
353
|
+
url=f"{self.rest_api_base_url}/destinations/{destination_id}",
|
|
1023
354
|
)
|
|
1024
355
|
|
|
1025
356
|
def start_sync_job(self, connection_id: str) -> Mapping[str, Any]:
|
|
1026
|
-
return self.
|
|
357
|
+
return self._single_request(
|
|
1027
358
|
method="POST",
|
|
1028
|
-
|
|
1029
|
-
base_url=self.rest_api_base_url,
|
|
359
|
+
url=f"{self.rest_api_base_url}/jobs",
|
|
1030
360
|
data={
|
|
1031
361
|
"connectionId": connection_id,
|
|
1032
362
|
"jobType": "sync",
|
|
@@ -1034,59 +364,76 @@ class AirbyteCloudClient(DagsterModel):
|
|
|
1034
364
|
)
|
|
1035
365
|
|
|
1036
366
|
def get_job_details(self, job_id: int) -> Mapping[str, Any]:
|
|
1037
|
-
return self.
|
|
1038
|
-
method="GET",
|
|
367
|
+
return self._single_request(
|
|
368
|
+
method="GET",
|
|
369
|
+
url=f"{self.rest_api_base_url}/jobs/{job_id}",
|
|
1039
370
|
)
|
|
1040
371
|
|
|
1041
372
|
def cancel_job(self, job_id: int) -> Mapping[str, Any]:
|
|
1042
|
-
return self.
|
|
1043
|
-
method="DELETE",
|
|
373
|
+
return self._single_request(
|
|
374
|
+
method="DELETE",
|
|
375
|
+
url=f"{self.rest_api_base_url}/jobs/{job_id}",
|
|
1044
376
|
)
|
|
1045
377
|
|
|
1046
|
-
def sync_and_poll(
|
|
1047
|
-
self,
|
|
1048
|
-
connection_id: str,
|
|
1049
|
-
poll_interval: Optional[float] = None,
|
|
1050
|
-
poll_timeout: Optional[float] = None,
|
|
1051
|
-
cancel_on_termination: bool = True,
|
|
1052
|
-
) -> AirbyteOutput:
|
|
378
|
+
def sync_and_poll(self, connection_id: str) -> AirbyteOutput:
|
|
1053
379
|
"""Initializes a sync operation for the given connection, and polls until it completes.
|
|
1054
380
|
|
|
1055
381
|
Args:
|
|
1056
382
|
connection_id (str): The Airbyte Connection ID. You can retrieve this value from the
|
|
1057
383
|
"Connection" tab of a given connection in the Airbyte UI.
|
|
1058
|
-
poll_interval (float): The time (in seconds) that will be waited between successive polls.
|
|
1059
|
-
poll_timeout (float): The maximum time that will wait before this operation is timed
|
|
1060
|
-
out. By default, this will never time out.
|
|
1061
|
-
cancel_on_termination (bool): Whether to cancel a sync in Airbyte if the Dagster runner is terminated.
|
|
1062
|
-
This may be useful to disable if using Airbyte sources that cannot be cancelled and
|
|
1063
|
-
resumed easily, or if your Dagster deployment may experience runner interruptions
|
|
1064
|
-
that do not impact your Airbyte deployment.
|
|
1065
384
|
|
|
1066
385
|
Returns:
|
|
1067
386
|
:py:class:`~AirbyteOutput`:
|
|
1068
387
|
Details of the sync job.
|
|
1069
388
|
"""
|
|
1070
389
|
connection_details = self.get_connection_details(connection_id)
|
|
1071
|
-
start_job_details = self.start_sync_job(connection_id)
|
|
1072
|
-
job = AirbyteJob.from_job_details(job_details=start_job_details)
|
|
1073
390
|
|
|
1074
|
-
|
|
391
|
+
existing_jobs = [
|
|
392
|
+
job
|
|
393
|
+
for job in self.get_jobs_for_connection(
|
|
394
|
+
connection_id=connection_id,
|
|
395
|
+
created_after=datetime.now() - timedelta(days=2),
|
|
396
|
+
)
|
|
397
|
+
if job.status
|
|
398
|
+
in (
|
|
399
|
+
AirbyteJobStatusType.RUNNING,
|
|
400
|
+
AirbyteJobStatusType.PENDING,
|
|
401
|
+
AirbyteJobStatusType.INCOMPLETE,
|
|
402
|
+
)
|
|
403
|
+
]
|
|
404
|
+
|
|
405
|
+
if not existing_jobs:
|
|
406
|
+
start_job_details = self.start_sync_job(connection_id)
|
|
407
|
+
job = AirbyteJob.from_job_details(job_details=start_job_details)
|
|
408
|
+
self._log.info(f"Job {job.id} initialized for connection_id={connection_id}.")
|
|
409
|
+
else:
|
|
410
|
+
if self.poll_previous_running_sync:
|
|
411
|
+
if len(existing_jobs) == 1:
|
|
412
|
+
job = existing_jobs[0]
|
|
413
|
+
self._log.info(
|
|
414
|
+
f"Job {job.id} already running for connection_id={connection_id}. Resume polling."
|
|
415
|
+
)
|
|
416
|
+
else:
|
|
417
|
+
raise Failure(f"Found multiple running jobs for connection_id={connection_id}.")
|
|
418
|
+
else:
|
|
419
|
+
raise Failure(f"Found sync job for connection_id={connection_id} already running.")
|
|
420
|
+
|
|
1075
421
|
poll_start = datetime.now()
|
|
1076
|
-
|
|
1077
|
-
poll_interval if poll_interval is not None else DEFAULT_POLL_INTERVAL_SECONDS
|
|
1078
|
-
)
|
|
422
|
+
|
|
1079
423
|
try:
|
|
1080
424
|
while True:
|
|
1081
|
-
if poll_timeout and datetime.now() > poll_start + timedelta(
|
|
425
|
+
if self.poll_timeout and datetime.now() > poll_start + timedelta(
|
|
426
|
+
seconds=self.poll_timeout
|
|
427
|
+
):
|
|
1082
428
|
raise Failure(
|
|
1083
429
|
f"Timeout: Airbyte job {job.id} is not ready after the timeout"
|
|
1084
|
-
f" {poll_timeout} seconds"
|
|
430
|
+
f" {self.poll_timeout} seconds"
|
|
1085
431
|
)
|
|
1086
432
|
|
|
1087
|
-
time.sleep(poll_interval)
|
|
433
|
+
time.sleep(self.poll_interval)
|
|
1088
434
|
# We return these job details in the AirbyteOutput when the job succeeds
|
|
1089
435
|
poll_job_details = self.get_job_details(job.id)
|
|
436
|
+
self._log.debug(poll_job_details)
|
|
1090
437
|
job = AirbyteJob.from_job_details(job_details=poll_job_details)
|
|
1091
438
|
if job.status in (
|
|
1092
439
|
AirbyteJobStatusType.RUNNING,
|
|
@@ -1107,7 +454,7 @@ class AirbyteCloudClient(DagsterModel):
|
|
|
1107
454
|
finally:
|
|
1108
455
|
# if Airbyte sync has not completed, make sure to cancel it so that it doesn't outlive
|
|
1109
456
|
# the python process
|
|
1110
|
-
if cancel_on_termination and job.status not in (
|
|
457
|
+
if self.cancel_on_termination and job.status not in (
|
|
1111
458
|
AirbyteJobStatusType.SUCCEEDED,
|
|
1112
459
|
AirbyteJobStatusType.ERROR,
|
|
1113
460
|
AirbyteJobStatusType.CANCELLED,
|
|
@@ -1119,14 +466,11 @@ class AirbyteCloudClient(DagsterModel):
|
|
|
1119
466
|
|
|
1120
467
|
|
|
1121
468
|
@beta
|
|
1122
|
-
class
|
|
1123
|
-
"""This class represents a Airbyte
|
|
469
|
+
class BaseAirbyteWorkspace(ConfigurableResource):
|
|
470
|
+
"""This class represents a Airbyte workspace and provides utilities
|
|
1124
471
|
to interact with Airbyte APIs.
|
|
1125
472
|
"""
|
|
1126
473
|
|
|
1127
|
-
workspace_id: str = Field(..., description="The Airbyte Cloud workspace ID")
|
|
1128
|
-
client_id: str = Field(..., description="The Airbyte Cloud client ID.")
|
|
1129
|
-
client_secret: str = Field(..., description="The Airbyte Cloud client secret.")
|
|
1130
474
|
request_max_retries: int = Field(
|
|
1131
475
|
default=3,
|
|
1132
476
|
description=(
|
|
@@ -1142,19 +486,42 @@ class AirbyteCloudWorkspace(ConfigurableResource):
|
|
|
1142
486
|
default=15,
|
|
1143
487
|
description="Time (in seconds) after which the requests to Airbyte are declared timed out.",
|
|
1144
488
|
)
|
|
489
|
+
max_items_per_page: int = Field(
|
|
490
|
+
default=100,
|
|
491
|
+
description=(
|
|
492
|
+
"The maximum number of items per page. "
|
|
493
|
+
"Used for paginated resources like connections, destinations, etc. "
|
|
494
|
+
),
|
|
495
|
+
)
|
|
496
|
+
poll_interval: float = Field(
|
|
497
|
+
default=DEFAULT_POLL_INTERVAL_SECONDS,
|
|
498
|
+
description="The time (in seconds) that will be waited between successive polls.",
|
|
499
|
+
)
|
|
500
|
+
poll_timeout: Optional[float] = Field(
|
|
501
|
+
default=None,
|
|
502
|
+
description=(
|
|
503
|
+
"The maximum time that will wait before this operation is timed "
|
|
504
|
+
"out. By default, this will never time out."
|
|
505
|
+
),
|
|
506
|
+
)
|
|
507
|
+
cancel_on_termination: bool = Field(
|
|
508
|
+
default=True,
|
|
509
|
+
description=(
|
|
510
|
+
"Whether to cancel a sync in Airbyte if the Dagster runner is terminated. "
|
|
511
|
+
"This may be useful to disable if using Airbyte sources that cannot be cancelled and "
|
|
512
|
+
"resumed easily, or if your Dagster deployment may experience runner interruptions "
|
|
513
|
+
"that do not impact your Airbyte deployment."
|
|
514
|
+
),
|
|
515
|
+
)
|
|
516
|
+
poll_previous_running_sync: bool = Field(
|
|
517
|
+
default=False,
|
|
518
|
+
description=(
|
|
519
|
+
"If set to True, Dagster will check for previous running sync for the same connection "
|
|
520
|
+
"and begin polling it instead of starting a new sync."
|
|
521
|
+
),
|
|
522
|
+
)
|
|
1145
523
|
|
|
1146
|
-
_client:
|
|
1147
|
-
|
|
1148
|
-
@cached_method
|
|
1149
|
-
def get_client(self) -> AirbyteCloudClient:
|
|
1150
|
-
return AirbyteCloudClient(
|
|
1151
|
-
workspace_id=self.workspace_id,
|
|
1152
|
-
client_id=self.client_id,
|
|
1153
|
-
client_secret=self.client_secret,
|
|
1154
|
-
request_max_retries=self.request_max_retries,
|
|
1155
|
-
request_retry_delay=self.request_retry_delay,
|
|
1156
|
-
request_timeout=self.request_timeout,
|
|
1157
|
-
)
|
|
524
|
+
_client: AirbyteClient = PrivateAttr(default=None) # type: ignore
|
|
1158
525
|
|
|
1159
526
|
@cached_method
|
|
1160
527
|
def fetch_airbyte_workspace_data(
|
|
@@ -1169,7 +536,10 @@ class AirbyteCloudWorkspace(ConfigurableResource):
|
|
|
1169
536
|
destinations_by_id = {}
|
|
1170
537
|
|
|
1171
538
|
client = self.get_client()
|
|
1172
|
-
|
|
539
|
+
|
|
540
|
+
client.validate_workspace_id()
|
|
541
|
+
|
|
542
|
+
connections = client.get_connections()
|
|
1173
543
|
|
|
1174
544
|
for partial_connection_details in connections:
|
|
1175
545
|
full_connection_details = client.get_connection_details(
|
|
@@ -1197,6 +567,7 @@ class AirbyteCloudWorkspace(ConfigurableResource):
|
|
|
1197
567
|
def load_asset_specs(
|
|
1198
568
|
self,
|
|
1199
569
|
dagster_airbyte_translator: Optional[DagsterAirbyteTranslator] = None,
|
|
570
|
+
connection_selector_fn: Optional[Callable[[AirbyteConnection], bool]] = None,
|
|
1200
571
|
) -> Sequence[AssetSpec]:
|
|
1201
572
|
"""Returns a list of AssetSpecs representing the Airbyte content in the workspace.
|
|
1202
573
|
|
|
@@ -1204,6 +575,8 @@ class AirbyteCloudWorkspace(ConfigurableResource):
|
|
|
1204
575
|
dagster_airbyte_translator (Optional[DagsterAirbyteTranslator], optional): The translator to use
|
|
1205
576
|
to convert Airbyte content into :py:class:`dagster.AssetSpec`.
|
|
1206
577
|
Defaults to :py:class:`DagsterAirbyteTranslator`.
|
|
578
|
+
connection_selector_fn (Optional[Callable[[AirbyteConnection], bool]]): A function that allows for filtering
|
|
579
|
+
which Airbyte connection assets are created for.
|
|
1207
580
|
|
|
1208
581
|
Returns:
|
|
1209
582
|
List[AssetSpec]: The set of assets representing the Airbyte content in the workspace.
|
|
@@ -1212,23 +585,25 @@ class AirbyteCloudWorkspace(ConfigurableResource):
|
|
|
1212
585
|
Loading the asset specs for a given Airbyte workspace:
|
|
1213
586
|
.. code-block:: python
|
|
1214
587
|
|
|
1215
|
-
from dagster_airbyte import
|
|
588
|
+
from dagster_airbyte import AirbyteWorkspace
|
|
1216
589
|
|
|
1217
590
|
import dagster as dg
|
|
1218
591
|
|
|
1219
|
-
airbyte_workspace =
|
|
1220
|
-
workspace_id=dg.EnvVar("
|
|
1221
|
-
client_id=dg.EnvVar("
|
|
1222
|
-
client_secret=dg.EnvVar("
|
|
592
|
+
airbyte_workspace = AirbyteWorkspace(
|
|
593
|
+
workspace_id=dg.EnvVar("AIRBYTE_WORKSPACE_ID"),
|
|
594
|
+
client_id=dg.EnvVar("AIRBYTE_CLIENT_ID"),
|
|
595
|
+
client_secret=dg.EnvVar("AIRBYTE_CLIENT_SECRET"),
|
|
1223
596
|
)
|
|
1224
597
|
|
|
1225
598
|
airbyte_specs = airbyte_workspace.load_asset_specs()
|
|
1226
|
-
|
|
599
|
+
dg.Definitions(assets=airbyte_specs, resources={"airbyte": airbyte_workspace})
|
|
1227
600
|
"""
|
|
1228
601
|
dagster_airbyte_translator = dagster_airbyte_translator or DagsterAirbyteTranslator()
|
|
1229
602
|
|
|
1230
|
-
return
|
|
1231
|
-
workspace=self,
|
|
603
|
+
return load_airbyte_asset_specs(
|
|
604
|
+
workspace=self,
|
|
605
|
+
dagster_airbyte_translator=dagster_airbyte_translator,
|
|
606
|
+
connection_selector_fn=connection_selector_fn,
|
|
1232
607
|
)
|
|
1233
608
|
|
|
1234
609
|
def _generate_materialization(
|
|
@@ -1263,7 +638,7 @@ class AirbyteCloudWorkspace(ConfigurableResource):
|
|
|
1263
638
|
yield AssetMaterialization(
|
|
1264
639
|
asset_key=stream_asset_spec.key,
|
|
1265
640
|
description=(
|
|
1266
|
-
f"Table generated via Airbyte
|
|
641
|
+
f"Table generated via Airbyte sync "
|
|
1267
642
|
f"for connection {connection.name}: {connection_table_name}"
|
|
1268
643
|
),
|
|
1269
644
|
metadata=stream_asset_spec.metadata,
|
|
@@ -1272,7 +647,7 @@ class AirbyteCloudWorkspace(ConfigurableResource):
|
|
|
1272
647
|
@public
|
|
1273
648
|
@beta
|
|
1274
649
|
def sync_and_poll(self, context: AssetExecutionContext):
|
|
1275
|
-
"""Executes a sync and poll process to materialize Airbyte
|
|
650
|
+
"""Executes a sync and poll process to materialize Airbyte assets.
|
|
1276
651
|
This method can only be used in the context of an asset execution.
|
|
1277
652
|
|
|
1278
653
|
Args:
|
|
@@ -1317,53 +692,264 @@ class AirbyteCloudWorkspace(ConfigurableResource):
|
|
|
1317
692
|
if unmaterialized_asset_keys:
|
|
1318
693
|
context.log.warning(f"Assets were not materialized: {unmaterialized_asset_keys}")
|
|
1319
694
|
|
|
695
|
+
@contextmanager
|
|
696
|
+
def process_config_and_initialize_cm_cached(self) -> Iterator["AirbyteWorkspace"]:
|
|
697
|
+
# Hack to avoid reconstructing initialized copies of this resource, which invalidates
|
|
698
|
+
# @cached_method caches. This means that multiple calls to load_airbyte_asset_specs
|
|
699
|
+
# will not trigger multiple API calls to fetch the workspace data.
|
|
700
|
+
# Bespoke impl since @cached_method doesn't play nice with iterators; it's exhausted after
|
|
701
|
+
# the first call.
|
|
702
|
+
if hasattr(self, "_initialized"):
|
|
703
|
+
yield getattr(self, "_initialized")
|
|
704
|
+
else:
|
|
705
|
+
with self.process_config_and_initialize_cm() as initialized_workspace:
|
|
706
|
+
initialized = initialized_workspace
|
|
707
|
+
setattr(self, "_initialized", initialized)
|
|
708
|
+
yield initialized
|
|
709
|
+
|
|
1320
710
|
|
|
1321
711
|
@beta
|
|
1322
|
-
|
|
1323
|
-
|
|
712
|
+
class AirbyteWorkspace(BaseAirbyteWorkspace):
|
|
713
|
+
"""This resource allows users to programatically interface with the Airbyte REST API to launch
|
|
714
|
+
syncs and monitor their progress for a given Airbyte workspace.
|
|
715
|
+
|
|
716
|
+
**Examples:**
|
|
717
|
+
Using OAuth client credentials:
|
|
718
|
+
|
|
719
|
+
.. code-block:: python
|
|
720
|
+
|
|
721
|
+
import dagster as dg
|
|
722
|
+
from dagster_airbyte import AirbyteWorkspace, build_airbyte_assets_definitions
|
|
723
|
+
|
|
724
|
+
airbyte_workspace = AirbyteWorkspace(
|
|
725
|
+
rest_api_base_url=dg.EnvVar("AIRBYTE_REST_API_BASE_URL"),
|
|
726
|
+
configuration_api_base_url=dg.EnvVar("AIRBYTE_CONFIGURATION_API_BASE_URL"),
|
|
727
|
+
workspace_id=dg.EnvVar("AIRBYTE_WORKSPACE_ID"),
|
|
728
|
+
client_id=dg.EnvVar("AIRBYTE_CLIENT_ID"),
|
|
729
|
+
client_secret=dg.EnvVar("AIRBYTE_CLIENT_SECRET"),
|
|
730
|
+
)
|
|
731
|
+
|
|
732
|
+
all_airbyte_assets = build_airbyte_assets_definitions(workspace=airbyte_workspace)
|
|
733
|
+
|
|
734
|
+
defs = dg.Definitions(
|
|
735
|
+
assets=all_airbyte_assets,
|
|
736
|
+
resources={"airbyte": airbyte_workspace},
|
|
737
|
+
)
|
|
738
|
+
|
|
739
|
+
Using basic authentication:
|
|
740
|
+
|
|
741
|
+
.. code-block:: python
|
|
742
|
+
|
|
743
|
+
import dagster as dg
|
|
744
|
+
from dagster_airbyte import AirbyteWorkspace, build_airbyte_assets_definitions
|
|
745
|
+
|
|
746
|
+
airbyte_workspace = AirbyteWorkspace(
|
|
747
|
+
rest_api_base_url=dg.EnvVar("AIRBYTE_REST_API_BASE_URL"),
|
|
748
|
+
configuration_api_base_url=dg.EnvVar("AIRBYTE_CONFIGURATION_API_BASE_URL"),
|
|
749
|
+
workspace_id=dg.EnvVar("AIRBYTE_WORKSPACE_ID"),
|
|
750
|
+
username=dg.EnvVar("AIRBYTE_USERNAME"),
|
|
751
|
+
password=dg.EnvVar("AIRBYTE_PASSWORD"),
|
|
752
|
+
)
|
|
753
|
+
|
|
754
|
+
all_airbyte_assets = build_airbyte_assets_definitions(workspace=airbyte_workspace)
|
|
755
|
+
|
|
756
|
+
defs = dg.Definitions(
|
|
757
|
+
assets=all_airbyte_assets,
|
|
758
|
+
resources={"airbyte": airbyte_workspace},
|
|
759
|
+
)
|
|
760
|
+
|
|
761
|
+
Using no authentication:
|
|
762
|
+
|
|
763
|
+
.. code-block:: python
|
|
764
|
+
|
|
765
|
+
import dagster as dg
|
|
766
|
+
from dagster_airbyte import AirbyteWorkspace, build_airbyte_assets_definitions
|
|
767
|
+
|
|
768
|
+
airbyte_workspace = AirbyteWorkspace(
|
|
769
|
+
rest_api_base_url=dg.EnvVar("AIRBYTE_REST_API_BASE_URL"),
|
|
770
|
+
configuration_api_base_url=dg.EnvVar("AIRBYTE_CONFIGURATION_API_BASE_URL"),
|
|
771
|
+
workspace_id=dg.EnvVar("AIRBYTE_WORKSPACE_ID"),
|
|
772
|
+
)
|
|
773
|
+
|
|
774
|
+
all_airbyte_assets = build_airbyte_assets_definitions(workspace=airbyte_workspace)
|
|
775
|
+
|
|
776
|
+
defs = dg.Definitions(
|
|
777
|
+
assets=all_airbyte_assets,
|
|
778
|
+
resources={"airbyte": airbyte_workspace},
|
|
779
|
+
)
|
|
780
|
+
"""
|
|
781
|
+
|
|
782
|
+
rest_api_base_url: str = Field(
|
|
783
|
+
...,
|
|
784
|
+
description="The base URL for the Airbyte REST API.",
|
|
785
|
+
examples=[
|
|
786
|
+
"http://localhost:8000/api/public/v1",
|
|
787
|
+
"https://my-airbyte-server.com/api/public/v1",
|
|
788
|
+
"http://airbyte-airbyte-server-svc.airbyte.svc.cluster.local:8001/api/public/v1",
|
|
789
|
+
],
|
|
790
|
+
)
|
|
791
|
+
configuration_api_base_url: str = Field(
|
|
792
|
+
...,
|
|
793
|
+
description="The base URL for the Airbyte Configuration API.",
|
|
794
|
+
examples=[
|
|
795
|
+
"http://localhost:8000/api/v1",
|
|
796
|
+
"https://my-airbyte-server.com/api/v1",
|
|
797
|
+
"http://airbyte-airbyte-server-svc.airbyte.svc.cluster.local:8001/api/v1",
|
|
798
|
+
],
|
|
799
|
+
)
|
|
800
|
+
workspace_id: str = Field(..., description="The Airbyte workspace ID")
|
|
801
|
+
client_id: Optional[str] = Field(default=None, description="The Airbyte client ID.")
|
|
802
|
+
client_secret: Optional[str] = Field(default=None, description="The Airbyte client secret.")
|
|
803
|
+
username: Optional[str] = Field(
|
|
804
|
+
default=None, description="The Airbyte username for authentication."
|
|
805
|
+
)
|
|
806
|
+
password: Optional[str] = Field(
|
|
807
|
+
default=None, description="The Airbyte password for authentication."
|
|
808
|
+
)
|
|
809
|
+
|
|
810
|
+
@cached_method
|
|
811
|
+
def get_client(self) -> AirbyteClient:
|
|
812
|
+
return AirbyteClient(
|
|
813
|
+
rest_api_base_url=self.rest_api_base_url,
|
|
814
|
+
configuration_api_base_url=self.configuration_api_base_url,
|
|
815
|
+
workspace_id=self.workspace_id,
|
|
816
|
+
client_id=self.client_id,
|
|
817
|
+
client_secret=self.client_secret,
|
|
818
|
+
username=self.username,
|
|
819
|
+
password=self.password,
|
|
820
|
+
request_max_retries=self.request_max_retries,
|
|
821
|
+
request_retry_delay=self.request_retry_delay,
|
|
822
|
+
request_timeout=self.request_timeout,
|
|
823
|
+
max_items_per_page=self.max_items_per_page,
|
|
824
|
+
poll_interval=self.poll_interval,
|
|
825
|
+
poll_timeout=self.poll_timeout,
|
|
826
|
+
cancel_on_termination=self.cancel_on_termination,
|
|
827
|
+
poll_previous_running_sync=self.poll_previous_running_sync,
|
|
828
|
+
)
|
|
829
|
+
|
|
830
|
+
|
|
831
|
+
@beta
|
|
832
|
+
class AirbyteCloudWorkspace(BaseAirbyteWorkspace):
|
|
833
|
+
"""This resource allows users to programatically interface with the Airbyte Cloud REST API to launch
|
|
834
|
+
syncs and monitor their progress for a given Airbyte Cloud workspace.
|
|
835
|
+
|
|
836
|
+
**Examples:**
|
|
837
|
+
|
|
838
|
+
.. code-block:: python
|
|
839
|
+
|
|
840
|
+
from dagster_airbyte import AirbyteCloudWorkspace, build_airbyte_assets_definitions
|
|
841
|
+
|
|
842
|
+
import dagster as dg
|
|
843
|
+
|
|
844
|
+
airbyte_workspace = AirbyteCloudWorkspace(
|
|
845
|
+
workspace_id=dg.EnvVar("AIRBYTE_CLOUD_WORKSPACE_ID"),
|
|
846
|
+
client_id=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_ID"),
|
|
847
|
+
client_secret=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_SECRET"),
|
|
848
|
+
)
|
|
849
|
+
|
|
850
|
+
all_airbyte_assets = build_airbyte_assets_definitions(workspace=airbyte_workspace)
|
|
851
|
+
|
|
852
|
+
defs = dg.Definitions(
|
|
853
|
+
assets=all_airbyte_assets,
|
|
854
|
+
resources={"airbyte": airbyte_workspace},
|
|
855
|
+
)
|
|
856
|
+
"""
|
|
857
|
+
|
|
858
|
+
rest_api_base_url: ClassVar[str] = AIRBYTE_CLOUD_REST_API_BASE_URL
|
|
859
|
+
configuration_api_base_url: ClassVar[str] = AIRBYTE_CLOUD_CONFIGURATION_API_BASE_URL
|
|
860
|
+
workspace_id: str = Field(..., description="The Airbyte workspace ID")
|
|
861
|
+
client_id: str = Field(..., description="The Airbyte client ID.")
|
|
862
|
+
client_secret: str = Field(..., description="The Airbyte client secret.")
|
|
863
|
+
|
|
864
|
+
@cached_method
|
|
865
|
+
def get_client(self) -> AirbyteClient:
|
|
866
|
+
return AirbyteClient(
|
|
867
|
+
rest_api_base_url=self.rest_api_base_url,
|
|
868
|
+
configuration_api_base_url=self.configuration_api_base_url,
|
|
869
|
+
workspace_id=self.workspace_id,
|
|
870
|
+
client_id=self.client_id,
|
|
871
|
+
client_secret=self.client_secret,
|
|
872
|
+
request_max_retries=self.request_max_retries,
|
|
873
|
+
request_retry_delay=self.request_retry_delay,
|
|
874
|
+
request_timeout=self.request_timeout,
|
|
875
|
+
max_items_per_page=self.max_items_per_page,
|
|
876
|
+
poll_interval=self.poll_interval,
|
|
877
|
+
poll_timeout=self.poll_timeout,
|
|
878
|
+
cancel_on_termination=self.cancel_on_termination,
|
|
879
|
+
poll_previous_running_sync=self.poll_previous_running_sync,
|
|
880
|
+
)
|
|
881
|
+
|
|
882
|
+
|
|
883
|
+
@public
|
|
884
|
+
@beta
|
|
885
|
+
def load_airbyte_asset_specs(
|
|
886
|
+
workspace: BaseAirbyteWorkspace,
|
|
1324
887
|
dagster_airbyte_translator: Optional[DagsterAirbyteTranslator] = None,
|
|
888
|
+
connection_selector_fn: Optional[Callable[[AirbyteConnection], bool]] = None,
|
|
1325
889
|
) -> Sequence[AssetSpec]:
|
|
1326
890
|
"""Returns a list of AssetSpecs representing the Airbyte content in the workspace.
|
|
1327
891
|
|
|
1328
892
|
Args:
|
|
1329
|
-
workspace (
|
|
893
|
+
workspace (BaseAirbyteWorkspace): The Airbyte workspace to fetch assets from.
|
|
1330
894
|
dagster_airbyte_translator (Optional[DagsterAirbyteTranslator], optional): The translator to use
|
|
1331
895
|
to convert Airbyte content into :py:class:`dagster.AssetSpec`.
|
|
1332
896
|
Defaults to :py:class:`DagsterAirbyteTranslator`.
|
|
897
|
+
connection_selector_fn (Optional[Callable[[AirbyteConnection], bool]]): A function that allows for filtering
|
|
898
|
+
the Airbyte connections for which assets are created.
|
|
1333
899
|
|
|
1334
900
|
Returns:
|
|
1335
901
|
List[AssetSpec]: The set of assets representing the Airbyte content in the workspace.
|
|
1336
902
|
|
|
1337
903
|
Examples:
|
|
1338
|
-
Loading the asset specs for a given Airbyte
|
|
904
|
+
Loading the asset specs for a given Airbyte workspace:
|
|
1339
905
|
|
|
1340
906
|
.. code-block:: python
|
|
1341
907
|
|
|
1342
|
-
from dagster_airbyte import
|
|
908
|
+
from dagster_airbyte import AirbyteWorkspace, load_airbyte_asset_specs
|
|
1343
909
|
|
|
1344
910
|
import dagster as dg
|
|
1345
911
|
|
|
1346
|
-
|
|
1347
|
-
workspace_id=dg.EnvVar("
|
|
1348
|
-
client_id=dg.EnvVar("
|
|
1349
|
-
client_secret=dg.EnvVar("
|
|
912
|
+
airbyte_workspace = AirbyteWorkspace(
|
|
913
|
+
workspace_id=dg.EnvVar("AIRBYTE_WORKSPACE_ID"),
|
|
914
|
+
client_id=dg.EnvVar("AIRBYTE_CLIENT_ID"),
|
|
915
|
+
client_secret=dg.EnvVar("AIRBYTE_CLIENT_SECRET"),
|
|
1350
916
|
)
|
|
1351
917
|
|
|
918
|
+
airbyte_specs = load_airbyte_asset_specs(airbyte_workspace)
|
|
919
|
+
dg.Definitions(assets=airbyte_specs)
|
|
1352
920
|
|
|
1353
|
-
|
|
1354
|
-
|
|
921
|
+
Filter connections by name:
|
|
922
|
+
|
|
923
|
+
.. code-block:: python
|
|
924
|
+
|
|
925
|
+
from dagster_airbyte import AirbyteWorkspace, load_airbyte_asset_specs
|
|
926
|
+
|
|
927
|
+
import dagster as dg
|
|
928
|
+
|
|
929
|
+
airbyte_workspace = AirbyteWorkspace(
|
|
930
|
+
workspace_id=dg.EnvVar("AIRBYTE_WORKSPACE_ID"),
|
|
931
|
+
client_id=dg.EnvVar("AIRBYTE_CLIENT_ID"),
|
|
932
|
+
client_secret=dg.EnvVar("AIRBYTE_CLIENT_SECRET"),
|
|
933
|
+
)
|
|
934
|
+
|
|
935
|
+
airbyte_specs = load_airbyte_asset_specs(
|
|
936
|
+
workspace=airbyte_workspace,
|
|
937
|
+
connection_selector_fn=lambda connection: connection.name in ["connection1", "connection2"]
|
|
938
|
+
)
|
|
939
|
+
dg.Definitions(assets=airbyte_specs)
|
|
1355
940
|
"""
|
|
1356
941
|
dagster_airbyte_translator = dagster_airbyte_translator or DagsterAirbyteTranslator()
|
|
1357
942
|
|
|
1358
|
-
with workspace.
|
|
943
|
+
with workspace.process_config_and_initialize_cm_cached() as initialized_workspace:
|
|
1359
944
|
return [
|
|
1360
945
|
spec.merge_attributes(
|
|
1361
946
|
metadata={DAGSTER_AIRBYTE_TRANSLATOR_METADATA_KEY: dagster_airbyte_translator}
|
|
1362
947
|
)
|
|
1363
948
|
for spec in check.is_list(
|
|
1364
|
-
|
|
949
|
+
AirbyteWorkspaceDefsLoader(
|
|
1365
950
|
workspace=initialized_workspace,
|
|
1366
951
|
translator=dagster_airbyte_translator,
|
|
952
|
+
connection_selector_fn=connection_selector_fn,
|
|
1367
953
|
)
|
|
1368
954
|
.build_defs()
|
|
1369
955
|
.assets,
|
|
@@ -1372,22 +958,90 @@ def load_airbyte_cloud_asset_specs(
|
|
|
1372
958
|
]
|
|
1373
959
|
|
|
1374
960
|
|
|
961
|
+
@public
|
|
962
|
+
@superseded(additional_warn_text="Use load_airbyte_asset_specs instead.")
|
|
963
|
+
def load_airbyte_cloud_asset_specs(
|
|
964
|
+
workspace: AirbyteCloudWorkspace,
|
|
965
|
+
dagster_airbyte_translator: Optional[DagsterAirbyteTranslator] = None,
|
|
966
|
+
connection_selector_fn: Optional[Callable[[AirbyteConnection], bool]] = None,
|
|
967
|
+
) -> Sequence[AssetSpec]:
|
|
968
|
+
"""Returns a list of AssetSpecs representing the Airbyte content in the workspace.
|
|
969
|
+
|
|
970
|
+
Args:
|
|
971
|
+
workspace (AirbyteCloudWorkspace): The Airbyte Cloud workspace to fetch assets from.
|
|
972
|
+
dagster_airbyte_translator (Optional[DagsterAirbyteTranslator], optional): The translator to use
|
|
973
|
+
to convert Airbyte content into :py:class:`dagster.AssetSpec`.
|
|
974
|
+
Defaults to :py:class:`DagsterAirbyteTranslator`.
|
|
975
|
+
connection_selector_fn (Optional[Callable[[AirbyteConnection], bool]]): A function that allows for filtering
|
|
976
|
+
the Airbyte connections for which assets are created.
|
|
977
|
+
|
|
978
|
+
Returns:
|
|
979
|
+
List[AssetSpec]: The set of assets representing the Airbyte content in the workspace.
|
|
980
|
+
|
|
981
|
+
Examples:
|
|
982
|
+
Loading the asset specs for a given Airbyte Cloud workspace:
|
|
983
|
+
|
|
984
|
+
.. code-block:: python
|
|
985
|
+
|
|
986
|
+
from dagster_airbyte import AirbyteCloudWorkspace, load_airbyte_cloud_asset_specs
|
|
987
|
+
|
|
988
|
+
import dagster as dg
|
|
989
|
+
|
|
990
|
+
airbyte_cloud_workspace = AirbyteCloudWorkspace(
|
|
991
|
+
workspace_id=dg.EnvVar("AIRBYTE_CLOUD_WORKSPACE_ID"),
|
|
992
|
+
client_id=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_ID"),
|
|
993
|
+
client_secret=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_SECRET"),
|
|
994
|
+
)
|
|
995
|
+
|
|
996
|
+
airbyte_cloud_specs = load_airbyte_cloud_asset_specs(airbyte_cloud_workspace)
|
|
997
|
+
dg.Definitions(assets=airbyte_cloud_specs)
|
|
998
|
+
|
|
999
|
+
Filter connections by name:
|
|
1000
|
+
|
|
1001
|
+
.. code-block:: python
|
|
1002
|
+
|
|
1003
|
+
from dagster_airbyte import AirbyteCloudWorkspace, load_airbyte_cloud_asset_specs
|
|
1004
|
+
|
|
1005
|
+
import dagster as dg
|
|
1006
|
+
|
|
1007
|
+
airbyte_cloud_workspace = AirbyteCloudWorkspace(
|
|
1008
|
+
workspace_id=dg.EnvVar("AIRBYTE_CLOUD_WORKSPACE_ID"),
|
|
1009
|
+
client_id=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_ID"),
|
|
1010
|
+
client_secret=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_SECRET"),
|
|
1011
|
+
)
|
|
1012
|
+
|
|
1013
|
+
airbyte_cloud_specs = load_airbyte_cloud_asset_specs(
|
|
1014
|
+
workspace=airbyte_cloud_workspace,
|
|
1015
|
+
connection_selector_fn=lambda connection: connection.name in ["connection1", "connection2"]
|
|
1016
|
+
)
|
|
1017
|
+
dg.Definitions(assets=airbyte_cloud_specs)
|
|
1018
|
+
"""
|
|
1019
|
+
return load_airbyte_asset_specs(
|
|
1020
|
+
workspace=workspace,
|
|
1021
|
+
dagster_airbyte_translator=dagster_airbyte_translator,
|
|
1022
|
+
connection_selector_fn=connection_selector_fn,
|
|
1023
|
+
)
|
|
1024
|
+
|
|
1025
|
+
|
|
1375
1026
|
@record
|
|
1376
|
-
class
|
|
1377
|
-
workspace: AirbyteCloudWorkspace
|
|
1027
|
+
class AirbyteWorkspaceDefsLoader(StateBackedDefinitionsLoader[AirbyteWorkspaceData]):
|
|
1028
|
+
workspace: Union[AirbyteWorkspace, AirbyteCloudWorkspace]
|
|
1378
1029
|
translator: DagsterAirbyteTranslator
|
|
1030
|
+
connection_selector_fn: Optional[Callable[[AirbyteConnection], bool]]
|
|
1379
1031
|
|
|
1380
1032
|
@property
|
|
1381
1033
|
def defs_key(self) -> str:
|
|
1382
|
-
return f"{AIRBYTE_RECONSTRUCTION_METADATA_KEY_PREFIX}
|
|
1034
|
+
return f"{AIRBYTE_RECONSTRUCTION_METADATA_KEY_PREFIX}.{self.workspace.workspace_id}"
|
|
1383
1035
|
|
|
1384
|
-
def fetch_state(self) -> AirbyteWorkspaceData:
|
|
1036
|
+
def fetch_state(self) -> AirbyteWorkspaceData:
|
|
1385
1037
|
return self.workspace.fetch_airbyte_workspace_data()
|
|
1386
1038
|
|
|
1387
|
-
def defs_from_state(self, state: AirbyteWorkspaceData) -> Definitions:
|
|
1039
|
+
def defs_from_state(self, state: AirbyteWorkspaceData) -> Definitions:
|
|
1388
1040
|
all_asset_specs = [
|
|
1389
1041
|
self.translator.get_asset_spec(props)
|
|
1390
1042
|
for props in state.to_airbyte_connection_table_props_data()
|
|
1043
|
+
if not self.connection_selector_fn
|
|
1044
|
+
or self.connection_selector_fn(state.connections_by_id[props.connection_id])
|
|
1391
1045
|
]
|
|
1392
1046
|
|
|
1393
1047
|
return Definitions(assets=all_asset_specs)
|