dagster-airbyte 0.23.7__py3-none-any.whl → 0.25.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dagster-airbyte might be problematic. Click here for more details.
- dagster_airbyte/__init__.py +16 -8
- dagster_airbyte/asset_decorator.py +113 -0
- dagster_airbyte/asset_defs.py +261 -170
- dagster_airbyte/managed/__init__.py +2 -2
- dagster_airbyte/managed/generated/__init__.py +1 -1
- dagster_airbyte/managed/generated/destinations.py +3 -3
- dagster_airbyte/managed/generated/sources.py +46 -46
- dagster_airbyte/managed/reconciliation.py +22 -34
- dagster_airbyte/managed/types.py +11 -10
- dagster_airbyte/ops.py +6 -5
- dagster_airbyte/py.typed +1 -0
- dagster_airbyte/resources.py +705 -45
- dagster_airbyte/translator.py +236 -0
- dagster_airbyte/types.py +7 -2
- dagster_airbyte/utils.py +38 -2
- dagster_airbyte/version.py +1 -1
- {dagster_airbyte-0.23.7.dist-info → dagster_airbyte-0.25.10.dist-info}/METADATA +5 -5
- dagster_airbyte-0.25.10.dist-info/RECORD +23 -0
- dagster_airbyte-0.23.7.dist-info/RECORD +0 -20
- {dagster_airbyte-0.23.7.dist-info → dagster_airbyte-0.25.10.dist-info}/LICENSE +0 -0
- {dagster_airbyte-0.23.7.dist-info → dagster_airbyte-0.25.10.dist-info}/WHEEL +0 -0
- {dagster_airbyte-0.23.7.dist-info → dagster_airbyte-0.25.10.dist-info}/entry_points.txt +0 -0
- {dagster_airbyte-0.23.7.dist-info → dagster_airbyte-0.25.10.dist-info}/top_level.txt +0 -0
dagster_airbyte/resources.py
CHANGED
|
@@ -4,42 +4,71 @@ import logging
|
|
|
4
4
|
import sys
|
|
5
5
|
import time
|
|
6
6
|
from abc import abstractmethod
|
|
7
|
+
from collections.abc import Mapping, Sequence
|
|
7
8
|
from contextlib import contextmanager
|
|
8
|
-
from
|
|
9
|
+
from datetime import datetime, timedelta
|
|
10
|
+
from typing import Any, Optional, cast
|
|
9
11
|
|
|
10
12
|
import requests
|
|
11
13
|
from dagster import (
|
|
14
|
+
AssetExecutionContext,
|
|
15
|
+
AssetMaterialization,
|
|
12
16
|
ConfigurableResource,
|
|
17
|
+
Definitions,
|
|
13
18
|
Failure,
|
|
19
|
+
InitResourceContext,
|
|
20
|
+
MaterializeResult,
|
|
14
21
|
_check as check,
|
|
15
22
|
get_dagster_logger,
|
|
16
23
|
resource,
|
|
17
24
|
)
|
|
25
|
+
from dagster._annotations import experimental, public
|
|
18
26
|
from dagster._config.pythonic_config import infer_schema_from_config_class
|
|
27
|
+
from dagster._core.definitions.asset_spec import AssetSpec
|
|
28
|
+
from dagster._core.definitions.definitions_load_context import StateBackedDefinitionsLoader
|
|
19
29
|
from dagster._core.definitions.resource_definition import dagster_maintained_resource
|
|
30
|
+
from dagster._model import DagsterModel
|
|
31
|
+
from dagster._record import record
|
|
20
32
|
from dagster._utils.cached_method import cached_method
|
|
21
33
|
from dagster._utils.merger import deep_merge_dicts
|
|
22
|
-
from pydantic import Field
|
|
34
|
+
from pydantic import Field, PrivateAttr
|
|
23
35
|
from requests.exceptions import RequestException
|
|
24
36
|
|
|
37
|
+
from dagster_airbyte.translator import (
|
|
38
|
+
AirbyteConnection,
|
|
39
|
+
AirbyteConnectionTableProps,
|
|
40
|
+
AirbyteDestination,
|
|
41
|
+
AirbyteJob,
|
|
42
|
+
AirbyteJobStatusType,
|
|
43
|
+
AirbyteMetadataSet,
|
|
44
|
+
AirbyteWorkspaceData,
|
|
45
|
+
DagsterAirbyteTranslator,
|
|
46
|
+
)
|
|
25
47
|
from dagster_airbyte.types import AirbyteOutput
|
|
48
|
+
from dagster_airbyte.utils import (
|
|
49
|
+
DAGSTER_AIRBYTE_TRANSLATOR_METADATA_KEY,
|
|
50
|
+
get_airbyte_connection_table_name,
|
|
51
|
+
get_translator_from_airbyte_assets,
|
|
52
|
+
)
|
|
53
|
+
|
|
54
|
+
AIRBYTE_REST_API_BASE = "https://api.airbyte.com"
|
|
55
|
+
AIRBYTE_REST_API_VERSION = "v1"
|
|
56
|
+
|
|
57
|
+
AIRBYTE_CONFIGURATION_API_BASE = "https://cloud.airbyte.com/api"
|
|
58
|
+
AIRBYTE_CONFIGURATION_API_VERSION = "v1"
|
|
26
59
|
|
|
27
60
|
DEFAULT_POLL_INTERVAL_SECONDS = 10
|
|
28
61
|
|
|
62
|
+
# The access token expires every 3 minutes in Airbyte Cloud.
|
|
63
|
+
# Refresh is needed after 2.5 minutes to avoid the "token expired" error message.
|
|
64
|
+
AIRBYTE_CLOUD_REFRESH_TIMEDELTA_SECONDS = 150
|
|
29
65
|
|
|
30
|
-
|
|
31
|
-
RUNNING = "running"
|
|
32
|
-
SUCCEEDED = "succeeded"
|
|
33
|
-
CANCELLED = "cancelled"
|
|
34
|
-
PENDING = "pending"
|
|
35
|
-
FAILED = "failed"
|
|
36
|
-
ERROR = "error"
|
|
37
|
-
INCOMPLETE = "incomplete"
|
|
66
|
+
AIRBYTE_RECONSTRUCTION_METADATA_KEY_PREFIX = "dagster-airbyte/reconstruction_metadata"
|
|
38
67
|
|
|
39
68
|
|
|
40
69
|
class AirbyteResourceState:
|
|
41
70
|
def __init__(self) -> None:
|
|
42
|
-
self.request_cache:
|
|
71
|
+
self.request_cache: dict[str, Optional[Mapping[str, object]]] = {}
|
|
43
72
|
# Int in case we nest contexts
|
|
44
73
|
self.cache_enabled = 0
|
|
45
74
|
|
|
@@ -94,7 +123,11 @@ class BaseAirbyteResource(ConfigurableResource):
|
|
|
94
123
|
raise NotImplementedError()
|
|
95
124
|
|
|
96
125
|
def make_request(
|
|
97
|
-
self,
|
|
126
|
+
self,
|
|
127
|
+
endpoint: str,
|
|
128
|
+
data: Optional[Mapping[str, object]] = None,
|
|
129
|
+
method: str = "POST",
|
|
130
|
+
include_additional_request_params: bool = True,
|
|
98
131
|
) -> Optional[Mapping[str, object]]:
|
|
99
132
|
"""Creates and sends a request to the desired Airbyte REST API endpoint.
|
|
100
133
|
|
|
@@ -111,7 +144,7 @@ class BaseAirbyteResource(ConfigurableResource):
|
|
|
111
144
|
num_retries = 0
|
|
112
145
|
while True:
|
|
113
146
|
try:
|
|
114
|
-
request_args:
|
|
147
|
+
request_args: dict[str, Any] = dict(
|
|
115
148
|
method=method,
|
|
116
149
|
url=url,
|
|
117
150
|
headers=headers,
|
|
@@ -120,10 +153,11 @@ class BaseAirbyteResource(ConfigurableResource):
|
|
|
120
153
|
if data:
|
|
121
154
|
request_args["json"] = data
|
|
122
155
|
|
|
123
|
-
|
|
124
|
-
request_args
|
|
125
|
-
|
|
126
|
-
|
|
156
|
+
if include_additional_request_params:
|
|
157
|
+
request_args = deep_merge_dicts(
|
|
158
|
+
request_args,
|
|
159
|
+
self.all_additional_request_params,
|
|
160
|
+
)
|
|
127
161
|
|
|
128
162
|
response = requests.request(
|
|
129
163
|
**request_args,
|
|
@@ -183,7 +217,7 @@ class BaseAirbyteResource(ConfigurableResource):
|
|
|
183
217
|
"""
|
|
184
218
|
connection_details = self.get_connection_details(connection_id)
|
|
185
219
|
job_details = self.start_sync(connection_id)
|
|
186
|
-
job_info = cast(
|
|
220
|
+
job_info = cast(dict[str, object], job_details.get("job", {}))
|
|
187
221
|
job_id = cast(int, job_info.get("id"))
|
|
188
222
|
|
|
189
223
|
self._log.info(f"Job {job_id} initialized for connection_id={connection_id}.")
|
|
@@ -201,7 +235,7 @@ class BaseAirbyteResource(ConfigurableResource):
|
|
|
201
235
|
)
|
|
202
236
|
time.sleep(poll_interval or self.poll_interval)
|
|
203
237
|
job_details = self.get_job_status(connection_id, job_id)
|
|
204
|
-
attempts = cast(
|
|
238
|
+
attempts = cast(list, job_details.get("attempts", []))
|
|
205
239
|
cur_attempt = len(attempts)
|
|
206
240
|
# spit out the available Airbyte log info
|
|
207
241
|
if cur_attempt:
|
|
@@ -218,16 +252,20 @@ class BaseAirbyteResource(ConfigurableResource):
|
|
|
218
252
|
logged_lines = 0
|
|
219
253
|
logged_attempts += 1
|
|
220
254
|
|
|
221
|
-
job_info = cast(
|
|
255
|
+
job_info = cast(dict[str, object], job_details.get("job", {}))
|
|
222
256
|
state = job_info.get("status")
|
|
223
257
|
|
|
224
|
-
if state in (
|
|
258
|
+
if state in (
|
|
259
|
+
AirbyteJobStatusType.RUNNING,
|
|
260
|
+
AirbyteJobStatusType.PENDING,
|
|
261
|
+
AirbyteJobStatusType.INCOMPLETE,
|
|
262
|
+
):
|
|
225
263
|
continue
|
|
226
|
-
elif state ==
|
|
264
|
+
elif state == AirbyteJobStatusType.SUCCEEDED:
|
|
227
265
|
break
|
|
228
|
-
elif state ==
|
|
266
|
+
elif state == AirbyteJobStatusType.ERROR:
|
|
229
267
|
raise Failure(f"Job failed: {job_id}")
|
|
230
|
-
elif state ==
|
|
268
|
+
elif state == AirbyteJobStatusType.CANCELLED:
|
|
231
269
|
raise Failure(f"Job was cancelled: {job_id}")
|
|
232
270
|
else:
|
|
233
271
|
raise Failure(f"Encountered unexpected state `{state}` for job_id {job_id}")
|
|
@@ -235,7 +273,12 @@ class BaseAirbyteResource(ConfigurableResource):
|
|
|
235
273
|
# if Airbyte sync has not completed, make sure to cancel it so that it doesn't outlive
|
|
236
274
|
# the python process
|
|
237
275
|
if (
|
|
238
|
-
state
|
|
276
|
+
state
|
|
277
|
+
not in (
|
|
278
|
+
AirbyteJobStatusType.SUCCEEDED,
|
|
279
|
+
AirbyteJobStatusType.ERROR,
|
|
280
|
+
AirbyteJobStatusType.CANCELLED,
|
|
281
|
+
)
|
|
239
282
|
and self.cancel_sync_on_run_termination
|
|
240
283
|
):
|
|
241
284
|
self.cancel_job(job_id)
|
|
@@ -244,7 +287,7 @@ class BaseAirbyteResource(ConfigurableResource):
|
|
|
244
287
|
|
|
245
288
|
|
|
246
289
|
class AirbyteCloudResource(BaseAirbyteResource):
|
|
247
|
-
"""This resource allows users to
|
|
290
|
+
"""This resource allows users to programmatically interface with the Airbyte Cloud API to launch
|
|
248
291
|
syncs and monitor their progress.
|
|
249
292
|
|
|
250
293
|
**Examples:**
|
|
@@ -255,7 +298,8 @@ class AirbyteCloudResource(BaseAirbyteResource):
|
|
|
255
298
|
from dagster_airbyte import AirbyteResource
|
|
256
299
|
|
|
257
300
|
my_airbyte_resource = AirbyteCloudResource(
|
|
258
|
-
|
|
301
|
+
client_id=EnvVar("AIRBYTE_CLIENT_ID"),
|
|
302
|
+
client_secret=EnvVar("AIRBYTE_CLIENT_SECRET"),
|
|
259
303
|
)
|
|
260
304
|
|
|
261
305
|
airbyte_assets = build_airbyte_assets(
|
|
@@ -269,7 +313,15 @@ class AirbyteCloudResource(BaseAirbyteResource):
|
|
|
269
313
|
)
|
|
270
314
|
"""
|
|
271
315
|
|
|
272
|
-
|
|
316
|
+
client_id: str = Field(..., description="The Airbyte Cloud client ID.")
|
|
317
|
+
client_secret: str = Field(..., description="The Airbyte Cloud client secret.")
|
|
318
|
+
|
|
319
|
+
_access_token_value: Optional[str] = PrivateAttr(default=None)
|
|
320
|
+
_access_token_timestamp: Optional[float] = PrivateAttr(default=None)
|
|
321
|
+
|
|
322
|
+
def setup_for_execution(self, context: InitResourceContext) -> None:
|
|
323
|
+
# Refresh access token when the resource is initialized
|
|
324
|
+
self._refresh_access_token()
|
|
273
325
|
|
|
274
326
|
@property
|
|
275
327
|
def api_base_url(self) -> str:
|
|
@@ -277,7 +329,32 @@ class AirbyteCloudResource(BaseAirbyteResource):
|
|
|
277
329
|
|
|
278
330
|
@property
|
|
279
331
|
def all_additional_request_params(self) -> Mapping[str, Any]:
|
|
280
|
-
|
|
332
|
+
# Make sure the access token is refreshed before using it when calling the API.
|
|
333
|
+
if self._needs_refreshed_access_token():
|
|
334
|
+
self._refresh_access_token()
|
|
335
|
+
return {
|
|
336
|
+
"headers": {
|
|
337
|
+
"Authorization": f"Bearer {self._access_token_value}",
|
|
338
|
+
"User-Agent": "dagster",
|
|
339
|
+
}
|
|
340
|
+
}
|
|
341
|
+
|
|
342
|
+
def make_request(
|
|
343
|
+
self,
|
|
344
|
+
endpoint: str,
|
|
345
|
+
data: Optional[Mapping[str, object]] = None,
|
|
346
|
+
method: str = "POST",
|
|
347
|
+
include_additional_request_params: bool = True,
|
|
348
|
+
) -> Optional[Mapping[str, object]]:
|
|
349
|
+
# Make sure the access token is refreshed before using it when calling the API.
|
|
350
|
+
if include_additional_request_params and self._needs_refreshed_access_token():
|
|
351
|
+
self._refresh_access_token()
|
|
352
|
+
return super().make_request(
|
|
353
|
+
endpoint=endpoint,
|
|
354
|
+
data=data,
|
|
355
|
+
method=method,
|
|
356
|
+
include_additional_request_params=include_additional_request_params,
|
|
357
|
+
)
|
|
281
358
|
|
|
282
359
|
def start_sync(self, connection_id: str) -> Mapping[str, object]:
|
|
283
360
|
job_sync = check.not_none(
|
|
@@ -306,6 +383,31 @@ class AirbyteCloudResource(BaseAirbyteResource):
|
|
|
306
383
|
# Airbyte Cloud does not support streaming logs yet
|
|
307
384
|
return False
|
|
308
385
|
|
|
386
|
+
def _refresh_access_token(self) -> None:
|
|
387
|
+
response = check.not_none(
|
|
388
|
+
self.make_request(
|
|
389
|
+
endpoint="/applications/token",
|
|
390
|
+
data={
|
|
391
|
+
"client_id": self.client_id,
|
|
392
|
+
"client_secret": self.client_secret,
|
|
393
|
+
},
|
|
394
|
+
# Must not pass the bearer access token when refreshing it.
|
|
395
|
+
include_additional_request_params=False,
|
|
396
|
+
)
|
|
397
|
+
)
|
|
398
|
+
self._access_token_value = str(response["access_token"])
|
|
399
|
+
self._access_token_timestamp = datetime.now().timestamp()
|
|
400
|
+
|
|
401
|
+
def _needs_refreshed_access_token(self) -> bool:
|
|
402
|
+
return (
|
|
403
|
+
not self._access_token_value
|
|
404
|
+
or not self._access_token_timestamp
|
|
405
|
+
or self._access_token_timestamp
|
|
406
|
+
<= datetime.timestamp(
|
|
407
|
+
datetime.now() - timedelta(seconds=AIRBYTE_CLOUD_REFRESH_TIMEDELTA_SECONDS)
|
|
408
|
+
)
|
|
409
|
+
)
|
|
410
|
+
|
|
309
411
|
|
|
310
412
|
class AirbyteResource(BaseAirbyteResource):
|
|
311
413
|
"""This resource allows users to programmatically interface with the Airbyte REST API to launch
|
|
@@ -469,7 +571,7 @@ class AirbyteResource(BaseAirbyteResource):
|
|
|
469
571
|
|
|
470
572
|
def get_default_workspace(self) -> str:
|
|
471
573
|
workspaces = cast(
|
|
472
|
-
|
|
574
|
+
list[dict[str, Any]],
|
|
473
575
|
check.not_none(self.make_request_cached(endpoint="/workspaces/list", data={})).get(
|
|
474
576
|
"workspaces", []
|
|
475
577
|
),
|
|
@@ -481,7 +583,7 @@ class AirbyteResource(BaseAirbyteResource):
|
|
|
481
583
|
definitions = check.not_none(
|
|
482
584
|
self.make_request_cached(endpoint="/source_definitions/list", data={})
|
|
483
585
|
)
|
|
484
|
-
source_definitions = cast(
|
|
586
|
+
source_definitions = cast(list[dict[str, Any]], definitions["sourceDefinitions"])
|
|
485
587
|
|
|
486
588
|
return next(
|
|
487
589
|
(
|
|
@@ -495,7 +597,7 @@ class AirbyteResource(BaseAirbyteResource):
|
|
|
495
597
|
def get_destination_definition_by_name(self, name: str):
|
|
496
598
|
name_lower = name.lower()
|
|
497
599
|
definitions = cast(
|
|
498
|
-
|
|
600
|
+
dict[str, list[dict[str, str]]],
|
|
499
601
|
check.not_none(
|
|
500
602
|
self.make_request_cached(endpoint="/destination_definitions/list", data={})
|
|
501
603
|
),
|
|
@@ -511,7 +613,7 @@ class AirbyteResource(BaseAirbyteResource):
|
|
|
511
613
|
|
|
512
614
|
def get_source_catalog_id(self, source_id: str):
|
|
513
615
|
result = cast(
|
|
514
|
-
|
|
616
|
+
dict[str, Any],
|
|
515
617
|
check.not_none(
|
|
516
618
|
self.make_request(endpoint="/sources/discover_schema", data={"sourceId": source_id})
|
|
517
619
|
),
|
|
@@ -520,7 +622,7 @@ class AirbyteResource(BaseAirbyteResource):
|
|
|
520
622
|
|
|
521
623
|
def get_source_schema(self, source_id: str) -> Mapping[str, Any]:
|
|
522
624
|
return cast(
|
|
523
|
-
|
|
625
|
+
dict[str, Any],
|
|
524
626
|
check.not_none(
|
|
525
627
|
self.make_request(endpoint="/sources/discover_schema", data={"sourceId": source_id})
|
|
526
628
|
),
|
|
@@ -532,7 +634,7 @@ class AirbyteResource(BaseAirbyteResource):
|
|
|
532
634
|
# Airbyte API changed source of truth for normalization in PR
|
|
533
635
|
# https://github.com/airbytehq/airbyte/pull/21005
|
|
534
636
|
norm_dest_def_spec: bool = cast(
|
|
535
|
-
|
|
637
|
+
dict[str, Any],
|
|
536
638
|
check.not_none(
|
|
537
639
|
self.make_request_cached(
|
|
538
640
|
endpoint="/destination_definition_specifications/get",
|
|
@@ -546,7 +648,7 @@ class AirbyteResource(BaseAirbyteResource):
|
|
|
546
648
|
|
|
547
649
|
norm_dest_def: bool = (
|
|
548
650
|
cast(
|
|
549
|
-
|
|
651
|
+
dict[str, Any],
|
|
550
652
|
check.not_none(
|
|
551
653
|
self.make_request_cached(
|
|
552
654
|
endpoint="/destination_definitions/get",
|
|
@@ -579,7 +681,7 @@ class AirbyteResource(BaseAirbyteResource):
|
|
|
579
681
|
},
|
|
580
682
|
)
|
|
581
683
|
)
|
|
582
|
-
job = next((job for job in cast(
|
|
684
|
+
job = next((job for job in cast(list, out["jobs"]) if job["job"]["id"] == job_id), None)
|
|
583
685
|
|
|
584
686
|
return check.not_none(job)
|
|
585
687
|
|
|
@@ -614,7 +716,7 @@ class AirbyteResource(BaseAirbyteResource):
|
|
|
614
716
|
"""
|
|
615
717
|
connection_details = self.get_connection_details(connection_id)
|
|
616
718
|
job_details = self.start_sync(connection_id)
|
|
617
|
-
job_info = cast(
|
|
719
|
+
job_info = cast(dict[str, object], job_details.get("job", {}))
|
|
618
720
|
job_id = cast(int, job_info.get("id"))
|
|
619
721
|
|
|
620
722
|
self._log.info(f"Job {job_id} initialized for connection_id={connection_id}.")
|
|
@@ -632,7 +734,7 @@ class AirbyteResource(BaseAirbyteResource):
|
|
|
632
734
|
)
|
|
633
735
|
time.sleep(poll_interval or self.poll_interval)
|
|
634
736
|
job_details = self.get_job_status(connection_id, job_id)
|
|
635
|
-
attempts = cast(
|
|
737
|
+
attempts = cast(list, job_details.get("attempts", []))
|
|
636
738
|
cur_attempt = len(attempts)
|
|
637
739
|
# spit out the available Airbyte log info
|
|
638
740
|
if cur_attempt:
|
|
@@ -649,16 +751,20 @@ class AirbyteResource(BaseAirbyteResource):
|
|
|
649
751
|
logged_lines = 0
|
|
650
752
|
logged_attempts += 1
|
|
651
753
|
|
|
652
|
-
job_info = cast(
|
|
754
|
+
job_info = cast(dict[str, object], job_details.get("job", {}))
|
|
653
755
|
state = job_info.get("status")
|
|
654
756
|
|
|
655
|
-
if state in (
|
|
757
|
+
if state in (
|
|
758
|
+
AirbyteJobStatusType.RUNNING,
|
|
759
|
+
AirbyteJobStatusType.PENDING,
|
|
760
|
+
AirbyteJobStatusType.INCOMPLETE,
|
|
761
|
+
):
|
|
656
762
|
continue
|
|
657
|
-
elif state ==
|
|
763
|
+
elif state == AirbyteJobStatusType.SUCCEEDED:
|
|
658
764
|
break
|
|
659
|
-
elif state ==
|
|
765
|
+
elif state == AirbyteJobStatusType.ERROR:
|
|
660
766
|
raise Failure(f"Job failed: {job_id}")
|
|
661
|
-
elif state ==
|
|
767
|
+
elif state == AirbyteJobStatusType.CANCELLED:
|
|
662
768
|
raise Failure(f"Job was cancelled: {job_id}")
|
|
663
769
|
else:
|
|
664
770
|
raise Failure(f"Encountered unexpected state `{state}` for job_id {job_id}")
|
|
@@ -666,7 +772,12 @@ class AirbyteResource(BaseAirbyteResource):
|
|
|
666
772
|
# if Airbyte sync has not completed, make sure to cancel it so that it doesn't outlive
|
|
667
773
|
# the python process
|
|
668
774
|
if (
|
|
669
|
-
state
|
|
775
|
+
state
|
|
776
|
+
not in (
|
|
777
|
+
AirbyteJobStatusType.SUCCEEDED,
|
|
778
|
+
AirbyteJobStatusType.ERROR,
|
|
779
|
+
AirbyteJobStatusType.CANCELLED,
|
|
780
|
+
)
|
|
670
781
|
and self.cancel_sync_on_run_termination
|
|
671
782
|
):
|
|
672
783
|
self.cancel_job(job_id)
|
|
@@ -721,3 +832,552 @@ def airbyte_cloud_resource(context) -> AirbyteCloudResource:
|
|
|
721
832
|
|
|
722
833
|
"""
|
|
723
834
|
return AirbyteCloudResource.from_resource_context(context)
|
|
835
|
+
|
|
836
|
+
|
|
837
|
+
# -------------
|
|
838
|
+
# Resources v2
|
|
839
|
+
# -------------
|
|
840
|
+
|
|
841
|
+
|
|
842
|
+
@experimental
|
|
843
|
+
class AirbyteCloudClient(DagsterModel):
|
|
844
|
+
"""This class exposes methods on top of the Airbyte APIs for Airbyte Cloud."""
|
|
845
|
+
|
|
846
|
+
workspace_id: str = Field(..., description="The Airbyte workspace ID")
|
|
847
|
+
client_id: str = Field(..., description="The Airbyte client ID.")
|
|
848
|
+
client_secret: str = Field(..., description="The Airbyte client secret.")
|
|
849
|
+
request_max_retries: int = Field(
|
|
850
|
+
...,
|
|
851
|
+
description=(
|
|
852
|
+
"The maximum number of times requests to the Airbyte API should be retried "
|
|
853
|
+
"before failing."
|
|
854
|
+
),
|
|
855
|
+
)
|
|
856
|
+
request_retry_delay: float = Field(
|
|
857
|
+
...,
|
|
858
|
+
description="Time (in seconds) to wait between each request retry.",
|
|
859
|
+
)
|
|
860
|
+
request_timeout: int = Field(
|
|
861
|
+
...,
|
|
862
|
+
description="Time (in seconds) after which the requests to Airbyte are declared timed out.",
|
|
863
|
+
)
|
|
864
|
+
|
|
865
|
+
_access_token_value: Optional[str] = PrivateAttr(default=None)
|
|
866
|
+
_access_token_timestamp: Optional[float] = PrivateAttr(default=None)
|
|
867
|
+
|
|
868
|
+
@property
|
|
869
|
+
@cached_method
|
|
870
|
+
def _log(self) -> logging.Logger:
|
|
871
|
+
return get_dagster_logger()
|
|
872
|
+
|
|
873
|
+
@property
|
|
874
|
+
def rest_api_base_url(self) -> str:
|
|
875
|
+
return f"{AIRBYTE_REST_API_BASE}/{AIRBYTE_REST_API_VERSION}"
|
|
876
|
+
|
|
877
|
+
@property
|
|
878
|
+
def configuration_api_base_url(self) -> str:
|
|
879
|
+
return f"{AIRBYTE_CONFIGURATION_API_BASE}/{AIRBYTE_CONFIGURATION_API_VERSION}"
|
|
880
|
+
|
|
881
|
+
@property
|
|
882
|
+
def all_additional_request_params(self) -> Mapping[str, Any]:
|
|
883
|
+
return {**self.authorization_request_params, **self.user_agent_request_params}
|
|
884
|
+
|
|
885
|
+
@property
|
|
886
|
+
def authorization_request_params(self) -> Mapping[str, Any]:
|
|
887
|
+
# Make sure the access token is refreshed before using it when calling the API.
|
|
888
|
+
if self._needs_refreshed_access_token():
|
|
889
|
+
self._refresh_access_token()
|
|
890
|
+
return {
|
|
891
|
+
"Authorization": f"Bearer {self._access_token_value}",
|
|
892
|
+
}
|
|
893
|
+
|
|
894
|
+
@property
|
|
895
|
+
def user_agent_request_params(self) -> Mapping[str, Any]:
|
|
896
|
+
return {
|
|
897
|
+
"User-Agent": "dagster",
|
|
898
|
+
}
|
|
899
|
+
|
|
900
|
+
def _refresh_access_token(self) -> None:
|
|
901
|
+
response = check.not_none(
|
|
902
|
+
self._make_request(
|
|
903
|
+
method="POST",
|
|
904
|
+
endpoint="applications/token",
|
|
905
|
+
base_url=self.rest_api_base_url,
|
|
906
|
+
data={
|
|
907
|
+
"client_id": self.client_id,
|
|
908
|
+
"client_secret": self.client_secret,
|
|
909
|
+
},
|
|
910
|
+
# Must not pass the bearer access token when refreshing it.
|
|
911
|
+
include_additional_request_params=False,
|
|
912
|
+
)
|
|
913
|
+
)
|
|
914
|
+
self._access_token_value = str(response["access_token"])
|
|
915
|
+
self._access_token_timestamp = datetime.now().timestamp()
|
|
916
|
+
|
|
917
|
+
def _needs_refreshed_access_token(self) -> bool:
|
|
918
|
+
return (
|
|
919
|
+
not self._access_token_value
|
|
920
|
+
or not self._access_token_timestamp
|
|
921
|
+
or self._access_token_timestamp
|
|
922
|
+
<= (
|
|
923
|
+
datetime.now() - timedelta(seconds=AIRBYTE_CLOUD_REFRESH_TIMEDELTA_SECONDS)
|
|
924
|
+
).timestamp()
|
|
925
|
+
)
|
|
926
|
+
|
|
927
|
+
def _get_session(self, include_additional_request_params: bool) -> requests.Session:
|
|
928
|
+
headers = {"accept": "application/json"}
|
|
929
|
+
if include_additional_request_params:
|
|
930
|
+
headers = {
|
|
931
|
+
**headers,
|
|
932
|
+
**self.all_additional_request_params,
|
|
933
|
+
}
|
|
934
|
+
session = requests.Session()
|
|
935
|
+
session.headers.update(headers)
|
|
936
|
+
return session
|
|
937
|
+
|
|
938
|
+
def _make_request(
|
|
939
|
+
self,
|
|
940
|
+
method: str,
|
|
941
|
+
endpoint: str,
|
|
942
|
+
base_url: str,
|
|
943
|
+
data: Optional[Mapping[str, Any]] = None,
|
|
944
|
+
params: Optional[Mapping[str, Any]] = None,
|
|
945
|
+
include_additional_request_params: bool = True,
|
|
946
|
+
) -> Mapping[str, Any]:
|
|
947
|
+
"""Creates and sends a request to the desired Airbyte REST API endpoint.
|
|
948
|
+
|
|
949
|
+
Args:
|
|
950
|
+
method (str): The http method to use for this request (e.g. "POST", "GET", "PATCH").
|
|
951
|
+
endpoint (str): The Airbyte API endpoint to send this request to.
|
|
952
|
+
base_url (str): The base url to the Airbyte API to use.
|
|
953
|
+
data (Optional[Dict[str, Any]]): JSON-formatted data string to be included in the request.
|
|
954
|
+
params (Optional[Dict[str, Any]]): JSON-formatted query params to be included in the request.
|
|
955
|
+
include_additional_request_params (bool): Whether to include authorization and user-agent headers
|
|
956
|
+
to the request parameters. Defaults to True.
|
|
957
|
+
|
|
958
|
+
Returns:
|
|
959
|
+
Dict[str, Any]: Parsed json data from the response to this request
|
|
960
|
+
"""
|
|
961
|
+
url = f"{base_url}/{endpoint}"
|
|
962
|
+
|
|
963
|
+
num_retries = 0
|
|
964
|
+
while True:
|
|
965
|
+
try:
|
|
966
|
+
session = self._get_session(
|
|
967
|
+
include_additional_request_params=include_additional_request_params
|
|
968
|
+
)
|
|
969
|
+
response = session.request(
|
|
970
|
+
method=method, url=url, json=data, params=params, timeout=self.request_timeout
|
|
971
|
+
)
|
|
972
|
+
response.raise_for_status()
|
|
973
|
+
return response.json()
|
|
974
|
+
except RequestException as e:
|
|
975
|
+
self._log.error(
|
|
976
|
+
f"Request to Airbyte API failed for url {url} with method {method} : {e}"
|
|
977
|
+
)
|
|
978
|
+
if num_retries == self.request_max_retries:
|
|
979
|
+
break
|
|
980
|
+
num_retries += 1
|
|
981
|
+
time.sleep(self.request_retry_delay)
|
|
982
|
+
|
|
983
|
+
raise Failure(f"Max retries ({self.request_max_retries}) exceeded with url: {url}.")
|
|
984
|
+
|
|
985
|
+
def get_connections(self) -> Mapping[str, Any]:
|
|
986
|
+
"""Fetches all connections of an Airbyte workspace from the Airbyte REST API."""
|
|
987
|
+
return self._make_request(
|
|
988
|
+
method="GET",
|
|
989
|
+
endpoint="connections",
|
|
990
|
+
base_url=self.rest_api_base_url,
|
|
991
|
+
params={"workspaceIds": self.workspace_id},
|
|
992
|
+
)
|
|
993
|
+
|
|
994
|
+
def get_connection_details(self, connection_id) -> Mapping[str, Any]:
|
|
995
|
+
"""Fetches details about a given connection from the Airbyte Configuration API.
|
|
996
|
+
The Airbyte Configuration API is an internal API and may change in the future.
|
|
997
|
+
"""
|
|
998
|
+
# Using the Airbyte Configuration API to get the connection details, including streams and their configs.
|
|
999
|
+
# https://airbyte-public-api-docs.s3.us-east-2.amazonaws.com/rapidoc-api-docs.html#post-/v1/connections/get
|
|
1000
|
+
# https://github.com/airbytehq/airbyte-platform/blob/v1.0.0/airbyte-api/server-api/src/main/openapi/config.yaml
|
|
1001
|
+
return self._make_request(
|
|
1002
|
+
method="POST",
|
|
1003
|
+
endpoint="connections/get",
|
|
1004
|
+
base_url=self.configuration_api_base_url,
|
|
1005
|
+
data={"connectionId": connection_id},
|
|
1006
|
+
)
|
|
1007
|
+
|
|
1008
|
+
def get_destination_details(self, destination_id: str) -> Mapping[str, Any]:
|
|
1009
|
+
"""Fetches details about a given destination from the Airbyte REST API."""
|
|
1010
|
+
return self._make_request(
|
|
1011
|
+
method="GET",
|
|
1012
|
+
endpoint=f"destinations/{destination_id}",
|
|
1013
|
+
base_url=self.rest_api_base_url,
|
|
1014
|
+
)
|
|
1015
|
+
|
|
1016
|
+
def start_sync_job(self, connection_id: str) -> Mapping[str, Any]:
|
|
1017
|
+
return self._make_request(
|
|
1018
|
+
method="POST",
|
|
1019
|
+
endpoint="jobs",
|
|
1020
|
+
base_url=self.rest_api_base_url,
|
|
1021
|
+
data={
|
|
1022
|
+
"connectionId": connection_id,
|
|
1023
|
+
"jobType": "sync",
|
|
1024
|
+
},
|
|
1025
|
+
)
|
|
1026
|
+
|
|
1027
|
+
def get_job_details(self, job_id: int) -> Mapping[str, Any]:
|
|
1028
|
+
return self._make_request(
|
|
1029
|
+
method="GET", endpoint=f"jobs/{job_id}", base_url=self.rest_api_base_url
|
|
1030
|
+
)
|
|
1031
|
+
|
|
1032
|
+
def cancel_job(self, job_id: int) -> Mapping[str, Any]:
|
|
1033
|
+
return self._make_request(
|
|
1034
|
+
method="DELETE", endpoint=f"jobs/{job_id}", base_url=self.rest_api_base_url
|
|
1035
|
+
)
|
|
1036
|
+
|
|
1037
|
+
def sync_and_poll(
|
|
1038
|
+
self,
|
|
1039
|
+
connection_id: str,
|
|
1040
|
+
poll_interval: Optional[float] = None,
|
|
1041
|
+
poll_timeout: Optional[float] = None,
|
|
1042
|
+
cancel_on_termination: bool = True,
|
|
1043
|
+
) -> AirbyteOutput:
|
|
1044
|
+
"""Initializes a sync operation for the given connection, and polls until it completes.
|
|
1045
|
+
|
|
1046
|
+
Args:
|
|
1047
|
+
connection_id (str): The Airbyte Connection ID. You can retrieve this value from the
|
|
1048
|
+
"Connection" tab of a given connection in the Airbyte UI.
|
|
1049
|
+
poll_interval (float): The time (in seconds) that will be waited between successive polls.
|
|
1050
|
+
poll_timeout (float): The maximum time to wait before this operation is timed
|
|
1051
|
+
out. By default, this will never time out.
|
|
1052
|
+
cancel_on_termination (bool): Whether to cancel a sync in Airbyte if the Dagster runner is terminated.
|
|
1053
|
+
This may be useful to disable if using Airbyte sources that cannot be cancelled and
|
|
1054
|
+
resumed easily, or if your Dagster deployment may experience runner interruptions
|
|
1055
|
+
that do not impact your Airbyte deployment.
|
|
1056
|
+
|
|
1057
|
+
Returns:
|
|
1058
|
+
:py:class:`~AirbyteOutput`:
|
|
1059
|
+
Details of the sync job.
|
|
1060
|
+
"""
|
|
1061
|
+
connection_details = self.get_connection_details(connection_id)
|
|
1062
|
+
start_job_details = self.start_sync_job(connection_id)
|
|
1063
|
+
job = AirbyteJob.from_job_details(job_details=start_job_details)
|
|
1064
|
+
|
|
1065
|
+
self._log.info(f"Job {job.id} initialized for connection_id={connection_id}.")
|
|
1066
|
+
poll_start = datetime.now()
|
|
1067
|
+
poll_interval = (
|
|
1068
|
+
poll_interval if poll_interval is not None else DEFAULT_POLL_INTERVAL_SECONDS
|
|
1069
|
+
)
|
|
1070
|
+
try:
|
|
1071
|
+
while True:
|
|
1072
|
+
if poll_timeout and datetime.now() > poll_start + timedelta(seconds=poll_timeout):
|
|
1073
|
+
raise Failure(
|
|
1074
|
+
f"Timeout: Airbyte job {job.id} is not ready after the timeout"
|
|
1075
|
+
f" {poll_timeout} seconds"
|
|
1076
|
+
)
|
|
1077
|
+
|
|
1078
|
+
time.sleep(poll_interval)
|
|
1079
|
+
# We return these job details in the AirbyteOutput when the job succeeds
|
|
1080
|
+
poll_job_details = self.get_job_details(job.id)
|
|
1081
|
+
job = AirbyteJob.from_job_details(job_details=poll_job_details)
|
|
1082
|
+
if job.status in (
|
|
1083
|
+
AirbyteJobStatusType.RUNNING,
|
|
1084
|
+
AirbyteJobStatusType.PENDING,
|
|
1085
|
+
AirbyteJobStatusType.INCOMPLETE,
|
|
1086
|
+
):
|
|
1087
|
+
continue
|
|
1088
|
+
elif job.status == AirbyteJobStatusType.SUCCEEDED:
|
|
1089
|
+
break
|
|
1090
|
+
elif job.status in [AirbyteJobStatusType.ERROR, AirbyteJobStatusType.FAILED]:
|
|
1091
|
+
raise Failure(f"Job failed: {job.id}")
|
|
1092
|
+
elif job.status == AirbyteJobStatusType.CANCELLED:
|
|
1093
|
+
raise Failure(f"Job was cancelled: {job.id}")
|
|
1094
|
+
else:
|
|
1095
|
+
raise Failure(
|
|
1096
|
+
f"Encountered unexpected state `{job.status}` for job_id {job.id}"
|
|
1097
|
+
)
|
|
1098
|
+
finally:
|
|
1099
|
+
# if Airbyte sync has not completed, make sure to cancel it so that it doesn't outlive
|
|
1100
|
+
# the python process
|
|
1101
|
+
if cancel_on_termination and job.status not in (
|
|
1102
|
+
AirbyteJobStatusType.SUCCEEDED,
|
|
1103
|
+
AirbyteJobStatusType.ERROR,
|
|
1104
|
+
AirbyteJobStatusType.CANCELLED,
|
|
1105
|
+
AirbyteJobStatusType.FAILED,
|
|
1106
|
+
):
|
|
1107
|
+
self.cancel_job(job.id)
|
|
1108
|
+
|
|
1109
|
+
return AirbyteOutput(job_details=poll_job_details, connection_details=connection_details)
|
|
1110
|
+
|
|
1111
|
+
|
|
1112
|
+
@experimental
|
|
1113
|
+
class AirbyteCloudWorkspace(ConfigurableResource):
|
|
1114
|
+
"""This class represents an Airbyte Cloud workspace and provides utilities
|
|
1115
|
+
to interact with Airbyte APIs.
|
|
1116
|
+
"""
|
|
1117
|
+
|
|
1118
|
+
workspace_id: str = Field(..., description="The Airbyte Cloud workspace ID")
|
|
1119
|
+
client_id: str = Field(..., description="The Airbyte Cloud client ID.")
|
|
1120
|
+
client_secret: str = Field(..., description="The Airbyte Cloud client secret.")
|
|
1121
|
+
request_max_retries: int = Field(
|
|
1122
|
+
default=3,
|
|
1123
|
+
description=(
|
|
1124
|
+
"The maximum number of times requests to the Airbyte API should be retried "
|
|
1125
|
+
"before failing."
|
|
1126
|
+
),
|
|
1127
|
+
)
|
|
1128
|
+
request_retry_delay: float = Field(
|
|
1129
|
+
default=0.25,
|
|
1130
|
+
description="Time (in seconds) to wait between each request retry.",
|
|
1131
|
+
)
|
|
1132
|
+
request_timeout: int = Field(
|
|
1133
|
+
default=15,
|
|
1134
|
+
description="Time (in seconds) after which the requests to Airbyte are declared timed out.",
|
|
1135
|
+
)
|
|
1136
|
+
|
|
1137
|
+
_client: AirbyteCloudClient = PrivateAttr(default=None) # type: ignore
|
|
1138
|
+
|
|
1139
|
+
@cached_method
|
|
1140
|
+
def get_client(self) -> AirbyteCloudClient:
|
|
1141
|
+
return AirbyteCloudClient(
|
|
1142
|
+
workspace_id=self.workspace_id,
|
|
1143
|
+
client_id=self.client_id,
|
|
1144
|
+
client_secret=self.client_secret,
|
|
1145
|
+
request_max_retries=self.request_max_retries,
|
|
1146
|
+
request_retry_delay=self.request_retry_delay,
|
|
1147
|
+
request_timeout=self.request_timeout,
|
|
1148
|
+
)
|
|
1149
|
+
|
|
1150
|
+
def fetch_airbyte_workspace_data(
|
|
1151
|
+
self,
|
|
1152
|
+
) -> AirbyteWorkspaceData:
|
|
1153
|
+
"""Retrieves all Airbyte content from the workspace and returns it as a AirbyteWorkspaceData object.
|
|
1154
|
+
|
|
1155
|
+
Returns:
|
|
1156
|
+
AirbyteWorkspaceData: A snapshot of the Airbyte workspace's content.
|
|
1157
|
+
"""
|
|
1158
|
+
connections_by_id = {}
|
|
1159
|
+
destinations_by_id = {}
|
|
1160
|
+
|
|
1161
|
+
client = self.get_client()
|
|
1162
|
+
connections = client.get_connections()["data"]
|
|
1163
|
+
|
|
1164
|
+
for partial_connection_details in connections:
|
|
1165
|
+
full_connection_details = client.get_connection_details(
|
|
1166
|
+
connection_id=partial_connection_details["connectionId"]
|
|
1167
|
+
)
|
|
1168
|
+
connection = AirbyteConnection.from_connection_details(
|
|
1169
|
+
connection_details=full_connection_details
|
|
1170
|
+
)
|
|
1171
|
+
connections_by_id[connection.id] = connection
|
|
1172
|
+
|
|
1173
|
+
destination_details = client.get_destination_details(
|
|
1174
|
+
destination_id=connection.destination_id
|
|
1175
|
+
)
|
|
1176
|
+
destination = AirbyteDestination.from_destination_details(
|
|
1177
|
+
destination_details=destination_details
|
|
1178
|
+
)
|
|
1179
|
+
destinations_by_id[destination.id] = destination
|
|
1180
|
+
|
|
1181
|
+
return AirbyteWorkspaceData(
|
|
1182
|
+
connections_by_id=connections_by_id,
|
|
1183
|
+
destinations_by_id=destinations_by_id,
|
|
1184
|
+
)
|
|
1185
|
+
|
|
1186
|
+
@cached_method
|
|
1187
|
+
def load_asset_specs(
|
|
1188
|
+
self,
|
|
1189
|
+
dagster_airbyte_translator: Optional[DagsterAirbyteTranslator] = None,
|
|
1190
|
+
) -> Sequence[AssetSpec]:
|
|
1191
|
+
"""Returns a list of AssetSpecs representing the Airbyte content in the workspace.
|
|
1192
|
+
|
|
1193
|
+
Args:
|
|
1194
|
+
dagster_airbyte_translator (Optional[DagsterAirbyteTranslator], optional): The translator to use
|
|
1195
|
+
to convert Airbyte content into :py:class:`dagster.AssetSpec`.
|
|
1196
|
+
Defaults to :py:class:`DagsterAirbyteTranslator`.
|
|
1197
|
+
|
|
1198
|
+
Returns:
|
|
1199
|
+
List[AssetSpec]: The set of assets representing the Airbyte content in the workspace.
|
|
1200
|
+
|
|
1201
|
+
Examples:
|
|
1202
|
+
Loading the asset specs for a given Airbyte workspace:
|
|
1203
|
+
.. code-block:: python
|
|
1204
|
+
|
|
1205
|
+
from dagster_airbyte import AirbyteCloudWorkspace
|
|
1206
|
+
|
|
1207
|
+
import dagster as dg
|
|
1208
|
+
|
|
1209
|
+
airbyte_workspace = AirbyteCloudWorkspace(
|
|
1210
|
+
workspace_id=dg.EnvVar("AIRBYTE_CLOUD_WORKSPACE_ID"),
|
|
1211
|
+
client_id=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_ID"),
|
|
1212
|
+
client_secret=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_SECRET"),
|
|
1213
|
+
)
|
|
1214
|
+
|
|
1215
|
+
airbyte_specs = airbyte_workspace.load_asset_specs()
|
|
1216
|
+
defs = dg.Definitions(assets=airbyte_specs, resources={"airbyte": airbyte_workspace}
|
|
1217
|
+
"""
|
|
1218
|
+
dagster_airbyte_translator = dagster_airbyte_translator or DagsterAirbyteTranslator()
|
|
1219
|
+
|
|
1220
|
+
return load_airbyte_cloud_asset_specs(
|
|
1221
|
+
workspace=self, dagster_airbyte_translator=dagster_airbyte_translator
|
|
1222
|
+
)
|
|
1223
|
+
|
|
1224
|
+
def _generate_materialization(
|
|
1225
|
+
self,
|
|
1226
|
+
airbyte_output: AirbyteOutput,
|
|
1227
|
+
dagster_airbyte_translator: DagsterAirbyteTranslator,
|
|
1228
|
+
):
|
|
1229
|
+
connection = AirbyteConnection.from_connection_details(
|
|
1230
|
+
connection_details=airbyte_output.connection_details
|
|
1231
|
+
)
|
|
1232
|
+
|
|
1233
|
+
for stream in connection.streams.values():
|
|
1234
|
+
if stream.selected:
|
|
1235
|
+
connection_table_name = get_airbyte_connection_table_name(
|
|
1236
|
+
stream_prefix=connection.stream_prefix,
|
|
1237
|
+
stream_name=stream.name,
|
|
1238
|
+
)
|
|
1239
|
+
stream_asset_spec = dagster_airbyte_translator.get_asset_spec(
|
|
1240
|
+
props=AirbyteConnectionTableProps(
|
|
1241
|
+
table_name=connection_table_name,
|
|
1242
|
+
stream_prefix=connection.stream_prefix,
|
|
1243
|
+
stream_name=stream.name,
|
|
1244
|
+
json_schema=stream.json_schema,
|
|
1245
|
+
connection_id=connection.id,
|
|
1246
|
+
connection_name=connection.name,
|
|
1247
|
+
destination_type=None,
|
|
1248
|
+
database=None,
|
|
1249
|
+
schema=None,
|
|
1250
|
+
)
|
|
1251
|
+
)
|
|
1252
|
+
|
|
1253
|
+
yield AssetMaterialization(
|
|
1254
|
+
asset_key=stream_asset_spec.key,
|
|
1255
|
+
description=(
|
|
1256
|
+
f"Table generated via Airbyte Cloud sync "
|
|
1257
|
+
f"for connection {connection.name}: {connection_table_name}"
|
|
1258
|
+
),
|
|
1259
|
+
metadata=stream_asset_spec.metadata,
|
|
1260
|
+
)
|
|
1261
|
+
|
|
1262
|
+
@public
|
|
1263
|
+
@experimental
|
|
1264
|
+
def sync_and_poll(self, context: AssetExecutionContext):
|
|
1265
|
+
"""Executes a sync and poll process to materialize Airbyte Cloud assets.
|
|
1266
|
+
This method can only be used in the context of an asset execution.
|
|
1267
|
+
|
|
1268
|
+
Args:
|
|
1269
|
+
context (AssetExecutionContext): The execution context
|
|
1270
|
+
from within `@airbyte_assets`.
|
|
1271
|
+
|
|
1272
|
+
Returns:
|
|
1273
|
+
Iterator[Union[AssetMaterialization, MaterializeResult]]: An iterator of MaterializeResult
|
|
1274
|
+
or AssetMaterialization.
|
|
1275
|
+
"""
|
|
1276
|
+
assets_def = context.assets_def
|
|
1277
|
+
dagster_airbyte_translator = get_translator_from_airbyte_assets(assets_def)
|
|
1278
|
+
connection_id = next(
|
|
1279
|
+
check.not_none(AirbyteMetadataSet.extract(spec.metadata).connection_id)
|
|
1280
|
+
for spec in assets_def.specs
|
|
1281
|
+
)
|
|
1282
|
+
|
|
1283
|
+
client = self.get_client()
|
|
1284
|
+
airbyte_output = client.sync_and_poll(
|
|
1285
|
+
connection_id=connection_id,
|
|
1286
|
+
)
|
|
1287
|
+
|
|
1288
|
+
materialized_asset_keys = set()
|
|
1289
|
+
for materialization in self._generate_materialization(
|
|
1290
|
+
airbyte_output=airbyte_output, dagster_airbyte_translator=dagster_airbyte_translator
|
|
1291
|
+
):
|
|
1292
|
+
# Scan through all tables actually created, if it was expected then emit a MaterializeResult.
|
|
1293
|
+
# Otherwise, emit a runtime AssetMaterialization.
|
|
1294
|
+
if materialization.asset_key in context.selected_asset_keys:
|
|
1295
|
+
yield MaterializeResult(
|
|
1296
|
+
asset_key=materialization.asset_key, metadata=materialization.metadata
|
|
1297
|
+
)
|
|
1298
|
+
materialized_asset_keys.add(materialization.asset_key)
|
|
1299
|
+
else:
|
|
1300
|
+
context.log.warning(
|
|
1301
|
+
f"An unexpected asset was materialized: {materialization.asset_key}. "
|
|
1302
|
+
f"Yielding a materialization event."
|
|
1303
|
+
)
|
|
1304
|
+
yield materialization
|
|
1305
|
+
|
|
1306
|
+
unmaterialized_asset_keys = context.selected_asset_keys - materialized_asset_keys
|
|
1307
|
+
if unmaterialized_asset_keys:
|
|
1308
|
+
context.log.warning(f"Assets were not materialized: {unmaterialized_asset_keys}")
|
|
1309
|
+
|
|
1310
|
+
|
|
1311
|
+
@experimental
def load_airbyte_cloud_asset_specs(
    workspace: AirbyteCloudWorkspace,
    dagster_airbyte_translator: Optional[DagsterAirbyteTranslator] = None,
) -> Sequence[AssetSpec]:
    """Build the AssetSpecs that represent the content of an Airbyte Cloud workspace.

    Each returned spec has the translator attached to its metadata under
    ``DAGSTER_AIRBYTE_TRANSLATOR_METADATA_KEY``.

    Args:
        workspace (AirbyteCloudWorkspace): The Airbyte Cloud workspace to fetch assets from.
        dagster_airbyte_translator (Optional[DagsterAirbyteTranslator], optional): The translator to use
            to convert Airbyte content into :py:class:`dagster.AssetSpec`.
            Defaults to :py:class:`DagsterAirbyteTranslator`.

    Returns:
        List[AssetSpec]: The set of assets representing the Airbyte content in the workspace.

    Examples:
        Loading the asset specs for a given Airbyte Cloud workspace:

        .. code-block:: python

            from dagster_airbyte import AirbyteCloudWorkspace, load_airbyte_cloud_asset_specs

            import dagster as dg

            airbyte_cloud_workspace = AirbyteCloudWorkspace(
                workspace_id=dg.EnvVar("AIRBYTE_CLOUD_WORKSPACE_ID"),
                client_id=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_ID"),
                client_secret=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_SECRET"),
            )


            airbyte_cloud_specs = load_airbyte_cloud_asset_specs(airbyte_cloud_workspace)
            defs = dg.Definitions(assets=airbyte_cloud_specs)
    """
    # Fall back to the default translator when the caller did not supply one.
    translator = dagster_airbyte_translator or DagsterAirbyteTranslator()

    with workspace.process_config_and_initialize_cm() as initialized_workspace:
        loader = AirbyteCloudWorkspaceDefsLoader(
            workspace=initialized_workspace,
            translator=translator,
        )
        # Validate that the loader produced a list of AssetSpec before decorating it.
        loaded_specs = check.is_list(loader.build_defs().assets, AssetSpec)

        specs_with_translator = []
        for loaded_spec in loaded_specs:
            specs_with_translator.append(
                loaded_spec.merge_attributes(
                    metadata={DAGSTER_AIRBYTE_TRANSLATOR_METADATA_KEY: translator}
                )
            )
        return specs_with_translator
|
|
1363
|
+
|
|
1364
|
+
|
|
1365
|
+
@record
class AirbyteCloudWorkspaceDefsLoader(StateBackedDefinitionsLoader[AirbyteWorkspaceData]):
    """Definitions loader whose state is a snapshot of an Airbyte Cloud workspace.

    The state type parameter is ``AirbyteWorkspaceData`` so that it agrees with
    what ``fetch_state`` returns and what ``defs_from_state`` accepts.
    """

    # Workspace resource the state is fetched from.
    workspace: AirbyteCloudWorkspace
    # Translator that converts each connection table's props into an AssetSpec.
    translator: DagsterAirbyteTranslator

    @property
    def defs_key(self) -> str:
        """Return the key for this loader, formed from the Airbyte prefix and the workspace ID."""
        return f"{AIRBYTE_RECONSTRUCTION_METADATA_KEY_PREFIX}/{self.workspace.workspace_id}"

    def fetch_state(self) -> AirbyteWorkspaceData:
        """Fetch the workspace data by delegating to the workspace resource."""
        return self.workspace.fetch_airbyte_workspace_data()

    def defs_from_state(self, state: AirbyteWorkspaceData) -> Definitions:
        """Build Definitions holding one AssetSpec per connection table in ``state``.

        Args:
            state (AirbyteWorkspaceData): The workspace data to translate.

        Returns:
            Definitions: Definitions whose assets are the translated asset specs.
        """
        all_asset_specs = [
            self.translator.get_asset_spec(props)
            for props in state.to_airbyte_connection_table_props_data()
        ]

        return Definitions(assets=all_asset_specs)
|