dagster-airbyte 0.24.3__py3-none-any.whl → 0.28.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dagster_airbyte/__init__.py +26 -9
- dagster_airbyte/asset_decorator.py +123 -0
- dagster_airbyte/asset_defs.py +334 -202
- dagster_airbyte/components/__init__.py +0 -0
- dagster_airbyte/components/workspace_component/__init__.py +0 -0
- dagster_airbyte/components/workspace_component/component.py +433 -0
- dagster_airbyte/components/workspace_component/scaffolder.py +30 -0
- dagster_airbyte/legacy_resources.py +826 -0
- dagster_airbyte/managed/__init__.py +2 -2
- dagster_airbyte/managed/generated/__init__.py +1 -1
- dagster_airbyte/managed/generated/sources.py +35 -35
- dagster_airbyte/managed/reconciliation.py +34 -44
- dagster_airbyte/managed/types.py +8 -7
- dagster_airbyte/ops.py +5 -4
- dagster_airbyte/resources.py +855 -601
- dagster_airbyte/translator.py +255 -0
- dagster_airbyte/types.py +8 -3
- dagster_airbyte/utils.py +36 -2
- dagster_airbyte/version.py +1 -1
- {dagster_airbyte-0.24.3.dist-info → dagster_airbyte-0.28.3.dist-info}/METADATA +19 -10
- dagster_airbyte-0.28.3.dist-info/RECORD +28 -0
- {dagster_airbyte-0.24.3.dist-info → dagster_airbyte-0.28.3.dist-info}/WHEEL +1 -1
- {dagster_airbyte-0.24.3.dist-info → dagster_airbyte-0.28.3.dist-info}/entry_points.txt +3 -0
- {dagster_airbyte-0.24.3.dist-info → dagster_airbyte-0.28.3.dist-info/licenses}/LICENSE +1 -1
- dagster_airbyte-0.24.3.dist-info/RECORD +0 -21
- {dagster_airbyte-0.24.3.dist-info → dagster_airbyte-0.28.3.dist-info}/top_level.txt +0 -0
dagster_airbyte/resources.py
CHANGED
|
@@ -1,88 +1,173 @@
|
|
|
1
|
-
import hashlib
|
|
2
|
-
import json
|
|
3
1
|
import logging
|
|
4
|
-
import sys
|
|
5
2
|
import time
|
|
6
|
-
from abc import
|
|
3
|
+
from collections.abc import Callable, Iterator, Mapping, Sequence
|
|
7
4
|
from contextlib import contextmanager
|
|
8
5
|
from datetime import datetime, timedelta
|
|
9
|
-
from typing import Any,
|
|
6
|
+
from typing import Any, ClassVar, Optional, Union
|
|
7
|
+
from urllib.parse import parse_qsl, urlparse
|
|
10
8
|
|
|
11
9
|
import requests
|
|
12
10
|
from dagster import (
|
|
11
|
+
AssetExecutionContext,
|
|
12
|
+
AssetMaterialization,
|
|
13
|
+
AssetSpec,
|
|
13
14
|
ConfigurableResource,
|
|
15
|
+
Definitions,
|
|
14
16
|
Failure,
|
|
15
|
-
|
|
17
|
+
MaterializeResult,
|
|
16
18
|
_check as check,
|
|
17
19
|
get_dagster_logger,
|
|
18
|
-
resource,
|
|
19
20
|
)
|
|
20
|
-
from dagster.
|
|
21
|
-
from dagster._core.definitions.
|
|
22
|
-
from dagster.
|
|
23
|
-
from
|
|
24
|
-
from
|
|
21
|
+
from dagster._annotations import superseded
|
|
22
|
+
from dagster._core.definitions.definitions_load_context import StateBackedDefinitionsLoader
|
|
23
|
+
from dagster._symbol_annotations import beta, public
|
|
24
|
+
from dagster_shared.dagster_model import DagsterModel
|
|
25
|
+
from dagster_shared.record import record
|
|
26
|
+
from dagster_shared.utils.cached_method import cached_method
|
|
27
|
+
from pydantic import Field, PrivateAttr, model_validator
|
|
25
28
|
from requests.exceptions import RequestException
|
|
26
29
|
|
|
30
|
+
from dagster_airbyte.translator import (
|
|
31
|
+
AirbyteConnection,
|
|
32
|
+
AirbyteConnectionTableProps,
|
|
33
|
+
AirbyteDestination,
|
|
34
|
+
AirbyteJob,
|
|
35
|
+
AirbyteJobStatusType,
|
|
36
|
+
AirbyteMetadataSet,
|
|
37
|
+
AirbyteWorkspaceData,
|
|
38
|
+
DagsterAirbyteTranslator,
|
|
39
|
+
)
|
|
27
40
|
from dagster_airbyte.types import AirbyteOutput
|
|
41
|
+
from dagster_airbyte.utils import (
|
|
42
|
+
DAGSTER_AIRBYTE_TRANSLATOR_METADATA_KEY,
|
|
43
|
+
get_airbyte_connection_table_name,
|
|
44
|
+
get_translator_from_airbyte_assets,
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
AIRBYTE_CLOUD_REST_API_BASE = "https://api.airbyte.com"
|
|
48
|
+
AIRBYTE_CLOUD_REST_API_VERSION = "v1"
|
|
49
|
+
AIRBYTE_CLOUD_REST_API_BASE_URL = f"{AIRBYTE_CLOUD_REST_API_BASE}/{AIRBYTE_CLOUD_REST_API_VERSION}"
|
|
50
|
+
AIRBYTE_CLOUD_CONFIGURATION_API_BASE = "https://cloud.airbyte.com/api"
|
|
51
|
+
AIRBYTE_CLOUD_CONFIGURATION_API_VERSION = "v1"
|
|
52
|
+
AIRBYTE_CLOUD_CONFIGURATION_API_BASE_URL = (
|
|
53
|
+
f"{AIRBYTE_CLOUD_CONFIGURATION_API_BASE}/{AIRBYTE_CLOUD_CONFIGURATION_API_VERSION}"
|
|
54
|
+
)
|
|
28
55
|
|
|
29
56
|
DEFAULT_POLL_INTERVAL_SECONDS = 10
|
|
30
57
|
|
|
31
58
|
# The access token expire every 3 minutes in Airbyte Cloud.
|
|
32
59
|
# Refresh is needed after 2.5 minutes to avoid the "token expired" error message.
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
class AirbyteState:
|
|
37
|
-
RUNNING = "running"
|
|
38
|
-
SUCCEEDED = "succeeded"
|
|
39
|
-
CANCELLED = "cancelled"
|
|
40
|
-
PENDING = "pending"
|
|
41
|
-
FAILED = "failed"
|
|
42
|
-
ERROR = "error"
|
|
43
|
-
INCOMPLETE = "incomplete"
|
|
60
|
+
AIRBYTE_REFRESH_TIMEDELTA_SECONDS = 150
|
|
44
61
|
|
|
62
|
+
AIRBYTE_RECONSTRUCTION_METADATA_KEY_PREFIX = "dagster-airbyte/reconstruction_metadata"
|
|
45
63
|
|
|
46
|
-
class AirbyteResourceState:
|
|
47
|
-
def __init__(self) -> None:
|
|
48
|
-
self.request_cache: Dict[str, Optional[Mapping[str, object]]] = {}
|
|
49
|
-
# Int in case we nest contexts
|
|
50
|
-
self.cache_enabled = 0
|
|
51
64
|
|
|
65
|
+
@beta
|
|
66
|
+
class AirbyteClient(DagsterModel):
|
|
67
|
+
"""This class exposes methods on top of the Airbyte APIs for Airbyte."""
|
|
52
68
|
|
|
53
|
-
|
|
69
|
+
rest_api_base_url: str = Field(
|
|
70
|
+
default=AIRBYTE_CLOUD_REST_API_BASE_URL,
|
|
71
|
+
description=(
|
|
72
|
+
"The base URL for the Airbyte REST API. "
|
|
73
|
+
"For Airbyte Cloud, leave this as the default. "
|
|
74
|
+
"For self-managed Airbyte, this is usually <your Airbyte host>/api/public/v1."
|
|
75
|
+
),
|
|
76
|
+
)
|
|
77
|
+
configuration_api_base_url: str = Field(
|
|
78
|
+
default=AIRBYTE_CLOUD_CONFIGURATION_API_BASE_URL,
|
|
79
|
+
description=(
|
|
80
|
+
"The base URL for the Airbyte Configuration API. "
|
|
81
|
+
"For Airbyte Cloud, leave this as the default. "
|
|
82
|
+
"For self-managed Airbyte, this is usually <your Airbyte host>/api/v1."
|
|
83
|
+
),
|
|
84
|
+
)
|
|
85
|
+
workspace_id: str = Field(..., description="The Airbyte workspace ID")
|
|
86
|
+
client_id: Optional[str] = Field(default=None, description="The Airbyte client ID.")
|
|
87
|
+
client_secret: Optional[str] = Field(default=None, description="The Airbyte client secret.")
|
|
88
|
+
username: Optional[str] = Field(
|
|
89
|
+
default=None,
|
|
90
|
+
description="The Airbyte username for authentication. Used for self-managed Airbyte with basic auth.",
|
|
91
|
+
)
|
|
92
|
+
password: Optional[str] = Field(
|
|
93
|
+
default=None,
|
|
94
|
+
description="The Airbyte password for authentication. Used for self-managed Airbyte with basic auth.",
|
|
95
|
+
)
|
|
54
96
|
request_max_retries: int = Field(
|
|
55
|
-
|
|
97
|
+
...,
|
|
56
98
|
description=(
|
|
57
99
|
"The maximum number of times requests to the Airbyte API should be retried "
|
|
58
100
|
"before failing."
|
|
59
101
|
),
|
|
60
102
|
)
|
|
61
103
|
request_retry_delay: float = Field(
|
|
62
|
-
|
|
104
|
+
...,
|
|
63
105
|
description="Time (in seconds) to wait between each request retry.",
|
|
64
106
|
)
|
|
65
107
|
request_timeout: int = Field(
|
|
66
|
-
|
|
108
|
+
...,
|
|
67
109
|
description="Time (in seconds) after which the requests to Airbyte are declared timed out.",
|
|
68
110
|
)
|
|
69
|
-
|
|
70
|
-
default=
|
|
111
|
+
max_items_per_page: int = Field(
|
|
112
|
+
default=100,
|
|
71
113
|
description=(
|
|
72
|
-
"
|
|
73
|
-
"
|
|
74
|
-
" resumed easily, or if your Dagster deployment may experience runner interruptions"
|
|
75
|
-
" that do not impact your Airbyte deployment."
|
|
114
|
+
"The maximum number of items per page. "
|
|
115
|
+
"Used for paginated resources like connections, destinations, etc. "
|
|
76
116
|
),
|
|
77
117
|
)
|
|
78
118
|
poll_interval: float = Field(
|
|
79
119
|
default=DEFAULT_POLL_INTERVAL_SECONDS,
|
|
80
|
-
description="
|
|
120
|
+
description="The time (in seconds) that will be waited between successive polls.",
|
|
121
|
+
)
|
|
122
|
+
poll_timeout: Optional[float] = Field(
|
|
123
|
+
default=None,
|
|
124
|
+
description=(
|
|
125
|
+
"The maximum time that will wait before this operation is timed "
|
|
126
|
+
"out. By default, this will never time out."
|
|
127
|
+
),
|
|
128
|
+
)
|
|
129
|
+
cancel_on_termination: bool = Field(
|
|
130
|
+
default=True,
|
|
131
|
+
description=(
|
|
132
|
+
"Whether to cancel a sync in Airbyte if the Dagster runner is terminated. "
|
|
133
|
+
"This may be useful to disable if using Airbyte sources that cannot be cancelled and "
|
|
134
|
+
"resumed easily, or if your Dagster deployment may experience runner interruptions "
|
|
135
|
+
"that do not impact your Airbyte deployment."
|
|
136
|
+
),
|
|
137
|
+
)
|
|
138
|
+
poll_previous_running_sync: bool = Field(
|
|
139
|
+
default=False,
|
|
140
|
+
description=(
|
|
141
|
+
"If set to True, Dagster will check for previous running sync for the same connection "
|
|
142
|
+
"and begin polling it instead of starting a new sync."
|
|
143
|
+
),
|
|
81
144
|
)
|
|
82
145
|
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
146
|
+
_access_token_value: Optional[str] = PrivateAttr(default=None)
|
|
147
|
+
_access_token_timestamp: Optional[float] = PrivateAttr(default=None)
|
|
148
|
+
|
|
149
|
+
@model_validator(mode="before")
|
|
150
|
+
def validate_authentication(cls, values):
|
|
151
|
+
has_client_id = values.get("client_id") is not None
|
|
152
|
+
has_client_secret = values.get("client_secret") is not None
|
|
153
|
+
has_username = values.get("username") is not None
|
|
154
|
+
has_password = values.get("password") is not None
|
|
155
|
+
|
|
156
|
+
check.invariant(
|
|
157
|
+
has_username == has_password,
|
|
158
|
+
"Missing config: both username and password are required for Airbyte authentication.",
|
|
159
|
+
)
|
|
160
|
+
|
|
161
|
+
check.invariant(
|
|
162
|
+
has_client_id == has_client_secret,
|
|
163
|
+
"Missing config: both client_id and client_secret are required for Airbyte authentication.",
|
|
164
|
+
)
|
|
165
|
+
|
|
166
|
+
check.invariant(
|
|
167
|
+
not ((has_client_id or has_client_secret) and (has_username or has_password)),
|
|
168
|
+
"Invalid config: cannot provide both client_id/client_secret and username/password for Airbyte authentication.",
|
|
169
|
+
)
|
|
170
|
+
return values
|
|
86
171
|
|
|
87
172
|
@property
|
|
88
173
|
@cached_method
|
|
@@ -90,704 +175,873 @@ class BaseAirbyteResource(ConfigurableResource):
|
|
|
90
175
|
return get_dagster_logger()
|
|
91
176
|
|
|
92
177
|
@property
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
raise NotImplementedError()
|
|
178
|
+
def all_additional_request_headers(self) -> Mapping[str, Any]:
|
|
179
|
+
return {**self.authorization_request_headers, **self.user_agent_request_headers}
|
|
96
180
|
|
|
97
181
|
@property
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
182
|
+
def authorization_request_headers(self) -> Mapping[str, Any]:
|
|
183
|
+
# Make sure the access token is refreshed before using it when calling the API.
|
|
184
|
+
if not (self.client_id and self.client_secret):
|
|
185
|
+
return {}
|
|
101
186
|
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
include_additional_request_params: bool = True,
|
|
108
|
-
) -> Optional[Mapping[str, object]]:
|
|
109
|
-
"""Creates and sends a request to the desired Airbyte REST API endpoint.
|
|
187
|
+
if self._needs_refreshed_access_token():
|
|
188
|
+
self._refresh_access_token()
|
|
189
|
+
return {
|
|
190
|
+
"Authorization": f"Bearer {self._access_token_value}",
|
|
191
|
+
}
|
|
110
192
|
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
193
|
+
@property
|
|
194
|
+
def user_agent_request_headers(self) -> Mapping[str, Any]:
|
|
195
|
+
return {
|
|
196
|
+
"User-Agent": "dagster",
|
|
197
|
+
}
|
|
114
198
|
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
199
|
+
def _refresh_access_token(self) -> None:
|
|
200
|
+
response = check.not_none(
|
|
201
|
+
self._single_request(
|
|
202
|
+
method="POST",
|
|
203
|
+
url=f"{self.rest_api_base_url}/applications/token",
|
|
204
|
+
data={
|
|
205
|
+
"client_id": self.client_id,
|
|
206
|
+
"client_secret": self.client_secret,
|
|
207
|
+
},
|
|
208
|
+
# Must not pass the bearer access token when refreshing it.
|
|
209
|
+
include_additional_request_headers=False,
|
|
210
|
+
)
|
|
211
|
+
)
|
|
212
|
+
self._access_token_value = str(response["access_token"])
|
|
213
|
+
self._access_token_timestamp = datetime.now().timestamp()
|
|
214
|
+
|
|
215
|
+
def _needs_refreshed_access_token(self) -> bool:
|
|
216
|
+
return (
|
|
217
|
+
not self._access_token_value
|
|
218
|
+
or not self._access_token_timestamp
|
|
219
|
+
or self._access_token_timestamp
|
|
220
|
+
<= (datetime.now() - timedelta(seconds=AIRBYTE_REFRESH_TIMEDELTA_SECONDS)).timestamp()
|
|
221
|
+
)
|
|
222
|
+
|
|
223
|
+
def _get_session(self, include_additional_request_headers: bool) -> requests.Session:
|
|
119
224
|
headers = {"accept": "application/json"}
|
|
225
|
+
if include_additional_request_headers:
|
|
226
|
+
headers = {
|
|
227
|
+
**headers,
|
|
228
|
+
**self.all_additional_request_headers,
|
|
229
|
+
}
|
|
230
|
+
session = requests.Session()
|
|
231
|
+
session.headers.update(headers)
|
|
120
232
|
|
|
233
|
+
if self.username and self.password:
|
|
234
|
+
session.auth = (self.username, self.password)
|
|
235
|
+
|
|
236
|
+
return session
|
|
237
|
+
|
|
238
|
+
def _single_request(
|
|
239
|
+
self,
|
|
240
|
+
method: str,
|
|
241
|
+
url: str,
|
|
242
|
+
data: Optional[Mapping[str, Any]] = None,
|
|
243
|
+
params: Optional[Mapping[str, Any]] = None,
|
|
244
|
+
include_additional_request_headers: bool = True,
|
|
245
|
+
) -> Mapping[str, Any]:
|
|
246
|
+
"""Execute a single HTTP request with retry logic."""
|
|
121
247
|
num_retries = 0
|
|
122
248
|
while True:
|
|
123
249
|
try:
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
url=url,
|
|
127
|
-
headers=headers,
|
|
128
|
-
timeout=self.request_timeout,
|
|
250
|
+
session = self._get_session(
|
|
251
|
+
include_additional_request_headers=include_additional_request_headers
|
|
129
252
|
)
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
if include_additional_request_params:
|
|
134
|
-
request_args = deep_merge_dicts(
|
|
135
|
-
request_args,
|
|
136
|
-
self.all_additional_request_params,
|
|
137
|
-
)
|
|
138
|
-
|
|
139
|
-
response = requests.request(
|
|
140
|
-
**request_args,
|
|
253
|
+
response = session.request(
|
|
254
|
+
method=method, url=url, json=data, params=params, timeout=self.request_timeout
|
|
141
255
|
)
|
|
142
256
|
response.raise_for_status()
|
|
143
|
-
if response.status_code == 204:
|
|
144
|
-
return None
|
|
145
257
|
return response.json()
|
|
146
258
|
except RequestException as e:
|
|
147
|
-
self._log.error(
|
|
259
|
+
self._log.error(
|
|
260
|
+
f"Request to Airbyte API failed for url {url} with method {method} : {e}"
|
|
261
|
+
)
|
|
148
262
|
if num_retries == self.request_max_retries:
|
|
149
263
|
break
|
|
150
264
|
num_retries += 1
|
|
151
265
|
time.sleep(self.request_retry_delay)
|
|
152
266
|
|
|
153
|
-
|
|
267
|
+
raise Failure(f"Max retries ({self.request_max_retries}) exceeded with url: {url}.")
|
|
154
268
|
|
|
155
|
-
|
|
156
|
-
def start_sync(self, connection_id: str) -> Mapping[str, object]:
|
|
157
|
-
raise NotImplementedError()
|
|
269
|
+
return {}
|
|
158
270
|
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
271
|
+
def _paginated_request(
|
|
272
|
+
self,
|
|
273
|
+
method: str,
|
|
274
|
+
url: str,
|
|
275
|
+
params: dict[str, Any],
|
|
276
|
+
data: Optional[Mapping[str, Any]] = None,
|
|
277
|
+
include_additional_request_params: bool = True,
|
|
278
|
+
) -> Sequence[Mapping[str, Any]]:
|
|
279
|
+
"""Execute paginated requests and yield all items."""
|
|
280
|
+
result_data = []
|
|
281
|
+
params = {"limit": self.max_items_per_page, **params}
|
|
282
|
+
while True:
|
|
283
|
+
response = self._single_request(
|
|
284
|
+
method=method,
|
|
285
|
+
url=url,
|
|
286
|
+
data=data,
|
|
287
|
+
params=params,
|
|
288
|
+
include_additional_request_headers=include_additional_request_params,
|
|
289
|
+
)
|
|
162
290
|
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
291
|
+
# Handle different response structures
|
|
292
|
+
result_data.extend(response.get("data", []))
|
|
293
|
+
next_url = response.get("next", "")
|
|
294
|
+
if not next_url:
|
|
295
|
+
break
|
|
166
296
|
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
297
|
+
# Parse the query string for the next page
|
|
298
|
+
next_params = parse_qsl(urlparse(next_url).query)
|
|
299
|
+
# Overwrite the pagination params with the ones for the next page
|
|
300
|
+
params.update(dict(next_params))
|
|
170
301
|
|
|
171
|
-
|
|
172
|
-
@abstractmethod
|
|
173
|
-
def _should_forward_logs(self) -> bool:
|
|
174
|
-
raise NotImplementedError()
|
|
302
|
+
return result_data
|
|
175
303
|
|
|
176
|
-
def
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
304
|
+
def validate_workspace_id(self) -> None:
|
|
305
|
+
"""Fetches workspace details. This is used to validate that the workspace exists."""
|
|
306
|
+
self._single_request(
|
|
307
|
+
method="GET",
|
|
308
|
+
url=f"{self.rest_api_base_url}/workspaces/{self.workspace_id}",
|
|
309
|
+
)
|
|
310
|
+
|
|
311
|
+
def get_connections(self) -> Sequence[Mapping[str, Any]]:
|
|
312
|
+
"""Fetches all connections of an Airbyte workspace from the Airbyte REST API."""
|
|
313
|
+
return self._paginated_request(
|
|
314
|
+
method="GET",
|
|
315
|
+
url=f"{self.rest_api_base_url}/connections",
|
|
316
|
+
params={"workspaceIds": self.workspace_id},
|
|
317
|
+
)
|
|
318
|
+
|
|
319
|
+
def get_jobs_for_connection(
|
|
320
|
+
self, connection_id: str, created_after: datetime | None = None
|
|
321
|
+
) -> Sequence[AirbyteJob]:
|
|
322
|
+
"""Fetches all jobs for a specific connection of an Airbyte workspace from the Airbyte REST API."""
|
|
323
|
+
params = {"workspaceIds": self.workspace_id, "connectionId": connection_id}
|
|
324
|
+
if created_after:
|
|
325
|
+
params["createdAtStart"] = created_after.strftime("%Y-%m-%dT%H:%M:%SZ")
|
|
326
|
+
|
|
327
|
+
return [
|
|
328
|
+
AirbyteJob.from_job_details(job_details=job_details)
|
|
329
|
+
for job_details in self._paginated_request(
|
|
330
|
+
method="GET",
|
|
331
|
+
url=f"{self.rest_api_base_url}/jobs",
|
|
332
|
+
params=params,
|
|
333
|
+
)
|
|
334
|
+
]
|
|
335
|
+
|
|
336
|
+
def get_connection_details(self, connection_id) -> Mapping[str, Any]:
|
|
337
|
+
"""Fetches details about a given connection from the Airbyte Configuration API.
|
|
338
|
+
The Airbyte Configuration API is an internal and may change in the future.
|
|
339
|
+
"""
|
|
340
|
+
# Using the Airbyte Configuration API to get the connection details, including streams and their configs.
|
|
341
|
+
# https://airbyte-public-api-docs.s3.us-east-2.amazonaws.com/rapidoc-api-docs.html#post-/v1/connections/get
|
|
342
|
+
# https://github.com/airbytehq/airbyte-platform/blob/v1.0.0/airbyte-api/server-api/src/main/openapi/config.yaml
|
|
343
|
+
return self._single_request(
|
|
344
|
+
method="POST",
|
|
345
|
+
url=f"{self.configuration_api_base_url}/connections/get",
|
|
346
|
+
data={"connectionId": connection_id},
|
|
347
|
+
)
|
|
348
|
+
|
|
349
|
+
def get_destination_details(self, destination_id: str) -> Mapping[str, Any]:
|
|
350
|
+
"""Fetches details about a given destination from the Airbyte REST API."""
|
|
351
|
+
return self._single_request(
|
|
352
|
+
method="GET",
|
|
353
|
+
url=f"{self.rest_api_base_url}/destinations/{destination_id}",
|
|
354
|
+
)
|
|
355
|
+
|
|
356
|
+
def start_sync_job(self, connection_id: str) -> Mapping[str, Any]:
|
|
357
|
+
return self._single_request(
|
|
358
|
+
method="POST",
|
|
359
|
+
url=f"{self.rest_api_base_url}/jobs",
|
|
360
|
+
data={
|
|
361
|
+
"connectionId": connection_id,
|
|
362
|
+
"jobType": "sync",
|
|
363
|
+
},
|
|
364
|
+
)
|
|
365
|
+
|
|
366
|
+
def get_job_details(self, job_id: int) -> Mapping[str, Any]:
|
|
367
|
+
return self._single_request(
|
|
368
|
+
method="GET",
|
|
369
|
+
url=f"{self.rest_api_base_url}/jobs/{job_id}",
|
|
370
|
+
)
|
|
371
|
+
|
|
372
|
+
def cancel_job(self, job_id: int) -> Mapping[str, Any]:
|
|
373
|
+
return self._single_request(
|
|
374
|
+
method="DELETE",
|
|
375
|
+
url=f"{self.rest_api_base_url}/jobs/{job_id}",
|
|
376
|
+
)
|
|
377
|
+
|
|
378
|
+
def sync_and_poll(self, connection_id: str) -> AirbyteOutput:
|
|
379
|
+
"""Initializes a sync operation for the given connection, and polls until it completes.
|
|
183
380
|
|
|
184
381
|
Args:
|
|
185
|
-
connection_id (str): The Airbyte
|
|
186
|
-
"Connection" tab of a given connection in the
|
|
187
|
-
poll_interval (float): The time (in seconds) that will be waited between successive polls.
|
|
188
|
-
poll_timeout (float): The maximum time that will waited before this operation is timed
|
|
189
|
-
out. By default, this will never time out.
|
|
382
|
+
connection_id (str): The Airbyte Connection ID. You can retrieve this value from the
|
|
383
|
+
"Connection" tab of a given connection in the Airbyte UI.
|
|
190
384
|
|
|
191
385
|
Returns:
|
|
192
386
|
:py:class:`~AirbyteOutput`:
|
|
193
387
|
Details of the sync job.
|
|
194
388
|
"""
|
|
195
389
|
connection_details = self.get_connection_details(connection_id)
|
|
196
|
-
job_details = self.start_sync(connection_id)
|
|
197
|
-
job_info = cast(Dict[str, object], job_details.get("job", {}))
|
|
198
|
-
job_id = cast(int, job_info.get("id"))
|
|
199
390
|
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
391
|
+
existing_jobs = [
|
|
392
|
+
job
|
|
393
|
+
for job in self.get_jobs_for_connection(
|
|
394
|
+
connection_id=connection_id,
|
|
395
|
+
created_after=datetime.now() - timedelta(days=2),
|
|
396
|
+
)
|
|
397
|
+
if job.status
|
|
398
|
+
in (
|
|
399
|
+
AirbyteJobStatusType.RUNNING,
|
|
400
|
+
AirbyteJobStatusType.PENDING,
|
|
401
|
+
AirbyteJobStatusType.INCOMPLETE,
|
|
402
|
+
)
|
|
403
|
+
]
|
|
404
|
+
|
|
405
|
+
if not existing_jobs:
|
|
406
|
+
start_job_details = self.start_sync_job(connection_id)
|
|
407
|
+
job = AirbyteJob.from_job_details(job_details=start_job_details)
|
|
408
|
+
self._log.info(f"Job {job.id} initialized for connection_id={connection_id}.")
|
|
409
|
+
else:
|
|
410
|
+
if self.poll_previous_running_sync:
|
|
411
|
+
if len(existing_jobs) == 1:
|
|
412
|
+
job = existing_jobs[0]
|
|
413
|
+
self._log.info(
|
|
414
|
+
f"Job {job.id} already running for connection_id={connection_id}. Resume polling."
|
|
415
|
+
)
|
|
416
|
+
else:
|
|
417
|
+
raise Failure(f"Found multiple running jobs for connection_id={connection_id}.")
|
|
418
|
+
else:
|
|
419
|
+
raise Failure(f"Found sync job for connection_id={connection_id} already running.")
|
|
420
|
+
|
|
421
|
+
poll_start = datetime.now()
|
|
205
422
|
|
|
206
423
|
try:
|
|
207
424
|
while True:
|
|
208
|
-
if poll_timeout and
|
|
425
|
+
if self.poll_timeout and datetime.now() > poll_start + timedelta(
|
|
426
|
+
seconds=self.poll_timeout
|
|
427
|
+
):
|
|
209
428
|
raise Failure(
|
|
210
|
-
f"Timeout: Airbyte job {
|
|
211
|
-
f" {poll_timeout} seconds"
|
|
429
|
+
f"Timeout: Airbyte job {job.id} is not ready after the timeout"
|
|
430
|
+
f" {self.poll_timeout} seconds"
|
|
212
431
|
)
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
sys.stdout.flush()
|
|
225
|
-
logged_lines = len(log_lines)
|
|
226
|
-
|
|
227
|
-
# if there's a next attempt, this one will have no more log messages
|
|
228
|
-
if logged_attempts < cur_attempt - 1:
|
|
229
|
-
logged_lines = 0
|
|
230
|
-
logged_attempts += 1
|
|
231
|
-
|
|
232
|
-
job_info = cast(Dict[str, object], job_details.get("job", {}))
|
|
233
|
-
state = job_info.get("status")
|
|
234
|
-
|
|
235
|
-
if state in (AirbyteState.RUNNING, AirbyteState.PENDING, AirbyteState.INCOMPLETE):
|
|
432
|
+
|
|
433
|
+
time.sleep(self.poll_interval)
|
|
434
|
+
# We return these job details in the AirbyteOutput when the job succeeds
|
|
435
|
+
poll_job_details = self.get_job_details(job.id)
|
|
436
|
+
self._log.debug(poll_job_details)
|
|
437
|
+
job = AirbyteJob.from_job_details(job_details=poll_job_details)
|
|
438
|
+
if job.status in (
|
|
439
|
+
AirbyteJobStatusType.RUNNING,
|
|
440
|
+
AirbyteJobStatusType.PENDING,
|
|
441
|
+
AirbyteJobStatusType.INCOMPLETE,
|
|
442
|
+
):
|
|
236
443
|
continue
|
|
237
|
-
elif
|
|
444
|
+
elif job.status == AirbyteJobStatusType.SUCCEEDED:
|
|
238
445
|
break
|
|
239
|
-
elif
|
|
240
|
-
raise Failure(f"Job failed: {
|
|
241
|
-
elif
|
|
242
|
-
raise Failure(f"Job was cancelled: {
|
|
446
|
+
elif job.status in [AirbyteJobStatusType.ERROR, AirbyteJobStatusType.FAILED]:
|
|
447
|
+
raise Failure(f"Job failed: {job.id}")
|
|
448
|
+
elif job.status == AirbyteJobStatusType.CANCELLED:
|
|
449
|
+
raise Failure(f"Job was cancelled: {job.id}")
|
|
243
450
|
else:
|
|
244
|
-
raise Failure(
|
|
451
|
+
raise Failure(
|
|
452
|
+
f"Encountered unexpected state `{job.status}` for job_id {job.id}"
|
|
453
|
+
)
|
|
245
454
|
finally:
|
|
246
455
|
# if Airbyte sync has not completed, make sure to cancel it so that it doesn't outlive
|
|
247
456
|
# the python process
|
|
248
|
-
if (
|
|
249
|
-
|
|
250
|
-
|
|
457
|
+
if self.cancel_on_termination and job.status not in (
|
|
458
|
+
AirbyteJobStatusType.SUCCEEDED,
|
|
459
|
+
AirbyteJobStatusType.ERROR,
|
|
460
|
+
AirbyteJobStatusType.CANCELLED,
|
|
461
|
+
AirbyteJobStatusType.FAILED,
|
|
251
462
|
):
|
|
252
|
-
self.cancel_job(
|
|
463
|
+
self.cancel_job(job.id)
|
|
253
464
|
|
|
254
|
-
return AirbyteOutput(job_details=
|
|
465
|
+
return AirbyteOutput(job_details=poll_job_details, connection_details=connection_details)
|
|
255
466
|
|
|
256
467
|
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
468
|
+
@beta
|
|
469
|
+
class BaseAirbyteWorkspace(ConfigurableResource):
|
|
470
|
+
"""This class represents a Airbyte workspace and provides utilities
|
|
471
|
+
to interact with Airbyte APIs.
|
|
472
|
+
"""
|
|
260
473
|
|
|
261
|
-
|
|
474
|
+
request_max_retries: int = Field(
|
|
475
|
+
default=3,
|
|
476
|
+
description=(
|
|
477
|
+
"The maximum number of times requests to the Airbyte API should be retried "
|
|
478
|
+
"before failing."
|
|
479
|
+
),
|
|
480
|
+
)
|
|
481
|
+
request_retry_delay: float = Field(
|
|
482
|
+
default=0.25,
|
|
483
|
+
description="Time (in seconds) to wait between each request retry.",
|
|
484
|
+
)
|
|
485
|
+
request_timeout: int = Field(
|
|
486
|
+
default=15,
|
|
487
|
+
description="Time (in seconds) after which the requests to Airbyte are declared timed out.",
|
|
488
|
+
)
|
|
489
|
+
max_items_per_page: int = Field(
|
|
490
|
+
default=100,
|
|
491
|
+
description=(
|
|
492
|
+
"The maximum number of items per page. "
|
|
493
|
+
"Used for paginated resources like connections, destinations, etc. "
|
|
494
|
+
),
|
|
495
|
+
)
|
|
496
|
+
poll_interval: float = Field(
|
|
497
|
+
default=DEFAULT_POLL_INTERVAL_SECONDS,
|
|
498
|
+
description="The time (in seconds) that will be waited between successive polls.",
|
|
499
|
+
)
|
|
500
|
+
poll_timeout: Optional[float] = Field(
|
|
501
|
+
default=None,
|
|
502
|
+
description=(
|
|
503
|
+
"The maximum time that will wait before this operation is timed "
|
|
504
|
+
"out. By default, this will never time out."
|
|
505
|
+
),
|
|
506
|
+
)
|
|
507
|
+
cancel_on_termination: bool = Field(
|
|
508
|
+
default=True,
|
|
509
|
+
description=(
|
|
510
|
+
"Whether to cancel a sync in Airbyte if the Dagster runner is terminated. "
|
|
511
|
+
"This may be useful to disable if using Airbyte sources that cannot be cancelled and "
|
|
512
|
+
"resumed easily, or if your Dagster deployment may experience runner interruptions "
|
|
513
|
+
"that do not impact your Airbyte deployment."
|
|
514
|
+
),
|
|
515
|
+
)
|
|
516
|
+
poll_previous_running_sync: bool = Field(
|
|
517
|
+
default=False,
|
|
518
|
+
description=(
|
|
519
|
+
"If set to True, Dagster will check for previous running sync for the same connection "
|
|
520
|
+
"and begin polling it instead of starting a new sync."
|
|
521
|
+
),
|
|
522
|
+
)
|
|
262
523
|
|
|
263
|
-
|
|
524
|
+
_client: AirbyteClient = PrivateAttr(default=None) # type: ignore
|
|
264
525
|
|
|
265
|
-
|
|
266
|
-
|
|
526
|
+
@cached_method
|
|
527
|
+
def fetch_airbyte_workspace_data(
|
|
528
|
+
self,
|
|
529
|
+
) -> AirbyteWorkspaceData:
|
|
530
|
+
"""Retrieves all Airbyte content from the workspace and returns it as a AirbyteWorkspaceData object.
|
|
267
531
|
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
532
|
+
Returns:
|
|
533
|
+
AirbyteWorkspaceData: A snapshot of the Airbyte workspace's content.
|
|
534
|
+
"""
|
|
535
|
+
connections_by_id = {}
|
|
536
|
+
destinations_by_id = {}
|
|
272
537
|
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
538
|
+
client = self.get_client()
|
|
539
|
+
|
|
540
|
+
client.validate_workspace_id()
|
|
541
|
+
|
|
542
|
+
connections = client.get_connections()
|
|
543
|
+
|
|
544
|
+
for partial_connection_details in connections:
|
|
545
|
+
full_connection_details = client.get_connection_details(
|
|
546
|
+
connection_id=partial_connection_details["connectionId"]
|
|
547
|
+
)
|
|
548
|
+
connection = AirbyteConnection.from_connection_details(
|
|
549
|
+
connection_details=full_connection_details
|
|
550
|
+
)
|
|
551
|
+
connections_by_id[connection.id] = connection
|
|
552
|
+
|
|
553
|
+
destination_details = client.get_destination_details(
|
|
554
|
+
destination_id=connection.destination_id
|
|
555
|
+
)
|
|
556
|
+
destination = AirbyteDestination.from_destination_details(
|
|
557
|
+
destination_details=destination_details
|
|
558
|
+
)
|
|
559
|
+
destinations_by_id[destination.id] = destination
|
|
277
560
|
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
561
|
+
return AirbyteWorkspaceData(
|
|
562
|
+
connections_by_id=connections_by_id,
|
|
563
|
+
destinations_by_id=destinations_by_id,
|
|
281
564
|
)
|
|
282
|
-
"""
|
|
283
565
|
|
|
284
|
-
|
|
285
|
-
|
|
566
|
+
@cached_method
def load_asset_specs(
    self,
    dagster_airbyte_translator: Optional[DagsterAirbyteTranslator] = None,
    connection_selector_fn: Optional[Callable[[AirbyteConnection], bool]] = None,
) -> Sequence[AssetSpec]:
    """Build the AssetSpecs that describe the Airbyte content of this workspace.

    Convenience wrapper that forwards to ``load_airbyte_asset_specs`` with this
    workspace passed as the ``workspace`` argument.

    Args:
        dagster_airbyte_translator (Optional[DagsterAirbyteTranslator]): The translator used
            to convert Airbyte content into :py:class:`dagster.AssetSpec`. A new
            :py:class:`DagsterAirbyteTranslator` is created when none is given.
        connection_selector_fn (Optional[Callable[[AirbyteConnection], bool]]): Optional
            predicate used to filter which Airbyte connections assets are created for.

    Returns:
        Sequence[AssetSpec]: The asset specs representing the Airbyte content in the
        workspace.

    Examples:
        Loading the asset specs for a given Airbyte workspace:

        .. code-block:: python

            from dagster_airbyte import AirbyteWorkspace

            import dagster as dg

            airbyte_workspace = AirbyteWorkspace(
                rest_api_base_url=dg.EnvVar("AIRBYTE_REST_API_BASE_URL"),
                configuration_api_base_url=dg.EnvVar("AIRBYTE_CONFIGURATION_API_BASE_URL"),
                workspace_id=dg.EnvVar("AIRBYTE_WORKSPACE_ID"),
                client_id=dg.EnvVar("AIRBYTE_CLIENT_ID"),
                client_secret=dg.EnvVar("AIRBYTE_CLIENT_SECRET"),
            )

            airbyte_specs = airbyte_workspace.load_asset_specs()
            dg.Definitions(assets=airbyte_specs, resources={"airbyte": airbyte_workspace})
    """
    # Resolve the default translator here so the callee always receives a concrete instance.
    translator = dagster_airbyte_translator or DagsterAirbyteTranslator()

    return load_airbyte_asset_specs(
        workspace=self,
        dagster_airbyte_translator=translator,
        connection_selector_fn=connection_selector_fn,
    )
|
|
326
608
|
|
|
327
|
-
def
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
},
|
|
335
|
-
)
|
|
609
|
+
def _generate_materialization(
    self,
    airbyte_output: AirbyteOutput,
    dagster_airbyte_translator: DagsterAirbyteTranslator,
):
    """Yield an ``AssetMaterialization`` for every selected stream of a synced connection.

    Args:
        airbyte_output (AirbyteOutput): Output of a sync; only its ``connection_details``
            are read here, to rebuild the ``AirbyteConnection``.
        dagster_airbyte_translator (DagsterAirbyteTranslator): Translator used to derive
            the asset key and metadata of each stream's table.

    Yields:
        AssetMaterialization: One event per stream whose ``selected`` flag is truthy.
    """
    connection = AirbyteConnection.from_connection_details(
        connection_details=airbyte_output.connection_details
    )
    stream_prefix = connection.stream_prefix

    for stream in connection.streams.values():
        # Streams that are not selected produce no materialization.
        if not stream.selected:
            continue

        table_name = get_airbyte_connection_table_name(
            stream_prefix=stream_prefix,
            stream_name=stream.name,
        )
        # Destination details are not known at this point, so they are passed as None.
        table_props = AirbyteConnectionTableProps(
            table_name=table_name,
            stream_prefix=stream_prefix,
            stream_name=stream.name,
            json_schema=stream.json_schema,
            connection_id=connection.id,
            connection_name=connection.name,
            destination_type=None,
            database=None,
            schema=None,
        )
        spec = dagster_airbyte_translator.get_asset_spec(props=table_props)

        description = (
            f"Table generated via Airbyte sync "
            f"for connection {connection.name}: {table_name}"
        )
        yield AssetMaterialization(
            asset_key=spec.key,
            description=description,
            metadata=spec.metadata,
        )
|
|
345
646
|
|
|
346
|
-
|
|
347
|
-
|
|
647
|
+
@public
@beta
def sync_and_poll(self, context: AssetExecutionContext):
    """Run an Airbyte sync, poll it, and emit events for the resulting tables.

    This method can only be used in the context of an asset execution.

    Args:
        context (AssetExecutionContext): The execution context
            from within `@airbyte_assets`.

    Returns:
        Iterator[Union[AssetMaterialization, MaterializeResult]]: An iterator of
        MaterializeResult or AssetMaterialization.
    """
    assets_def = context.assets_def
    translator = get_translator_from_airbyte_assets(assets_def)
    # The connection id is taken from the first spec of the assets definition.
    connection_id = next(
        check.not_none(AirbyteMetadataSet.extract(spec.metadata).connection_id)
        for spec in assets_def.specs
    )

    airbyte_output = self.get_client().sync_and_poll(connection_id=connection_id)

    selected_keys = context.selected_asset_keys
    materialized_asset_keys = set()
    materializations = self._generate_materialization(
        airbyte_output=airbyte_output, dagster_airbyte_translator=translator
    )
    for materialization in materializations:
        asset_key = materialization.asset_key
        # Tables that were not part of the selection are still reported, but as plain
        # runtime AssetMaterialization events together with a warning.
        if asset_key not in selected_keys:
            context.log.warning(
                f"An unexpected asset was materialized: {materialization.asset_key}. "
                f"Yielding a materialization event."
            )
            yield materialization
            continue

        # Expected table: report it as a MaterializeResult and remember its key.
        yield MaterializeResult(asset_key=asset_key, metadata=materialization.metadata)
        materialized_asset_keys.add(asset_key)

    # Selected assets for which the sync produced no table are only logged.
    unmaterialized_asset_keys = selected_keys - materialized_asset_keys
    if unmaterialized_asset_keys:
        context.log.warning(f"Assets were not materialized: {unmaterialized_asset_keys}")
|
|
379
694
|
|
|
380
|
-
|
|
695
|
+
@contextmanager
def process_config_and_initialize_cm_cached(self) -> Iterator["AirbyteWorkspace"]:
    """Yield an initialized copy of this workspace, reusing the one built on first use.

    The first call enters ``process_config_and_initialize_cm`` and stores the resulting
    workspace on ``self._initialized``; subsequent calls yield that stored object.
    """
    # Hack to avoid reconstructing initialized copies of this resource, which invalidates
    # @cached_method caches. This means that multiple calls to load_airbyte_asset_specs
    # will not trigger multiple API calls to fetch the workspace data.
    # Bespoke impl since @cached_method doesn't play nice with iterators; it's exhausted
    # after the first call.
    if not hasattr(self, "_initialized"):
        # NOTE(review): the flattened source does not show whether the yield sits inside
        # the `with`; it is kept inside here - confirm against the packaged file.
        with self.process_config_and_initialize_cm() as initialized_workspace:
            setattr(self, "_initialized", initialized_workspace)
            yield initialized_workspace
        return

    yield getattr(self, "_initialized")
|
|
709
|
+
|
|
710
|
+
|
|
711
|
+
@beta
class AirbyteWorkspace(BaseAirbyteWorkspace):
    """Resource for programmatically interfacing with the Airbyte REST API to launch syncs
    and monitor their progress for a given Airbyte workspace.

    Authentication is optional: pass ``client_id``/``client_secret`` for OAuth client
    credentials, ``username``/``password`` for basic authentication, or neither.

    **Examples:**

    Using OAuth client credentials:

    .. code-block:: python

        import dagster as dg
        from dagster_airbyte import AirbyteWorkspace, build_airbyte_assets_definitions

        airbyte_workspace = AirbyteWorkspace(
            rest_api_base_url=dg.EnvVar("AIRBYTE_REST_API_BASE_URL"),
            configuration_api_base_url=dg.EnvVar("AIRBYTE_CONFIGURATION_API_BASE_URL"),
            workspace_id=dg.EnvVar("AIRBYTE_WORKSPACE_ID"),
            client_id=dg.EnvVar("AIRBYTE_CLIENT_ID"),
            client_secret=dg.EnvVar("AIRBYTE_CLIENT_SECRET"),
        )

        all_airbyte_assets = build_airbyte_assets_definitions(workspace=airbyte_workspace)

        defs = dg.Definitions(
            assets=all_airbyte_assets,
            resources={"airbyte": airbyte_workspace},
        )

    Using basic authentication:

    .. code-block:: python

        import dagster as dg
        from dagster_airbyte import AirbyteWorkspace, build_airbyte_assets_definitions

        airbyte_workspace = AirbyteWorkspace(
            rest_api_base_url=dg.EnvVar("AIRBYTE_REST_API_BASE_URL"),
            configuration_api_base_url=dg.EnvVar("AIRBYTE_CONFIGURATION_API_BASE_URL"),
            workspace_id=dg.EnvVar("AIRBYTE_WORKSPACE_ID"),
            username=dg.EnvVar("AIRBYTE_USERNAME"),
            password=dg.EnvVar("AIRBYTE_PASSWORD"),
        )

        all_airbyte_assets = build_airbyte_assets_definitions(workspace=airbyte_workspace)

        defs = dg.Definitions(
            assets=all_airbyte_assets,
            resources={"airbyte": airbyte_workspace},
        )

    Using no authentication:

    .. code-block:: python

        import dagster as dg
        from dagster_airbyte import AirbyteWorkspace, build_airbyte_assets_definitions

        airbyte_workspace = AirbyteWorkspace(
            rest_api_base_url=dg.EnvVar("AIRBYTE_REST_API_BASE_URL"),
            configuration_api_base_url=dg.EnvVar("AIRBYTE_CONFIGURATION_API_BASE_URL"),
            workspace_id=dg.EnvVar("AIRBYTE_WORKSPACE_ID"),
        )

        all_airbyte_assets = build_airbyte_assets_definitions(workspace=airbyte_workspace)

        defs = dg.Definitions(
            assets=all_airbyte_assets,
            resources={"airbyte": airbyte_workspace},
        )
    """

    # Required connection settings.
    rest_api_base_url: str = Field(
        ...,
        description="The base URL for the Airbyte REST API.",
        examples=[
            "http://localhost:8000/api/public/v1",
            "https://my-airbyte-server.com/api/public/v1",
            "http://airbyte-airbyte-server-svc.airbyte.svc.cluster.local:8001/api/public/v1",
        ],
    )
    configuration_api_base_url: str = Field(
        ...,
        description="The base URL for the Airbyte Configuration API.",
        examples=[
            "http://localhost:8000/api/v1",
            "https://my-airbyte-server.com/api/v1",
            "http://airbyte-airbyte-server-svc.airbyte.svc.cluster.local:8001/api/v1",
        ],
    )
    workspace_id: str = Field(..., description="The Airbyte workspace ID")

    # Optional credentials; every one of them defaults to None.
    client_id: Optional[str] = Field(default=None, description="The Airbyte client ID.")
    client_secret: Optional[str] = Field(default=None, description="The Airbyte client secret.")
    username: Optional[str] = Field(
        default=None, description="The Airbyte username for authentication."
    )
    password: Optional[str] = Field(
        default=None, description="The Airbyte password for authentication."
    )

    @cached_method
    def get_client(self) -> AirbyteClient:
        """Return an ``AirbyteClient`` configured from this resource's fields."""
        return AirbyteClient(
            # Endpoints and workspace.
            rest_api_base_url=self.rest_api_base_url,
            configuration_api_base_url=self.configuration_api_base_url,
            workspace_id=self.workspace_id,
            # Credentials (any of these may be None).
            client_id=self.client_id,
            client_secret=self.client_secret,
            username=self.username,
            password=self.password,
            # Request settings.
            request_max_retries=self.request_max_retries,
            request_retry_delay=self.request_retry_delay,
            request_timeout=self.request_timeout,
            max_items_per_page=self.max_items_per_page,
            # Sync polling settings.
            poll_interval=self.poll_interval,
            poll_timeout=self.poll_timeout,
            cancel_on_termination=self.cancel_on_termination,
            poll_previous_running_sync=self.poll_previous_running_sync,
        )
|
|
435
829
|
|
|
436
|
-
@property
|
|
437
|
-
@cached_method
|
|
438
|
-
def _log(self) -> logging.Logger:
|
|
439
|
-
return get_dagster_logger()
|
|
440
830
|
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
+ (f"{self.host}:{self.port}" if self.port else self.host)
|
|
446
|
-
+ "/api/v1"
|
|
447
|
-
)
|
|
831
|
+
@beta
class AirbyteCloudWorkspace(BaseAirbyteWorkspace):
    """Resource for programmatically interfacing with the Airbyte Cloud REST API to launch
    syncs and monitor their progress for a given Airbyte Cloud workspace.

    The API base URLs are fixed class-level constants; only the workspace ID and the
    client credentials are configured per instance.

    **Examples:**

    .. code-block:: python

        from dagster_airbyte import AirbyteCloudWorkspace, build_airbyte_assets_definitions

        import dagster as dg

        airbyte_workspace = AirbyteCloudWorkspace(
            workspace_id=dg.EnvVar("AIRBYTE_CLOUD_WORKSPACE_ID"),
            client_id=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_ID"),
            client_secret=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_SECRET"),
        )

        all_airbyte_assets = build_airbyte_assets_definitions(workspace=airbyte_workspace)

        defs = dg.Definitions(
            assets=all_airbyte_assets,
            resources={"airbyte": airbyte_workspace},
        )
    """

    # Class-level constants rather than configurable fields.
    rest_api_base_url: ClassVar[str] = AIRBYTE_CLOUD_REST_API_BASE_URL
    configuration_api_base_url: ClassVar[str] = AIRBYTE_CLOUD_CONFIGURATION_API_BASE_URL

    # Required per-instance configuration.
    workspace_id: str = Field(..., description="The Airbyte workspace ID")
    client_id: str = Field(..., description="The Airbyte client ID.")
    client_secret: str = Field(..., description="The Airbyte client secret.")

    @cached_method
    def get_client(self) -> AirbyteClient:
        """Return an ``AirbyteClient`` configured from this resource's fields."""
        return AirbyteClient(
            # Endpoints and workspace.
            rest_api_base_url=self.rest_api_base_url,
            configuration_api_base_url=self.configuration_api_base_url,
            workspace_id=self.workspace_id,
            # Client credentials.
            client_id=self.client_id,
            client_secret=self.client_secret,
            # Request settings.
            request_max_retries=self.request_max_retries,
            request_retry_delay=self.request_retry_delay,
            request_timeout=self.request_timeout,
            max_items_per_page=self.max_items_per_page,
            # Sync polling settings.
            poll_interval=self.poll_interval,
            poll_timeout=self.poll_timeout,
            cancel_on_termination=self.cancel_on_termination,
            poll_previous_running_sync=self.poll_previous_running_sync,
        )
|
|
497
881
|
|
|
498
|
-
Returns:
|
|
499
|
-
Optional[Dict[str, Any]]: Parsed json data from the response to this request
|
|
500
|
-
"""
|
|
501
|
-
url = self.api_base_url + endpoint
|
|
502
|
-
headers = {"accept": "application/json"}
|
|
503
882
|
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
headers=headers,
|
|
513
|
-
json=data,
|
|
514
|
-
timeout=self.request_timeout,
|
|
515
|
-
auth=(
|
|
516
|
-
(self.username, self.password)
|
|
517
|
-
if self.username and self.password
|
|
518
|
-
else None
|
|
519
|
-
),
|
|
520
|
-
),
|
|
521
|
-
self.request_additional_params,
|
|
522
|
-
),
|
|
523
|
-
)
|
|
524
|
-
response.raise_for_status()
|
|
525
|
-
if response.status_code == 204:
|
|
526
|
-
return None
|
|
527
|
-
return response.json()
|
|
528
|
-
except RequestException as e:
|
|
529
|
-
self._log.error("Request to Airbyte API failed: %s", e)
|
|
530
|
-
if num_retries == self.request_max_retries:
|
|
531
|
-
break
|
|
532
|
-
num_retries += 1
|
|
533
|
-
time.sleep(self.request_retry_delay)
|
|
883
|
+
@public
@beta
def load_airbyte_asset_specs(
    workspace: BaseAirbyteWorkspace,
    dagster_airbyte_translator: Optional[DagsterAirbyteTranslator] = None,
    connection_selector_fn: Optional[Callable[[AirbyteConnection], bool]] = None,
) -> Sequence[AssetSpec]:
    """Build the AssetSpecs that describe the Airbyte content of a workspace.

    Args:
        workspace (BaseAirbyteWorkspace): The Airbyte workspace to fetch assets from.
        dagster_airbyte_translator (Optional[DagsterAirbyteTranslator]): The translator used
            to convert Airbyte content into :py:class:`dagster.AssetSpec`. A new
            :py:class:`DagsterAirbyteTranslator` is created when none is given.
        connection_selector_fn (Optional[Callable[[AirbyteConnection], bool]]): Optional
            predicate used to filter which Airbyte connections assets are created for.

    Returns:
        Sequence[AssetSpec]: The asset specs representing the Airbyte content in the
        workspace. Each spec also carries the translator in its metadata.

    Examples:
        Loading the asset specs for a given Airbyte workspace:

        .. code-block:: python

            from dagster_airbyte import AirbyteWorkspace, load_airbyte_asset_specs

            import dagster as dg

            airbyte_workspace = AirbyteWorkspace(
                rest_api_base_url=dg.EnvVar("AIRBYTE_REST_API_BASE_URL"),
                configuration_api_base_url=dg.EnvVar("AIRBYTE_CONFIGURATION_API_BASE_URL"),
                workspace_id=dg.EnvVar("AIRBYTE_WORKSPACE_ID"),
                client_id=dg.EnvVar("AIRBYTE_CLIENT_ID"),
                client_secret=dg.EnvVar("AIRBYTE_CLIENT_SECRET"),
            )

            airbyte_specs = load_airbyte_asset_specs(airbyte_workspace)
            dg.Definitions(assets=airbyte_specs)

        Filter connections by name:

        .. code-block:: python

            from dagster_airbyte import AirbyteWorkspace, load_airbyte_asset_specs

            import dagster as dg

            airbyte_workspace = AirbyteWorkspace(
                rest_api_base_url=dg.EnvVar("AIRBYTE_REST_API_BASE_URL"),
                configuration_api_base_url=dg.EnvVar("AIRBYTE_CONFIGURATION_API_BASE_URL"),
                workspace_id=dg.EnvVar("AIRBYTE_WORKSPACE_ID"),
                client_id=dg.EnvVar("AIRBYTE_CLIENT_ID"),
                client_secret=dg.EnvVar("AIRBYTE_CLIENT_SECRET"),
            )

            airbyte_specs = load_airbyte_asset_specs(
                workspace=airbyte_workspace,
                connection_selector_fn=lambda connection: connection.name in ["connection1", "connection2"]
            )
            dg.Definitions(assets=airbyte_specs)
    """
    translator = dagster_airbyte_translator or DagsterAirbyteTranslator()

    with workspace.process_config_and_initialize_cm_cached() as initialized_workspace:
        loader = AirbyteWorkspaceDefsLoader(
            workspace=initialized_workspace,
            translator=translator,
            connection_selector_fn=connection_selector_fn,
        )
        # Every loaded asset must be an AssetSpec; check.is_list enforces that.
        loaded_specs = check.is_list(loader.build_defs().assets, AssetSpec)

        # Attach the translator to each spec's metadata under the dedicated key.
        return [
            spec.merge_attributes(metadata={DAGSTER_AIRBYTE_TRANSLATOR_METADATA_KEY: translator})
            for spec in loaded_specs
        ]
|
|
653
959
|
|
|
654
|
-
return check.not_none(job)
|
|
655
960
|
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
961
|
+
@public
@superseded(additional_warn_text="Use load_airbyte_asset_specs instead.")
def load_airbyte_cloud_asset_specs(
    workspace: AirbyteCloudWorkspace,
    dagster_airbyte_translator: Optional[DagsterAirbyteTranslator] = None,
    connection_selector_fn: Optional[Callable[[AirbyteConnection], bool]] = None,
) -> Sequence[AssetSpec]:
    """Build the AssetSpecs that describe the Airbyte content of a Cloud workspace.

    Superseded by ``load_airbyte_asset_specs``, to which this function forwards all of
    its arguments unchanged.

    Args:
        workspace (AirbyteCloudWorkspace): The Airbyte Cloud workspace to fetch assets from.
        dagster_airbyte_translator (Optional[DagsterAirbyteTranslator]): The translator used
            to convert Airbyte content into :py:class:`dagster.AssetSpec`.
            Defaults to :py:class:`DagsterAirbyteTranslator`.
        connection_selector_fn (Optional[Callable[[AirbyteConnection], bool]]): Optional
            predicate used to filter which Airbyte connections assets are created for.

    Returns:
        Sequence[AssetSpec]: The asset specs representing the Airbyte content in the
        workspace.

    Examples:
        Loading the asset specs for a given Airbyte Cloud workspace:

        .. code-block:: python

            from dagster_airbyte import AirbyteCloudWorkspace, load_airbyte_cloud_asset_specs

            import dagster as dg

            airbyte_cloud_workspace = AirbyteCloudWorkspace(
                workspace_id=dg.EnvVar("AIRBYTE_CLOUD_WORKSPACE_ID"),
                client_id=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_ID"),
                client_secret=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_SECRET"),
            )

            airbyte_cloud_specs = load_airbyte_cloud_asset_specs(airbyte_cloud_workspace)
            dg.Definitions(assets=airbyte_cloud_specs)

        Filter connections by name:

        .. code-block:: python

            from dagster_airbyte import AirbyteCloudWorkspace, load_airbyte_cloud_asset_specs

            import dagster as dg

            airbyte_cloud_workspace = AirbyteCloudWorkspace(
                workspace_id=dg.EnvVar("AIRBYTE_CLOUD_WORKSPACE_ID"),
                client_id=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_ID"),
                client_secret=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_SECRET"),
            )

            airbyte_cloud_specs = load_airbyte_cloud_asset_specs(
                workspace=airbyte_cloud_workspace,
                connection_selector_fn=lambda connection: connection.name in ["connection1", "connection2"]
            )
            dg.Definitions(assets=airbyte_cloud_specs)
    """
    asset_specs = load_airbyte_asset_specs(
        workspace=workspace,
        dagster_airbyte_translator=dagster_airbyte_translator,
        connection_selector_fn=connection_selector_fn,
    )
    return asset_specs
|
|
766
1024
|
|
|
767
|
-
my_airbyte_resource = airbyte_resource.configured(
|
|
768
|
-
{
|
|
769
|
-
"host": {"env": "AIRBYTE_HOST"},
|
|
770
|
-
"port": {"env": "AIRBYTE_PORT"},
|
|
771
|
-
# If using basic auth
|
|
772
|
-
"username": {"env": "AIRBYTE_USERNAME"},
|
|
773
|
-
"password": {"env": "AIRBYTE_PASSWORD"},
|
|
774
|
-
}
|
|
775
|
-
)
|
|
776
1025
|
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
1026
|
+
@record
class AirbyteWorkspaceDefsLoader(StateBackedDefinitionsLoader[AirbyteWorkspaceData]):
    """State-backed loader that turns Airbyte workspace data into Dagster definitions."""

    # Workspace resource the state is fetched from.
    workspace: Union[AirbyteWorkspace, AirbyteCloudWorkspace]
    # Translator that maps connection-table props to asset specs.
    translator: DagsterAirbyteTranslator
    # Optional predicate restricting which connections produce assets.
    connection_selector_fn: Optional[Callable[[AirbyteConnection], bool]]

    @property
    def defs_key(self) -> str:
        """Key for this loader: the reconstruction-metadata prefix plus the workspace ID."""
        workspace_id = self.workspace.workspace_id
        return f"{AIRBYTE_RECONSTRUCTION_METADATA_KEY_PREFIX}.{workspace_id}"

    def fetch_state(self) -> AirbyteWorkspaceData:
        """Fetch the workspace data through the workspace resource."""
        return self.workspace.fetch_airbyte_workspace_data()

    def defs_from_state(self, state: AirbyteWorkspaceData) -> Definitions:
        """Translate the fetched workspace data into ``Definitions`` of asset specs.

        When ``connection_selector_fn`` is set, only tables whose connection satisfies it
        are translated; otherwise every table is.
        """
        selector = self.connection_selector_fn
        selected_specs = []
        for props in state.to_airbyte_connection_table_props_data():
            # Drop tables whose connection is rejected by the (optional) selector.
            if selector and not selector(state.connections_by_id[props.connection_id]):
                continue
            selected_specs.append(self.translator.get_asset_spec(props))

        return Definitions(assets=selected_specs)
|