dagster-airbyte 0.24.3__py3-none-any.whl → 0.28.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dagster_airbyte/__init__.py +26 -9
- dagster_airbyte/asset_decorator.py +123 -0
- dagster_airbyte/asset_defs.py +334 -202
- dagster_airbyte/components/__init__.py +0 -0
- dagster_airbyte/components/workspace_component/__init__.py +0 -0
- dagster_airbyte/components/workspace_component/component.py +433 -0
- dagster_airbyte/components/workspace_component/scaffolder.py +30 -0
- dagster_airbyte/legacy_resources.py +826 -0
- dagster_airbyte/managed/__init__.py +2 -2
- dagster_airbyte/managed/generated/__init__.py +1 -1
- dagster_airbyte/managed/generated/sources.py +35 -35
- dagster_airbyte/managed/reconciliation.py +34 -44
- dagster_airbyte/managed/types.py +8 -7
- dagster_airbyte/ops.py +5 -4
- dagster_airbyte/resources.py +855 -601
- dagster_airbyte/translator.py +255 -0
- dagster_airbyte/types.py +8 -3
- dagster_airbyte/utils.py +36 -2
- dagster_airbyte/version.py +1 -1
- {dagster_airbyte-0.24.3.dist-info → dagster_airbyte-0.28.3.dist-info}/METADATA +19 -10
- dagster_airbyte-0.28.3.dist-info/RECORD +28 -0
- {dagster_airbyte-0.24.3.dist-info → dagster_airbyte-0.28.3.dist-info}/WHEEL +1 -1
- {dagster_airbyte-0.24.3.dist-info → dagster_airbyte-0.28.3.dist-info}/entry_points.txt +3 -0
- {dagster_airbyte-0.24.3.dist-info → dagster_airbyte-0.28.3.dist-info/licenses}/LICENSE +1 -1
- dagster_airbyte-0.24.3.dist-info/RECORD +0 -21
- {dagster_airbyte-0.24.3.dist-info → dagster_airbyte-0.28.3.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,826 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import json
|
|
3
|
+
import logging
|
|
4
|
+
import sys
|
|
5
|
+
import time
|
|
6
|
+
from abc import abstractmethod
|
|
7
|
+
from collections.abc import Mapping
|
|
8
|
+
from contextlib import contextmanager
|
|
9
|
+
from datetime import datetime, timedelta
|
|
10
|
+
from typing import Any, Optional, cast
|
|
11
|
+
|
|
12
|
+
import requests
|
|
13
|
+
from dagster import (
|
|
14
|
+
ConfigurableResource,
|
|
15
|
+
Failure,
|
|
16
|
+
InitResourceContext,
|
|
17
|
+
_check as check,
|
|
18
|
+
get_dagster_logger,
|
|
19
|
+
resource,
|
|
20
|
+
)
|
|
21
|
+
from dagster._config.pythonic_config import infer_schema_from_config_class
|
|
22
|
+
from dagster._core.definitions.resource_definition import dagster_maintained_resource
|
|
23
|
+
from dagster._symbol_annotations import superseded
|
|
24
|
+
from dagster_shared.merger import deep_merge_dicts
|
|
25
|
+
from dagster_shared.utils.cached_method import cached_method
|
|
26
|
+
from pydantic import Field, PrivateAttr
|
|
27
|
+
from requests import RequestException
|
|
28
|
+
|
|
29
|
+
from dagster_airbyte.translator import AirbyteJobStatusType
|
|
30
|
+
from dagster_airbyte.types import AirbyteOutput
|
|
31
|
+
|
|
32
|
+
DEFAULT_POLL_INTERVAL_SECONDS = 10
|
|
33
|
+
|
|
34
|
+
# The access token expires every 3 minutes in Airbyte Cloud.
|
|
35
|
+
# Refresh is needed after 2.5 minutes to avoid the "token expired" error message.
|
|
36
|
+
AIRBYTE_REFRESH_TIMEDELTA_SECONDS = 150
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class AirbyteResourceState:
    """Mutable bookkeeping used for request caching.

    Holds the response cache together with a counter that records how many
    caching contexts are currently open.
    """

    def __init__(self) -> None:
        # A counter rather than a flag, in case caching contexts are nested.
        self.cache_enabled = 0
        # Starts empty; values may be ``None`` as well as parsed responses.
        self.request_cache: dict[str, Optional[Mapping[str, object]]] = {}
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class BaseAirbyteResource(ConfigurableResource):
    """Shared configuration, request and polling logic for Airbyte resources.

    Subclasses provide the API base URL, any extra keyword arguments for
    ``requests``, and the endpoint-specific calls used to start, inspect and
    cancel a sync job.
    """

    request_max_retries: int = Field(
        default=3,
        description=(
            "The maximum number of times requests to the Airbyte API should be retried "
            "before failing."
        ),
    )
    request_retry_delay: float = Field(
        default=0.25,
        description="Time (in seconds) to wait between each request retry.",
    )
    request_timeout: int = Field(
        default=15,
        description="Time (in seconds) after which the requests to Airbyte are declared timed out.",
    )
    cancel_sync_on_run_termination: bool = Field(
        default=True,
        description=(
            "Whether to cancel a sync in Airbyte if the Dagster runner is terminated. This may"
            " be useful to disable if using Airbyte sources that cannot be cancelled and"
            " resumed easily, or if your Dagster deployment may experience runner interruptions"
            " that do not impact your Airbyte deployment."
        ),
    )
    poll_interval: float = Field(
        default=DEFAULT_POLL_INTERVAL_SECONDS,
        description="Time (in seconds) to wait between checking a sync's status.",
    )

    @classmethod
    def _is_dagster_maintained(cls) -> bool:
        return True

    @property
    @cached_method
    def _log(self) -> logging.Logger:
        return get_dagster_logger()

    @property
    @abstractmethod
    def api_base_url(self) -> str:
        """Base URL that every endpoint passed to ``make_request`` is appended to."""
        raise NotImplementedError()

    @property
    @abstractmethod
    def all_additional_request_params(self) -> Mapping[str, Any]:
        """Extra keyword arguments merged into each ``requests.request`` call."""
        raise NotImplementedError()

    def make_request(
        self,
        endpoint: str,
        data: Optional[Mapping[str, object]] = None,
        method: str = "POST",
        include_additional_request_params: bool = True,
    ) -> Optional[Mapping[str, object]]:
        """Creates and sends a request to the desired Airbyte REST API endpoint.

        Args:
            endpoint (str): The Airbyte API endpoint to send this request to. It is appended
                to ``api_base_url``.
            data (Optional[Mapping[str, object]]): JSON-serializable payload for the request
                body. No body is sent when this is ``None`` or empty.
            method (str): The HTTP method to use. Defaults to ``"POST"``.
            include_additional_request_params (bool): Whether to merge
                ``all_additional_request_params`` into the arguments given to ``requests``.

        Returns:
            Optional[Mapping[str, object]]: Parsed json data from the response to this request,
            or ``None`` when the response status is 204.

        Raises:
            Failure: If the request still fails after ``request_max_retries`` retries. The last
                request error is attached as the cause.
        """
        url = self.api_base_url + endpoint
        headers = {"accept": "application/json"}

        last_error: Optional[RequestException] = None
        # One initial attempt plus up to ``request_max_retries`` retries.
        for attempt in range(self.request_max_retries + 1):
            try:
                # Rebuilt on every attempt so ``all_additional_request_params`` is
                # re-evaluated for each retry.
                request_args: dict[str, Any] = dict(
                    method=method,
                    url=url,
                    headers=headers,
                    timeout=self.request_timeout,
                )
                if data:
                    request_args["json"] = data

                if include_additional_request_params:
                    request_args = deep_merge_dicts(
                        request_args,
                        self.all_additional_request_params,
                    )

                response = requests.request(
                    **request_args,
                )
                response.raise_for_status()
                if response.status_code == 204:
                    return None
                return response.json()
            except RequestException as e:
                self._log.error("Request to Airbyte API failed: %s", e)
                last_error = e
                # No point sleeping after the final attempt.
                if attempt < self.request_max_retries:
                    time.sleep(self.request_retry_delay)

        raise Failure(
            f"Max retries ({self.request_max_retries}) exceeded with url: {url}."
        ) from last_error

    @abstractmethod
    def start_sync(self, connection_id: str) -> Mapping[str, object]:
        """Start a sync for the connection; ``sync_and_poll`` reads ``["job"]["id"]`` from the result."""
        raise NotImplementedError()

    @abstractmethod
    def get_connection_details(self, connection_id: str) -> Mapping[str, object]:
        """Return details of the connection, passed through to ``AirbyteOutput``."""
        raise NotImplementedError()

    @abstractmethod
    def get_job_status(self, connection_id: str, job_id: int) -> Mapping[str, object]:
        """Return job details; ``sync_and_poll`` reads ``["job"]["status"]`` and ``["attempts"]``."""
        raise NotImplementedError()

    @abstractmethod
    def cancel_job(self, job_id: int):
        """Cancel the given job."""
        raise NotImplementedError()

    @property
    @abstractmethod
    def _should_forward_logs(self) -> bool:
        """Whether ``sync_and_poll`` should write job log lines to stdout."""
        raise NotImplementedError()

    def sync_and_poll(
        self,
        connection_id: str,
        poll_interval: Optional[float] = None,
        poll_timeout: Optional[float] = None,
    ) -> AirbyteOutput:
        """Initializes a sync operation for the given connector, and polls until it completes.

        Args:
            connection_id (str): The Airbyte Connector ID. You can retrieve this value from the
                "Connection" tab of a given connection in the Airbyte UI.
            poll_interval (float): The time (in seconds) that will be waited between successive polls.
                Falls back to the resource's ``poll_interval`` when not given.
            poll_timeout (float): The maximum time that will be waited before this operation is
                timed out. By default, this will never time out.

        Returns:
            :py:class:`~AirbyteOutput`:
                Details of the sync job.

        Raises:
            Failure: If the poll times out, or the job ends in an error, cancelled or
                unexpected state.
        """
        connection_details = self.get_connection_details(connection_id)
        job_details = self.start_sync(connection_id)
        job_info = cast("dict[str, object]", job_details.get("job", {}))
        job_id = cast("int", job_info.get("id"))

        self._log.info(f"Job {job_id} initialized for connection_id={connection_id}.")
        start = time.monotonic()
        logged_attempts = 0
        logged_lines = 0
        state = None

        try:
            while True:
                if poll_timeout and start + poll_timeout < time.monotonic():
                    raise Failure(
                        f"Timeout: Airbyte job {job_id} is not ready after the timeout"
                        f" {poll_timeout} seconds"
                    )
                time.sleep(poll_interval or self.poll_interval)
                job_details = self.get_job_status(connection_id, job_id)
                attempts = cast("list", job_details.get("attempts", []))
                # spit out the available Airbyte log info
                if attempts and self._should_forward_logs:
                    last_attempt = len(attempts) - 1
                    # Drain every attempt up to the newest one in this poll, so the
                    # logs of later attempts are not lost if the job ends now.
                    while True:
                        log_lines = attempts[logged_attempts].get("logs", {}).get("logLines", [])
                        for line in log_lines[logged_lines:]:
                            sys.stdout.write(line + "\n")
                        logged_lines = len(log_lines)
                        if logged_attempts >= last_attempt:
                            break
                        # if there's a next attempt, this one will have no more log messages
                        logged_lines = 0
                        logged_attempts += 1
                    sys.stdout.flush()

                job_info = cast("dict[str, object]", job_details.get("job", {}))
                state = job_info.get("status")

                if state in (
                    AirbyteJobStatusType.RUNNING,
                    AirbyteJobStatusType.PENDING,
                    AirbyteJobStatusType.INCOMPLETE,
                ):
                    continue
                elif state == AirbyteJobStatusType.SUCCEEDED:
                    break
                elif state == AirbyteJobStatusType.ERROR:
                    raise Failure(f"Job failed: {job_id}")
                elif state == AirbyteJobStatusType.CANCELLED:
                    raise Failure(f"Job was cancelled: {job_id}")
                else:
                    raise Failure(f"Encountered unexpected state `{state}` for job_id {job_id}")
        finally:
            # if Airbyte sync has not completed, make sure to cancel it so that it doesn't outlive
            # the python process
            if (
                state
                not in (
                    AirbyteJobStatusType.SUCCEEDED,
                    AirbyteJobStatusType.ERROR,
                    AirbyteJobStatusType.CANCELLED,
                )
                and self.cancel_sync_on_run_termination
            ):
                self.cancel_job(job_id)

        return AirbyteOutput(job_details=job_details, connection_details=connection_details)
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
@superseded(
    additional_warn_text=(
        "If you are using Airbyte 1.6.0 or higher, please see the migration guide: https://docs.dagster.io/integrations/libraries/airbyte/migration-guide"
    )
)
class AirbyteResource(BaseAirbyteResource):
    """This resource allows users to programmatically interface with the Airbyte REST API to launch
    syncs and monitor their progress.

    **Examples:**

    .. code-block:: python

        from dagster import Definitions, EnvVar
        from dagster_airbyte import AirbyteResource, build_airbyte_assets

        my_airbyte_resource = AirbyteResource(
            host=EnvVar("AIRBYTE_HOST"),
            port=EnvVar("AIRBYTE_PORT"),
            # If using basic auth
            username=EnvVar("AIRBYTE_USERNAME"),
            password=EnvVar("AIRBYTE_PASSWORD"),
        )

        airbyte_assets = build_airbyte_assets(
            connection_id="87b7fe85-a22c-420e-8d74-b30e7ede77df",
            destination_tables=["releases", "tags", "teams"],
        )

        Definitions(
            assets=[airbyte_assets],
            resources={"airbyte": my_airbyte_resource},
        )
    """

    host: str = Field(description="The Airbyte server address.")
    port: str = Field(description="Port used for the Airbyte server.")
    username: Optional[str] = Field(default=None, description="Username if using basic auth.")
    password: Optional[str] = Field(default=None, description="Password if using basic auth.")
    use_https: bool = Field(
        default=False, description="Whether to use HTTPS to connect to the Airbyte server."
    )
    forward_logs: bool = Field(
        default=True,
        description=(
            "Whether to forward Airbyte logs to the compute log, can be expensive for"
            " long-running syncs."
        ),
    )
    request_additional_params: Mapping[str, Any] = Field(
        default=dict(),
        description=(
            "Any additional kwargs to pass to the requests library when making requests to Airbyte."
        ),
    )

    @property
    @cached_method
    def _state(self) -> AirbyteResourceState:
        return AirbyteResourceState()

    @property
    @cached_method
    def _log(self) -> logging.Logger:
        return get_dagster_logger()

    @property
    def api_base_url(self) -> str:
        """Scheme, host and (when set) port of the server, followed by ``/api/v1``."""
        return (
            ("https://" if self.use_https else "http://")
            + (f"{self.host}:{self.port}" if self.port else self.host)
            + "/api/v1"
        )

    @property
    def _should_forward_logs(self) -> bool:
        return self.forward_logs

    @contextmanager
    def cache_requests(self):
        """Context manager that enables caching certain requests to the Airbyte API,
        cleared when the context is exited.
        """
        self.clear_request_cache()
        self._state.cache_enabled += 1
        try:
            yield
        finally:
            self.clear_request_cache()
            self._state.cache_enabled -= 1

    def clear_request_cache(self) -> None:
        """Drop every cached response."""
        self._state.request_cache = {}

    def make_request_cached(self, endpoint: str, data: Optional[Mapping[str, object]]):
        """Like ``make_request``, but reuses a stored response while caching is enabled.

        Responses are keyed by a SHA-1 digest of the endpoint and the
        key-sorted JSON encoding of ``data``. Outside of a ``cache_requests``
        context this simply forwards to ``make_request``.
        """
        if not self._state.cache_enabled > 0:
            return self.make_request(endpoint, data)
        data_json = json.dumps(data, sort_keys=True)
        sha = hashlib.sha1()
        sha.update(endpoint.encode("utf-8"))
        sha.update(data_json.encode("utf-8"))
        digest = sha.hexdigest()

        if digest not in self._state.request_cache:
            self._state.request_cache[digest] = self.make_request(endpoint, data)
        return self._state.request_cache[digest]

    @property
    def all_additional_request_params(self) -> Mapping[str, Any]:
        """Basic-auth credentials (when both are set) plus ``request_additional_params``."""
        auth_param = (
            {"auth": (self.username, self.password)} if self.username and self.password else {}
        )
        return {**auth_param, **self.request_additional_params}

    def make_request(  # pyright: ignore[reportIncompatibleMethodOverride]
        self, endpoint: str, data: Optional[Mapping[str, object]]
    ) -> Optional[Mapping[str, object]]:
        """Creates and sends a POST request to the desired Airbyte REST API endpoint.

        Args:
            endpoint (str): The Airbyte API endpoint to send this request to. It is appended
                to ``api_base_url``.
            data (Optional[Mapping[str, object]]): JSON-serializable payload passed to
                ``requests`` as the ``json`` argument.

        Returns:
            Optional[Mapping[str, object]]: Parsed json data from the response to this request,
            or ``None`` when the response status is 204.

        Raises:
            Failure: If the request still fails after ``request_max_retries`` retries. The last
                request error is attached as the cause.
        """
        url = self.api_base_url + endpoint
        headers = {"accept": "application/json"}
        auth = (self.username, self.password) if self.username and self.password else None

        last_error: Optional[RequestException] = None
        # One initial attempt plus up to ``request_max_retries`` retries.
        for attempt in range(self.request_max_retries + 1):
            try:
                response = requests.request(
                    **deep_merge_dicts(  # type: ignore
                        dict(
                            method="POST",
                            url=url,
                            headers=headers,
                            json=data,
                            timeout=self.request_timeout,
                            auth=auth,
                        ),
                        self.request_additional_params,
                    ),
                )
                response.raise_for_status()
                if response.status_code == 204:
                    return None
                return response.json()
            except RequestException as e:
                self._log.error("Request to Airbyte API failed: %s", e)
                last_error = e
                # No point sleeping after the final attempt.
                if attempt < self.request_max_retries:
                    time.sleep(self.request_retry_delay)

        raise Failure(
            f"Max retries ({self.request_max_retries}) exceeded with url: {url}."
        ) from last_error

    def cancel_job(self, job_id: int):
        """Ask Airbyte to cancel the job with the given id."""
        self.make_request(endpoint="/jobs/cancel", data={"id": job_id})

    def get_default_workspace(self) -> str:
        """Return the ``workspaceId`` of the first workspace listed by the API."""
        workspaces = cast(
            "list[dict[str, Any]]",
            check.not_none(self.make_request_cached(endpoint="/workspaces/list", data={})).get(
                "workspaces", []
            ),
        )
        return workspaces[0]["workspaceId"]

    def get_source_definition_by_name(self, name: str) -> Optional[str]:
        """Return the id of the source definition whose name matches case-insensitively, else ``None``."""
        name_lower = name.lower()
        definitions = check.not_none(
            self.make_request_cached(endpoint="/source_definitions/list", data={})
        )
        source_definitions = cast("list[dict[str, Any]]", definitions["sourceDefinitions"])

        return next(
            (
                definition["sourceDefinitionId"]
                for definition in source_definitions
                if definition["name"].lower() == name_lower
            ),
            None,
        )

    def get_destination_definition_by_name(self, name: str):
        """Return the id of the destination definition whose name matches case-insensitively, else ``None``."""
        name_lower = name.lower()
        definitions = cast(
            "dict[str, list[dict[str, str]]]",
            check.not_none(
                self.make_request_cached(endpoint="/destination_definitions/list", data={})
            ),
        )
        return next(
            (
                definition["destinationDefinitionId"]
                for definition in definitions["destinationDefinitions"]
                if definition["name"].lower() == name_lower
            ),
            None,
        )

    def get_source_catalog_id(self, source_id: str):
        """Return the ``catalogId`` from the source's discover-schema response."""
        return self.get_source_schema(source_id)["catalogId"]

    def get_source_schema(self, source_id: str) -> Mapping[str, Any]:
        """Return the response of ``/sources/discover_schema`` for the given source."""
        return cast(
            "dict[str, Any]",
            check.not_none(
                self.make_request(endpoint="/sources/discover_schema", data={"sourceId": source_id})
            ),
        )

    def does_dest_support_normalization(
        self, destination_definition_id: str, workspace_id: str
    ) -> bool:
        """Return whether either API response reports normalization support for the destination."""
        # Airbyte API changed source of truth for normalization in PR
        # https://github.com/airbytehq/airbyte/pull/21005
        # Both endpoints are always queried; either one may carry the flag.
        definition_spec = cast(
            "dict[str, Any]",
            check.not_none(
                self.make_request_cached(
                    endpoint="/destination_definition_specifications/get",
                    data={
                        "destinationDefinitionId": destination_definition_id,
                        "workspaceId": workspace_id,
                    },
                )
            ),
        )
        supported_per_spec = definition_spec.get("supportsNormalization", False)

        definition = cast(
            "dict[str, Any]",
            check.not_none(
                self.make_request_cached(
                    endpoint="/destination_definitions/get",
                    data={
                        "destinationDefinitionId": destination_definition_id,
                    },
                )
            ),
        )
        supported_per_definition = definition.get("normalizationConfig", {}).get(
            "supported", False
        )

        return bool(supported_per_spec or supported_per_definition)

    def get_job_status(self, connection_id: str, job_id: int) -> Mapping[str, object]:
        """Fetch details of a job, choosing the endpoint by whether logs are forwarded."""
        if self.forward_logs:
            return check.not_none(self.make_request(endpoint="/jobs/get", data={"id": job_id}))
        else:
            # the "list all jobs" endpoint doesn't return logs, which actually makes it much more
            # lightweight for long-running syncs with many logs
            out = check.not_none(
                self.make_request(
                    endpoint="/jobs/list",
                    data={
                        "configTypes": ["sync"],
                        "configId": connection_id,
                        # sync should be the most recent, so pageSize 5 is sufficient
                        "pagination": {"pageSize": 5},
                    },
                )
            )
            job = next(
                (job for job in cast("list", out["jobs"]) if job["job"]["id"] == job_id), None
            )

            return check.not_none(job)

    def start_sync(self, connection_id: str) -> Mapping[str, object]:
        """Trigger a sync of the connection and return the API response."""
        return check.not_none(
            self.make_request(endpoint="/connections/sync", data={"connectionId": connection_id})
        )

    def get_connection_details(self, connection_id: str) -> Mapping[str, object]:
        """Return the API's description of the connection."""
        return check.not_none(
            self.make_request(endpoint="/connections/get", data={"connectionId": connection_id})
        )

    def sync_and_poll(
        self,
        connection_id: str,
        poll_interval: Optional[float] = None,
        poll_timeout: Optional[float] = None,
    ) -> AirbyteOutput:
        """Initializes a sync operation for the given connector, and polls until it completes.

        Args:
            connection_id (str): The Airbyte Connector ID. You can retrieve this value from the
                "Connection" tab of a given connection in the Airbyte UI.
            poll_interval (float): The time (in seconds) that will be waited between successive polls.
            poll_timeout (float): The maximum time that will be waited before this operation is
                timed out. By default, this will never time out.

        Returns:
            :py:class:`~AirbyteOutput`:
                Details of the sync job.
        """
        # The polling loop lives in the base class; whether logs are forwarded is
        # decided there through ``_should_forward_logs``, i.e. ``forward_logs``.
        return super().sync_and_poll(
            connection_id=connection_id,
            poll_interval=poll_interval,
            poll_timeout=poll_timeout,
        )
|
|
640
|
+
|
|
641
|
+
|
|
642
|
+
@superseded(
    additional_warn_text=(
        "If you are using Airbyte 1.6.0 or higher, please see the migration guide: https://docs.dagster.io/integrations/libraries/airbyte/migration-guide"
    )
)
class AirbyteCloudResource(BaseAirbyteResource):
    """This resource allows users to programmatically interface with the Airbyte Cloud API to launch
    syncs and monitor their progress.

    **Examples:**

    .. code-block:: python

        from dagster import Definitions, EnvVar
        from dagster_airbyte import AirbyteCloudResource, build_airbyte_assets

        my_airbyte_resource = AirbyteCloudResource(
            client_id=EnvVar("AIRBYTE_CLIENT_ID"),
            client_secret=EnvVar("AIRBYTE_CLIENT_SECRET"),
        )

        airbyte_assets = build_airbyte_assets(
            connection_id="87b7fe85-a22c-420e-8d74-b30e7ede77df",
            destination_tables=["releases", "tags", "teams"],
        )

        Definitions(
            assets=[airbyte_assets],
            resources={"airbyte": my_airbyte_resource},
        )
    """

    client_id: str = Field(..., description="The Airbyte Cloud client ID.")
    client_secret: str = Field(..., description="The Airbyte Cloud client secret.")

    # Current bearer token and the epoch time (seconds) at which it was obtained.
    _access_token_value: Optional[str] = PrivateAttr(default=None)
    _access_token_timestamp: Optional[float] = PrivateAttr(default=None)

    def setup_for_execution(self, context: InitResourceContext) -> None:
        # Refresh access token when the resource is initialized
        self._refresh_access_token()

    @property
    def api_base_url(self) -> str:
        return "https://api.airbyte.com/v1"

    @property
    def all_additional_request_params(self) -> Mapping[str, Any]:
        """Authorization and User-Agent headers, refreshing the token first if needed."""
        # Make sure the access token is refreshed before using it when calling the API.
        if self._needs_refreshed_access_token():
            self._refresh_access_token()
        return {
            "headers": {
                "Authorization": f"Bearer {self._access_token_value}",
                "User-Agent": "dagster",
            }
        }

    def make_request(
        self,
        endpoint: str,
        data: Optional[Mapping[str, object]] = None,
        method: str = "POST",
        include_additional_request_params: bool = True,
    ) -> Optional[Mapping[str, object]]:
        """Send a request through the base implementation, refreshing the token first if needed.

        The token is only refreshed when ``include_additional_request_params``
        is true, so the token request itself does not trigger another refresh.
        """
        # Make sure the access token is refreshed before using it when calling the API.
        if include_additional_request_params and self._needs_refreshed_access_token():
            self._refresh_access_token()
        return super().make_request(
            endpoint=endpoint,
            data=data,
            method=method,
            include_additional_request_params=include_additional_request_params,
        )

    def start_sync(self, connection_id: str) -> Mapping[str, object]:
        """Create a sync job and return its id and status under the ``"job"`` key."""
        job_sync = check.not_none(
            self.make_request(
                endpoint="/jobs",
                data={
                    "connectionId": connection_id,
                    "jobType": "sync",
                },
            )
        )
        return {"job": {"id": job_sync["jobId"], "status": job_sync["status"]}}

    def get_connection_details(self, connection_id: str) -> Mapping[str, object]:
        """Return an empty mapping; no connection details are fetched here."""
        return {}

    def get_job_status(self, connection_id: str, job_id: int) -> Mapping[str, object]:
        """Fetch a job and return its id and status under the ``"job"`` key."""
        job_status = check.not_none(self.make_request(endpoint=f"/jobs/{job_id}", method="GET"))
        return {"job": {"id": job_status["jobId"], "status": job_status["status"]}}

    def cancel_job(self, job_id: int):
        """Cancel the job by issuing a DELETE on its endpoint."""
        self.make_request(endpoint=f"/jobs/{job_id}", method="DELETE")

    @property
    def _should_forward_logs(self) -> bool:
        # Airbyte Cloud does not support streaming logs yet
        return False

    def _refresh_access_token(self) -> None:
        """Request a new access token and record when it was obtained."""
        response = check.not_none(
            self.make_request(
                endpoint="/applications/token",
                data={
                    "client_id": self.client_id,
                    "client_secret": self.client_secret,
                },
                # Must not pass the bearer access token when refreshing it.
                include_additional_request_params=False,
            )
        )
        self._access_token_value = str(response["access_token"])
        # Epoch seconds straight from the clock. Going through a naive local
        # datetime makes the age check wrong across DST transitions.
        self._access_token_timestamp = time.time()

    def _needs_refreshed_access_token(self) -> bool:
        """Return True when there is no token yet or it is at least the refresh interval old."""
        return (
            not self._access_token_value
            or not self._access_token_timestamp
            or self._access_token_timestamp <= time.time() - AIRBYTE_REFRESH_TIMEDELTA_SECONDS
        )
|
|
768
|
+
|
|
769
|
+
|
|
770
|
+
@superseded(
    additional_warn_text=(
        "If you are using Airbyte 1.6.0 or higher, please see the migration guide: https://docs.dagster.io/integrations/libraries/airbyte/migration-guide"
    )
)
@dagster_maintained_resource
@resource(config_schema=infer_schema_from_config_class(AirbyteCloudResource))
def airbyte_cloud_resource(context) -> AirbyteCloudResource:
    """This resource allows users to programmatically interface with the Airbyte Cloud REST API to launch
    syncs and monitor their progress. Currently, this resource may only be used with the more basic
    `dagster-airbyte` APIs, including the ops and assets.

    Args:
        context: The resource initialization context the resource is built from.

    Returns:
        AirbyteCloudResource: The resource built from the given context.
    """
    return AirbyteCloudResource.from_resource_context(context)
|
|
784
|
+
|
|
785
|
+
|
|
786
|
+
@superseded(
    additional_warn_text=(
        "If you are using Airbyte 1.6.0 or higher, please see the migration guide: https://docs.dagster.io/integrations/libraries/airbyte/migration-guide"
    )
)
@dagster_maintained_resource
@resource(config_schema=AirbyteResource.to_config_schema())
def airbyte_resource(context) -> AirbyteResource:
    """This resource allows users to programmatically interface with the Airbyte REST API to launch
    syncs and monitor their progress. This currently implements only a subset of the functionality
    exposed by the API.

    For a complete set of documentation on the Airbyte REST API, including expected response JSON
    schema, see the `Airbyte API Docs <https://airbyte-public-api-docs.s3.us-east-2.amazonaws.com/rapidoc-api-docs.html#overview>`_.

    To configure this resource, we recommend using the `configured
    <https://legacy-docs.dagster.io/concepts/configuration/configured>`_ method.

    Args:
        context: The resource initialization context the resource is built from.

    Returns:
        AirbyteResource: The resource built from the given context.

    **Examples:**

    .. code-block:: python

        from dagster import job
        from dagster_airbyte import airbyte_resource

        my_airbyte_resource = airbyte_resource.configured(
            {
                "host": {"env": "AIRBYTE_HOST"},
                "port": {"env": "AIRBYTE_PORT"},
                # If using basic auth
                "username": {"env": "AIRBYTE_USERNAME"},
                "password": {"env": "AIRBYTE_PASSWORD"},
            }
        )

        @job(resource_defs={"airbyte": my_airbyte_resource})
        def my_airbyte_job():
            ...

    """
    return AirbyteResource.from_resource_context(context)
|