dagster-airbyte 0.24.3__py3-none-any.whl → 0.28.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,88 +1,173 @@
- import hashlib
- import json
  import logging
- import sys
  import time
- from abc import abstractmethod
+ from collections.abc import Callable, Iterator, Mapping, Sequence
  from contextlib import contextmanager
  from datetime import datetime, timedelta
- from typing import Any, Dict, List, Mapping, Optional, cast
+ from typing import Any, ClassVar, Optional, Union
+ from urllib.parse import parse_qsl, urlparse

  import requests
  from dagster import (
+     AssetExecutionContext,
+     AssetMaterialization,
+     AssetSpec,
      ConfigurableResource,
+     Definitions,
      Failure,
-     InitResourceContext,
+     MaterializeResult,
      _check as check,
      get_dagster_logger,
-     resource,
  )
- from dagster._config.pythonic_config import infer_schema_from_config_class
- from dagster._core.definitions.resource_definition import dagster_maintained_resource
- from dagster._utils.cached_method import cached_method
- from dagster._utils.merger import deep_merge_dicts
- from pydantic import Field, PrivateAttr
+ from dagster._annotations import superseded
+ from dagster._core.definitions.definitions_load_context import StateBackedDefinitionsLoader
+ from dagster._symbol_annotations import beta, public
+ from dagster_shared.dagster_model import DagsterModel
+ from dagster_shared.record import record
+ from dagster_shared.utils.cached_method import cached_method
+ from pydantic import Field, PrivateAttr, model_validator
  from requests.exceptions import RequestException

+ from dagster_airbyte.translator import (
+     AirbyteConnection,
+     AirbyteConnectionTableProps,
+     AirbyteDestination,
+     AirbyteJob,
+     AirbyteJobStatusType,
+     AirbyteMetadataSet,
+     AirbyteWorkspaceData,
+     DagsterAirbyteTranslator,
+ )
  from dagster_airbyte.types import AirbyteOutput
+ from dagster_airbyte.utils import (
+     DAGSTER_AIRBYTE_TRANSLATOR_METADATA_KEY,
+     get_airbyte_connection_table_name,
+     get_translator_from_airbyte_assets,
+ )
+
+ AIRBYTE_CLOUD_REST_API_BASE = "https://api.airbyte.com"
+ AIRBYTE_CLOUD_REST_API_VERSION = "v1"
+ AIRBYTE_CLOUD_REST_API_BASE_URL = f"{AIRBYTE_CLOUD_REST_API_BASE}/{AIRBYTE_CLOUD_REST_API_VERSION}"
+ AIRBYTE_CLOUD_CONFIGURATION_API_BASE = "https://cloud.airbyte.com/api"
+ AIRBYTE_CLOUD_CONFIGURATION_API_VERSION = "v1"
+ AIRBYTE_CLOUD_CONFIGURATION_API_BASE_URL = (
+     f"{AIRBYTE_CLOUD_CONFIGURATION_API_BASE}/{AIRBYTE_CLOUD_CONFIGURATION_API_VERSION}"
+ )

  DEFAULT_POLL_INTERVAL_SECONDS = 10

  # The access token expires every 3 minutes in Airbyte Cloud.
  # Refresh is needed after 2.5 minutes to avoid the "token expired" error message.
- AIRBYTE_CLOUD_REFRESH_TIMEDELTA_SECONDS = 150
-
-
- class AirbyteState:
-     RUNNING = "running"
-     SUCCEEDED = "succeeded"
-     CANCELLED = "cancelled"
-     PENDING = "pending"
-     FAILED = "failed"
-     ERROR = "error"
-     INCOMPLETE = "incomplete"
+ AIRBYTE_REFRESH_TIMEDELTA_SECONDS = 150

+ AIRBYTE_RECONSTRUCTION_METADATA_KEY_PREFIX = "dagster-airbyte/reconstruction_metadata"

- class AirbyteResourceState:
-     def __init__(self) -> None:
-         self.request_cache: Dict[str, Optional[Mapping[str, object]]] = {}
-         # Int in case we nest contexts
-         self.cache_enabled = 0

+ @beta
+ class AirbyteClient(DagsterModel):
+     """This class exposes methods on top of the Airbyte APIs."""

- class BaseAirbyteResource(ConfigurableResource):
+     rest_api_base_url: str = Field(
+         default=AIRBYTE_CLOUD_REST_API_BASE_URL,
+         description=(
+             "The base URL for the Airbyte REST API. "
+             "For Airbyte Cloud, leave this as the default. "
+             "For self-managed Airbyte, this is usually <your Airbyte host>/api/public/v1."
+         ),
+     )
+     configuration_api_base_url: str = Field(
+         default=AIRBYTE_CLOUD_CONFIGURATION_API_BASE_URL,
+         description=(
+             "The base URL for the Airbyte Configuration API. "
+             "For Airbyte Cloud, leave this as the default. "
+             "For self-managed Airbyte, this is usually <your Airbyte host>/api/v1."
+         ),
+     )
+     workspace_id: str = Field(..., description="The Airbyte workspace ID.")
+     client_id: Optional[str] = Field(default=None, description="The Airbyte client ID.")
+     client_secret: Optional[str] = Field(default=None, description="The Airbyte client secret.")
+     username: Optional[str] = Field(
+         default=None,
+         description="The Airbyte username for authentication. Used for self-managed Airbyte with basic auth.",
+     )
+     password: Optional[str] = Field(
+         default=None,
+         description="The Airbyte password for authentication. Used for self-managed Airbyte with basic auth.",
+     )
      request_max_retries: int = Field(
-         default=3,
+         ...,
          description=(
              "The maximum number of times requests to the Airbyte API should be retried "
              "before failing."
          ),
      )
      request_retry_delay: float = Field(
-         default=0.25,
+         ...,
          description="Time (in seconds) to wait between each request retry.",
      )
      request_timeout: int = Field(
-         default=15,
+         ...,
          description="Time (in seconds) after which the requests to Airbyte are declared timed out.",
      )
-     cancel_sync_on_run_termination: bool = Field(
-         default=True,
+     max_items_per_page: int = Field(
+         default=100,
          description=(
-             "Whether to cancel a sync in Airbyte if the Dagster runner is terminated. This may"
-             " be useful to disable if using Airbyte sources that cannot be cancelled and"
-             " resumed easily, or if your Dagster deployment may experience runner interruptions"
-             " that do not impact your Airbyte deployment."
+             "The maximum number of items per page. "
+             "Used for paginated resources like connections, destinations, etc."
          ),
      )
      poll_interval: float = Field(
          default=DEFAULT_POLL_INTERVAL_SECONDS,
-         description="Time (in seconds) to wait between checking a sync's status.",
+         description="The time (in seconds) that will be waited between successive polls.",
+     )
+     poll_timeout: Optional[float] = Field(
+         default=None,
+         description=(
+             "The maximum time (in seconds) to wait before this operation times "
+             "out. By default, this will never time out."
+         ),
+     )
+     cancel_on_termination: bool = Field(
+         default=True,
+         description=(
+             "Whether to cancel a sync in Airbyte if the Dagster runner is terminated. "
+             "This may be useful to disable if using Airbyte sources that cannot be cancelled and "
+             "resumed easily, or if your Dagster deployment may experience runner interruptions "
+             "that do not impact your Airbyte deployment."
+         ),
+     )
+     poll_previous_running_sync: bool = Field(
+         default=False,
+         description=(
+             "If set to True, Dagster will check for a previously running sync for the same connection "
+             "and begin polling it instead of starting a new sync."
+         ),
      )

-     @classmethod
-     def _is_dagster_maintained(cls) -> bool:
-         return True
+     _access_token_value: Optional[str] = PrivateAttr(default=None)
+     _access_token_timestamp: Optional[float] = PrivateAttr(default=None)
+
+     @model_validator(mode="before")
+     def validate_authentication(cls, values):
+         has_client_id = values.get("client_id") is not None
+         has_client_secret = values.get("client_secret") is not None
+         has_username = values.get("username") is not None
+         has_password = values.get("password") is not None
+
+         check.invariant(
+             has_username == has_password,
+             "Missing config: both username and password are required for Airbyte authentication.",
+         )
+
+         check.invariant(
+             has_client_id == has_client_secret,
+             "Missing config: both client_id and client_secret are required for Airbyte authentication.",
+         )
+
+         check.invariant(
+             not ((has_client_id or has_client_secret) and (has_username or has_password)),
+             "Invalid config: cannot provide both client_id/client_secret and username/password for Airbyte authentication.",
+         )
+         return values
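
The validator above accepts exactly three credential shapes: no credentials, a matched client_id/client_secret pair, or a matched username/password pair; half-pairs and mixes of the two schemes fail the invariants. A minimal restatement of the same rule in isolation (an illustrative helper, not part of the package):

.. code-block:: python

    def auth_config_is_valid(client_id, client_secret, username, password) -> bool:
        # Each scheme must be all-or-nothing, and the two schemes are exclusive.
        oauth_pair_matched = (client_id is None) == (client_secret is None)
        basic_pair_matched = (username is None) == (password is None)
        mixed_schemes = bool(client_id or client_secret) and bool(username or password)
        return oauth_pair_matched and basic_pair_matched and not mixed_schemes

    assert auth_config_is_valid("id", "secret", None, None)  # OAuth only
    assert auth_config_is_valid(None, None, "user", "pass")  # basic auth only
    assert auth_config_is_valid(None, None, None, None)      # unauthenticated
    assert not auth_config_is_valid("id", None, None, None)  # half a pair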
      @property
      @cached_method
@@ -90,704 +175,873 @@ class BaseAirbyteResource(ConfigurableResource):
          return get_dagster_logger()

      @property
-     @abstractmethod
-     def api_base_url(self) -> str:
-         raise NotImplementedError()
+     def all_additional_request_headers(self) -> Mapping[str, Any]:
+         return {**self.authorization_request_headers, **self.user_agent_request_headers}

      @property
-     @abstractmethod
-     def all_additional_request_params(self) -> Mapping[str, Any]:
-         raise NotImplementedError()
+     def authorization_request_headers(self) -> Mapping[str, Any]:
+         # Make sure the access token is refreshed before using it when calling the API.
+         if not (self.client_id and self.client_secret):
+             return {}

-     def make_request(
-         self,
-         endpoint: str,
-         data: Optional[Mapping[str, object]] = None,
-         method: str = "POST",
-         include_additional_request_params: bool = True,
-     ) -> Optional[Mapping[str, object]]:
-         """Creates and sends a request to the desired Airbyte REST API endpoint.
+         if self._needs_refreshed_access_token():
+             self._refresh_access_token()
+         return {
+             "Authorization": f"Bearer {self._access_token_value}",
+         }

-         Args:
-             endpoint (str): The Airbyte API endpoint to send this request to.
-             data (Optional[str]): JSON-formatted data string to be included in the request.
+     @property
+     def user_agent_request_headers(self) -> Mapping[str, Any]:
+         return {
+             "User-Agent": "dagster",
+         }

-         Returns:
-             Optional[Dict[str, Any]]: Parsed json data from the response to this request
-         """
-         url = self.api_base_url + endpoint
+     def _refresh_access_token(self) -> None:
+         response = check.not_none(
+             self._single_request(
+                 method="POST",
+                 url=f"{self.rest_api_base_url}/applications/token",
+                 data={
+                     "client_id": self.client_id,
+                     "client_secret": self.client_secret,
+                 },
+                 # Must not pass the bearer access token when refreshing it.
+                 include_additional_request_headers=False,
+             )
+         )
+         self._access_token_value = str(response["access_token"])
+         self._access_token_timestamp = datetime.now().timestamp()
+
+     def _needs_refreshed_access_token(self) -> bool:
+         return (
+             not self._access_token_value
+             or not self._access_token_timestamp
+             or self._access_token_timestamp
+             <= (datetime.now() - timedelta(seconds=AIRBYTE_REFRESH_TIMEDELTA_SECONDS)).timestamp()
+         )
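
Concretely, the check above treats a token as stale once its mint timestamp is at least ``AIRBYTE_REFRESH_TIMEDELTA_SECONDS`` (150 seconds) old, comfortably inside the 3-minute expiry noted at the top of the module. A self-contained sketch of the same comparison (illustrative only):

.. code-block:: python

    from datetime import datetime, timedelta

    AIRBYTE_REFRESH_TIMEDELTA_SECONDS = 150  # refresh after 2.5 minutes

    def needs_refresh(token_timestamp: float | None) -> bool:
        # Stale if no token was ever fetched, or it was minted >= 150s ago.
        if token_timestamp is None:
            return True
        cutoff = datetime.now() - timedelta(seconds=AIRBYTE_REFRESH_TIMEDELTA_SECONDS)
        return token_timestamp <= cutoff.timestamp()

    # A token minted three minutes ago is past the refresh window:
    assert needs_refresh((datetime.now() - timedelta(minutes=3)).timestamp())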
+
+     def _get_session(self, include_additional_request_headers: bool) -> requests.Session:
          headers = {"accept": "application/json"}
+         if include_additional_request_headers:
+             headers = {
+                 **headers,
+                 **self.all_additional_request_headers,
+             }
+         session = requests.Session()
+         session.headers.update(headers)

+         if self.username and self.password:
+             session.auth = (self.username, self.password)
+
+         return session
+
+     def _single_request(
+         self,
+         method: str,
+         url: str,
+         data: Optional[Mapping[str, Any]] = None,
+         params: Optional[Mapping[str, Any]] = None,
+         include_additional_request_headers: bool = True,
+     ) -> Mapping[str, Any]:
+         """Execute a single HTTP request with retry logic."""
          num_retries = 0
          while True:
              try:
-                 request_args: Dict[str, Any] = dict(
-                     method=method,
-                     url=url,
-                     headers=headers,
-                     timeout=self.request_timeout,
+                 session = self._get_session(
+                     include_additional_request_headers=include_additional_request_headers
                  )
-                 if data:
-                     request_args["json"] = data
-
-                 if include_additional_request_params:
-                     request_args = deep_merge_dicts(
-                         request_args,
-                         self.all_additional_request_params,
-                     )
-
-                 response = requests.request(
-                     **request_args,
+                 response = session.request(
+                     method=method, url=url, json=data, params=params, timeout=self.request_timeout
                  )
                  response.raise_for_status()
-                 if response.status_code == 204:
-                     return None
                  return response.json()
              except RequestException as e:
-                 self._log.error("Request to Airbyte API failed: %s", e)
+                 self._log.error(
+                     f"Request to Airbyte API failed for url {url} with method {method}: {e}"
+                 )
                  if num_retries == self.request_max_retries:
                      break
                  num_retries += 1
                  time.sleep(self.request_retry_delay)

-         raise Failure(f"Max retries ({self.request_max_retries}) exceeded with url: {url}.")
+         raise Failure(f"Max retries ({self.request_max_retries}) exceeded with url: {url}.")

-     @abstractmethod
-     def start_sync(self, connection_id: str) -> Mapping[str, object]:
-         raise NotImplementedError()
+         return {}

-     @abstractmethod
-     def get_connection_details(self, connection_id: str) -> Mapping[str, object]:
-         raise NotImplementedError()
+     def _paginated_request(
+         self,
+         method: str,
+         url: str,
+         params: dict[str, Any],
+         data: Optional[Mapping[str, Any]] = None,
+         include_additional_request_params: bool = True,
+     ) -> Sequence[Mapping[str, Any]]:
+         """Execute paginated requests and return all items."""
+         result_data = []
+         params = {"limit": self.max_items_per_page, **params}
+         while True:
+             response = self._single_request(
+                 method=method,
+                 url=url,
+                 data=data,
+                 params=params,
+                 include_additional_request_headers=include_additional_request_params,
+             )

-     @abstractmethod
-     def get_job_status(self, connection_id: str, job_id: int) -> Mapping[str, object]:
-         raise NotImplementedError()
+             # Handle different response structures
+             result_data.extend(response.get("data", []))
+             next_url = response.get("next", "")
+             if not next_url:
+                 break

-     @abstractmethod
-     def cancel_job(self, job_id: int):
-         raise NotImplementedError()
+             # Parse the query string for the next page
+             next_params = parse_qsl(urlparse(next_url).query)
+             # Overwrite the pagination params with the ones for the next page
+             params.update(dict(next_params))

-     @property
-     @abstractmethod
-     def _should_forward_logs(self) -> bool:
-         raise NotImplementedError()
+         return result_data
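
The loop above relies on the REST API returning a ``next`` link whose query string carries the cursor for the following page; the client re-parses that query string and overlays it onto the current params. A standalone sketch of that merge step (the ``next`` URL here is hypothetical):

.. code-block:: python

    from urllib.parse import parse_qsl, urlparse

    params = {"limit": 100, "workspaceIds": "my-workspace"}
    # Hypothetical "next" link from a paginated response body.
    next_url = "https://api.airbyte.com/v1/connections?limit=100&offset=100&workspaceIds=my-workspace"

    # The same update the client performs; cursor params from the link win.
    params.update(dict(parse_qsl(urlparse(next_url).query)))
    assert params["offset"] == "100"  # note: values parsed from the URL are strings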

-     def sync_and_poll(
-         self,
-         connection_id: str,
-         poll_interval: Optional[float] = None,
-         poll_timeout: Optional[float] = None,
-     ) -> AirbyteOutput:
-         """Initializes a sync operation for the given connector, and polls until it completes.
+     def validate_workspace_id(self) -> None:
+         """Fetches workspace details. This is used to validate that the workspace exists."""
+         self._single_request(
+             method="GET",
+             url=f"{self.rest_api_base_url}/workspaces/{self.workspace_id}",
+         )
+
+     def get_connections(self) -> Sequence[Mapping[str, Any]]:
+         """Fetches all connections of an Airbyte workspace from the Airbyte REST API."""
+         return self._paginated_request(
+             method="GET",
+             url=f"{self.rest_api_base_url}/connections",
+             params={"workspaceIds": self.workspace_id},
+         )
+
+     def get_jobs_for_connection(
+         self, connection_id: str, created_after: datetime | None = None
+     ) -> Sequence[AirbyteJob]:
+         """Fetches all jobs for a specific connection of an Airbyte workspace from the Airbyte REST API."""
+         params = {"workspaceIds": self.workspace_id, "connectionId": connection_id}
+         if created_after:
+             params["createdAtStart"] = created_after.strftime("%Y-%m-%dT%H:%M:%SZ")
+
+         return [
+             AirbyteJob.from_job_details(job_details=job_details)
+             for job_details in self._paginated_request(
+                 method="GET",
+                 url=f"{self.rest_api_base_url}/jobs",
+                 params=params,
+             )
+         ]
+
+     def get_connection_details(self, connection_id) -> Mapping[str, Any]:
+         """Fetches details about a given connection from the Airbyte Configuration API.
+         The Airbyte Configuration API is internal and may change in the future.
+         """
+         # Using the Airbyte Configuration API to get the connection details, including streams and their configs.
+         # https://airbyte-public-api-docs.s3.us-east-2.amazonaws.com/rapidoc-api-docs.html#post-/v1/connections/get
+         # https://github.com/airbytehq/airbyte-platform/blob/v1.0.0/airbyte-api/server-api/src/main/openapi/config.yaml
+         return self._single_request(
+             method="POST",
+             url=f"{self.configuration_api_base_url}/connections/get",
+             data={"connectionId": connection_id},
+         )
+
+     def get_destination_details(self, destination_id: str) -> Mapping[str, Any]:
+         """Fetches details about a given destination from the Airbyte REST API."""
+         return self._single_request(
+             method="GET",
+             url=f"{self.rest_api_base_url}/destinations/{destination_id}",
+         )
+
+     def start_sync_job(self, connection_id: str) -> Mapping[str, Any]:
+         return self._single_request(
+             method="POST",
+             url=f"{self.rest_api_base_url}/jobs",
+             data={
+                 "connectionId": connection_id,
+                 "jobType": "sync",
+             },
+         )
+
+     def get_job_details(self, job_id: int) -> Mapping[str, Any]:
+         return self._single_request(
+             method="GET",
+             url=f"{self.rest_api_base_url}/jobs/{job_id}",
+         )
+
+     def cancel_job(self, job_id: int) -> Mapping[str, Any]:
+         return self._single_request(
+             method="DELETE",
+             url=f"{self.rest_api_base_url}/jobs/{job_id}",
+         )
+
+     def sync_and_poll(self, connection_id: str) -> AirbyteOutput:
+         """Initializes a sync operation for the given connection, and polls until it completes.

          Args:
-             connection_id (str): The Airbyte Connector ID. You can retrieve this value from the
-                 "Connection" tab of a given connection in the Arbyte UI.
-             poll_interval (float): The time (in seconds) that will be waited between successive polls.
-             poll_timeout (float): The maximum time that will waited before this operation is timed
-                 out. By default, this will never time out.
+             connection_id (str): The Airbyte Connection ID. You can retrieve this value from the
+                 "Connection" tab of a given connection in the Airbyte UI.

          Returns:
              :py:class:`~AirbyteOutput`:
                  Details of the sync job.
          """
          connection_details = self.get_connection_details(connection_id)
-         job_details = self.start_sync(connection_id)
-         job_info = cast(Dict[str, object], job_details.get("job", {}))
-         job_id = cast(int, job_info.get("id"))

-         self._log.info(f"Job {job_id} initialized for connection_id={connection_id}.")
-         start = time.monotonic()
-         logged_attempts = 0
-         logged_lines = 0
-         state = None
+         existing_jobs = [
+             job
+             for job in self.get_jobs_for_connection(
+                 connection_id=connection_id,
+                 created_after=datetime.now() - timedelta(days=2),
+             )
+             if job.status
+             in (
+                 AirbyteJobStatusType.RUNNING,
+                 AirbyteJobStatusType.PENDING,
+                 AirbyteJobStatusType.INCOMPLETE,
+             )
+         ]
+
+         if not existing_jobs:
+             start_job_details = self.start_sync_job(connection_id)
+             job = AirbyteJob.from_job_details(job_details=start_job_details)
+             self._log.info(f"Job {job.id} initialized for connection_id={connection_id}.")
+         else:
+             if self.poll_previous_running_sync:
+                 if len(existing_jobs) == 1:
+                     job = existing_jobs[0]
+                     self._log.info(
+                         f"Job {job.id} already running for connection_id={connection_id}. Resume polling."
+                     )
+                 else:
+                     raise Failure(f"Found multiple running jobs for connection_id={connection_id}.")
+             else:
+                 raise Failure(f"Found sync job for connection_id={connection_id} already running.")
+
+         poll_start = datetime.now()

          try:
              while True:
-                 if poll_timeout and start + poll_timeout < time.monotonic():
+                 if self.poll_timeout and datetime.now() > poll_start + timedelta(
+                     seconds=self.poll_timeout
+                 ):
                      raise Failure(
-                         f"Timeout: Airbyte job {job_id} is not ready after the timeout"
-                         f" {poll_timeout} seconds"
+                         f"Timeout: Airbyte job {job.id} is not ready after the timeout"
+                         f" {self.poll_timeout} seconds"
                      )
-                 time.sleep(poll_interval or self.poll_interval)
-                 job_details = self.get_job_status(connection_id, job_id)
-                 attempts = cast(List, job_details.get("attempts", []))
-                 cur_attempt = len(attempts)
-                 # spit out the available Airbyte log info
-                 if cur_attempt:
-                     if self._should_forward_logs:
-                         log_lines = attempts[logged_attempts].get("logs", {}).get("logLines", [])
-
-                         for line in log_lines[logged_lines:]:
-                             sys.stdout.write(line + "\n")
-                             sys.stdout.flush()
-                         logged_lines = len(log_lines)
-
-                         # if there's a next attempt, this one will have no more log messages
-                         if logged_attempts < cur_attempt - 1:
-                             logged_lines = 0
-                             logged_attempts += 1
-
-                 job_info = cast(Dict[str, object], job_details.get("job", {}))
-                 state = job_info.get("status")
-
-                 if state in (AirbyteState.RUNNING, AirbyteState.PENDING, AirbyteState.INCOMPLETE):
+
+                 time.sleep(self.poll_interval)
+                 # We return these job details in the AirbyteOutput when the job succeeds
+                 poll_job_details = self.get_job_details(job.id)
+                 self._log.debug(poll_job_details)
+                 job = AirbyteJob.from_job_details(job_details=poll_job_details)
+                 if job.status in (
+                     AirbyteJobStatusType.RUNNING,
+                     AirbyteJobStatusType.PENDING,
+                     AirbyteJobStatusType.INCOMPLETE,
+                 ):
                      continue
-                 elif state == AirbyteState.SUCCEEDED:
+                 elif job.status == AirbyteJobStatusType.SUCCEEDED:
                      break
-                 elif state == AirbyteState.ERROR:
-                     raise Failure(f"Job failed: {job_id}")
-                 elif state == AirbyteState.CANCELLED:
-                     raise Failure(f"Job was cancelled: {job_id}")
+                 elif job.status in [AirbyteJobStatusType.ERROR, AirbyteJobStatusType.FAILED]:
+                     raise Failure(f"Job failed: {job.id}")
+                 elif job.status == AirbyteJobStatusType.CANCELLED:
+                     raise Failure(f"Job was cancelled: {job.id}")
                  else:
-                     raise Failure(f"Encountered unexpected state `{state}` for job_id {job_id}")
+                     raise Failure(
+                         f"Encountered unexpected state `{job.status}` for job_id {job.id}"
+                     )
          finally:
              # if Airbyte sync has not completed, make sure to cancel it so that it doesn't outlive
              # the python process
-             if (
-                 state not in (AirbyteState.SUCCEEDED, AirbyteState.ERROR, AirbyteState.CANCELLED)
-                 and self.cancel_sync_on_run_termination
+             if self.cancel_on_termination and job.status not in (
+                 AirbyteJobStatusType.SUCCEEDED,
+                 AirbyteJobStatusType.ERROR,
+                 AirbyteJobStatusType.CANCELLED,
+                 AirbyteJobStatusType.FAILED,
              ):
-                 self.cancel_job(job_id)
+                 self.cancel_job(job.id)

-         return AirbyteOutput(job_details=job_details, connection_details=connection_details)
+         return AirbyteOutput(job_details=poll_job_details, connection_details=connection_details)
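
Since ``AirbyteClient`` is a plain model rather than a Dagster resource, it can also be exercised directly, e.g. in a script or test. A hedged sketch, assuming this module is importable as ``dagster_airbyte.resources`` and that credentials and the connection ID live in environment variables (both assumptions, not shown in this diff):

.. code-block:: python

    import os

    from dagster_airbyte.resources import AirbyteClient  # assumed module path

    client = AirbyteClient(
        workspace_id=os.environ["AIRBYTE_WORKSPACE_ID"],
        client_id=os.environ["AIRBYTE_CLIENT_ID"],
        client_secret=os.environ["AIRBYTE_CLIENT_SECRET"],
        # These three fields are required (no defaults) on AirbyteClient.
        request_max_retries=3,
        request_retry_delay=0.25,
        request_timeout=15,
    )
    output = client.sync_and_poll(connection_id=os.environ["AIRBYTE_CONNECTION_ID"])
    print(output.job_details)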


- class AirbyteCloudResource(BaseAirbyteResource):
-     """This resource allows users to programmatically interface with the Airbyte Cloud API to launch
-     syncs and monitor their progress.
+ @beta
+ class BaseAirbyteWorkspace(ConfigurableResource):
+     """This class represents an Airbyte workspace and provides utilities
+     to interact with Airbyte APIs.
+     """

-     **Examples:**
+     request_max_retries: int = Field(
+         default=3,
+         description=(
+             "The maximum number of times requests to the Airbyte API should be retried "
+             "before failing."
+         ),
+     )
+     request_retry_delay: float = Field(
+         default=0.25,
+         description="Time (in seconds) to wait between each request retry.",
+     )
+     request_timeout: int = Field(
+         default=15,
+         description="Time (in seconds) after which the requests to Airbyte are declared timed out.",
+     )
+     max_items_per_page: int = Field(
+         default=100,
+         description=(
+             "The maximum number of items per page. "
+             "Used for paginated resources like connections, destinations, etc."
+         ),
+     )
+     poll_interval: float = Field(
+         default=DEFAULT_POLL_INTERVAL_SECONDS,
+         description="The time (in seconds) that will be waited between successive polls.",
+     )
+     poll_timeout: Optional[float] = Field(
+         default=None,
+         description=(
+             "The maximum time (in seconds) to wait before this operation times "
+             "out. By default, this will never time out."
+         ),
+     )
+     cancel_on_termination: bool = Field(
+         default=True,
+         description=(
+             "Whether to cancel a sync in Airbyte if the Dagster runner is terminated. "
+             "This may be useful to disable if using Airbyte sources that cannot be cancelled and "
+             "resumed easily, or if your Dagster deployment may experience runner interruptions "
+             "that do not impact your Airbyte deployment."
+         ),
+     )
+     poll_previous_running_sync: bool = Field(
+         default=False,
+         description=(
+             "If set to True, Dagster will check for a previously running sync for the same connection "
+             "and begin polling it instead of starting a new sync."
+         ),
+     )

-     .. code-block:: python
+     _client: AirbyteClient = PrivateAttr(default=None)  # type: ignore

-         from dagster import job, EnvVar
-         from dagster_airbyte import AirbyteResource
+     @cached_method
+     def fetch_airbyte_workspace_data(
+         self,
+     ) -> AirbyteWorkspaceData:
+         """Retrieves all Airbyte content from the workspace and returns it as an AirbyteWorkspaceData object.

-         my_airbyte_resource = AirbyteCloudResource(
-             client_id=EnvVar("AIRBYTE_CLIENT_ID"),
-             client_secret=EnvVar("AIRBYTE_CLIENT_SECRET"),
-         )
+         Returns:
+             AirbyteWorkspaceData: A snapshot of the Airbyte workspace's content.
+         """
+         connections_by_id = {}
+         destinations_by_id = {}

-         airbyte_assets = build_airbyte_assets(
-             connection_id="87b7fe85-a22c-420e-8d74-b30e7ede77df",
-             destination_tables=["releases", "tags", "teams"],
-         )
+         client = self.get_client()
+
+         client.validate_workspace_id()
+
+         connections = client.get_connections()
+
+         for partial_connection_details in connections:
+             full_connection_details = client.get_connection_details(
+                 connection_id=partial_connection_details["connectionId"]
+             )
+             connection = AirbyteConnection.from_connection_details(
+                 connection_details=full_connection_details
+             )
+             connections_by_id[connection.id] = connection
+
+             destination_details = client.get_destination_details(
+                 destination_id=connection.destination_id
+             )
+             destination = AirbyteDestination.from_destination_details(
+                 destination_details=destination_details
+             )
+             destinations_by_id[destination.id] = destination

-         defs = Definitions(
-             assets=[airbyte_assets],
-             resources={"airbyte": my_airbyte_resource},
+         return AirbyteWorkspaceData(
+             connections_by_id=connections_by_id,
+             destinations_by_id=destinations_by_id,
          )
-     """

-     client_id: str = Field(..., description="The Airbyte Cloud client ID.")
-     client_secret: str = Field(..., description="The Airbyte Cloud client secret.")
+     @cached_method
+     def load_asset_specs(
+         self,
+         dagster_airbyte_translator: Optional[DagsterAirbyteTranslator] = None,
+         connection_selector_fn: Optional[Callable[[AirbyteConnection], bool]] = None,
+     ) -> Sequence[AssetSpec]:
+         """Returns a list of AssetSpecs representing the Airbyte content in the workspace.

-     _access_token_value: Optional[str] = PrivateAttr(default=None)
-     _access_token_timestamp: Optional[float] = PrivateAttr(default=None)
+         Args:
+             dagster_airbyte_translator (Optional[DagsterAirbyteTranslator], optional): The translator to use
+                 to convert Airbyte content into :py:class:`dagster.AssetSpec`.
+                 Defaults to :py:class:`DagsterAirbyteTranslator`.
+             connection_selector_fn (Optional[Callable[[AirbyteConnection], bool]]): A function that allows for filtering
+                 which Airbyte connection assets are created for.

-     def setup_for_execution(self, context: InitResourceContext) -> None:
-         # Refresh access token when the resource is initialized
-         self._refresh_access_token()
+         Returns:
+             List[AssetSpec]: The set of assets representing the Airbyte content in the workspace.

-     @property
-     def api_base_url(self) -> str:
-         return "https://api.airbyte.com/v1"
+         Examples:
+             Loading the asset specs for a given Airbyte workspace:
+             .. code-block:: python

-     @property
-     def all_additional_request_params(self) -> Mapping[str, Any]:
-         # Make sure the access token is refreshed before using it when calling the API.
-         if self._needs_refreshed_access_token():
-             self._refresh_access_token()
-         return {
-             "headers": {
-                 "Authorization": f"Bearer {self._access_token_value}",
-                 "User-Agent": "dagster",
-             }
-         }
+                 from dagster_airbyte import AirbyteWorkspace

-     def make_request(
-         self,
-         endpoint: str,
-         data: Optional[Mapping[str, object]] = None,
-         method: str = "POST",
-         include_additional_request_params: bool = True,
-     ) -> Optional[Mapping[str, object]]:
-         # Make sure the access token is refreshed before using it when calling the API.
-         if include_additional_request_params and self._needs_refreshed_access_token():
-             self._refresh_access_token()
-         return super().make_request(
-             endpoint=endpoint,
-             data=data,
-             method=method,
-             include_additional_request_params=include_additional_request_params,
+                 import dagster as dg
+
+                 airbyte_workspace = AirbyteWorkspace(
+                     workspace_id=dg.EnvVar("AIRBYTE_WORKSPACE_ID"),
+                     client_id=dg.EnvVar("AIRBYTE_CLIENT_ID"),
+                     client_secret=dg.EnvVar("AIRBYTE_CLIENT_SECRET"),
+                 )
+
+                 airbyte_specs = airbyte_workspace.load_asset_specs()
+                 dg.Definitions(assets=airbyte_specs, resources={"airbyte": airbyte_workspace})
+         """
+         dagster_airbyte_translator = dagster_airbyte_translator or DagsterAirbyteTranslator()
+
+         return load_airbyte_asset_specs(
+             workspace=self,
+             dagster_airbyte_translator=dagster_airbyte_translator,
+             connection_selector_fn=connection_selector_fn,
          )

-     def start_sync(self, connection_id: str) -> Mapping[str, object]:
-         job_sync = check.not_none(
-             self.make_request(
-                 endpoint="/jobs",
-                 data={
-                     "connectionId": connection_id,
-                     "jobType": "sync",
-                 },
-             )
+     def _generate_materialization(
+         self,
+         airbyte_output: AirbyteOutput,
+         dagster_airbyte_translator: DagsterAirbyteTranslator,
+     ):
+         connection = AirbyteConnection.from_connection_details(
+             connection_details=airbyte_output.connection_details
          )
-         return {"job": {"id": job_sync["jobId"], "status": job_sync["status"]}}

-     def get_connection_details(self, connection_id: str) -> Mapping[str, object]:
-         return {}
+         for stream in connection.streams.values():
+             if stream.selected:
+                 connection_table_name = get_airbyte_connection_table_name(
+                     stream_prefix=connection.stream_prefix,
+                     stream_name=stream.name,
+                 )
+                 stream_asset_spec = dagster_airbyte_translator.get_asset_spec(
+                     props=AirbyteConnectionTableProps(
+                         table_name=connection_table_name,
+                         stream_prefix=connection.stream_prefix,
+                         stream_name=stream.name,
+                         json_schema=stream.json_schema,
+                         connection_id=connection.id,
+                         connection_name=connection.name,
+                         destination_type=None,
+                         database=None,
+                         schema=None,
+                     )
+                 )

-     def get_job_status(self, connection_id: str, job_id: int) -> Mapping[str, object]:
-         job_status = check.not_none(self.make_request(endpoint=f"/jobs/{job_id}", method="GET"))
-         return {"job": {"id": job_status["jobId"], "status": job_status["status"]}}
+                 yield AssetMaterialization(
+                     asset_key=stream_asset_spec.key,
+                     description=(
+                         f"Table generated via Airbyte sync "
+                         f"for connection {connection.name}: {connection_table_name}"
+                     ),
+                     metadata=stream_asset_spec.metadata,
+                 )

-     def cancel_job(self, job_id: int):
-         self.make_request(endpoint=f"/jobs/{job_id}", method="DELETE")
+     @public
+     @beta
+     def sync_and_poll(self, context: AssetExecutionContext):
+         """Executes a sync and poll process to materialize Airbyte assets.
+         This method can only be used in the context of an asset execution.

-     @property
-     def _should_forward_logs(self) -> bool:
-         # Airbyte Cloud does not support streaming logs yet
-         return False
+         Args:
+             context (AssetExecutionContext): The execution context
+                 from within `@airbyte_assets`.

-     def _refresh_access_token(self) -> None:
-         response = check.not_none(
-             self.make_request(
-                 endpoint="/applications/token",
-                 data={
-                     "client_id": self.client_id,
-                     "client_secret": self.client_secret,
-                 },
-                 # Must not pass the bearer access token when refreshing it.
-                 include_additional_request_params=False,
-             )
+         Returns:
+             Iterator[Union[AssetMaterialization, MaterializeResult]]: An iterator of MaterializeResult
+                 or AssetMaterialization.
+         """
+         assets_def = context.assets_def
+         dagster_airbyte_translator = get_translator_from_airbyte_assets(assets_def)
+         connection_id = next(
+             check.not_none(AirbyteMetadataSet.extract(spec.metadata).connection_id)
+             for spec in assets_def.specs
          )
-         self._access_token_value = str(response["access_token"])
-         self._access_token_timestamp = datetime.now().timestamp()

-     def _needs_refreshed_access_token(self) -> bool:
-         return (
-             not self._access_token_value
-             or not self._access_token_timestamp
-             or self._access_token_timestamp
-             <= datetime.timestamp(
-                 datetime.now() - timedelta(seconds=AIRBYTE_CLOUD_REFRESH_TIMEDELTA_SECONDS)
-             )
+         client = self.get_client()
+         airbyte_output = client.sync_and_poll(
+             connection_id=connection_id,
          )

+         materialized_asset_keys = set()
+         for materialization in self._generate_materialization(
+             airbyte_output=airbyte_output, dagster_airbyte_translator=dagster_airbyte_translator
+         ):
+             # Scan through all tables actually created; if a table was expected, emit a MaterializeResult.
+             # Otherwise, emit a runtime AssetMaterialization.
+             if materialization.asset_key in context.selected_asset_keys:
+                 yield MaterializeResult(
+                     asset_key=materialization.asset_key, metadata=materialization.metadata
+                 )
+                 materialized_asset_keys.add(materialization.asset_key)
+             else:
+                 context.log.warning(
+                     f"An unexpected asset was materialized: {materialization.asset_key}. "
+                     f"Yielding a materialization event."
+                 )
+                 yield materialization
+
+         unmaterialized_asset_keys = context.selected_asset_keys - materialized_asset_keys
+         if unmaterialized_asset_keys:
+             context.log.warning(f"Assets were not materialized: {unmaterialized_asset_keys}")
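
In practice this method is invoked from inside an asset body so that the yielded events flow back to Dagster. A sketch of the expected wiring, using the ``@airbyte_assets`` decorator referenced in the docstring above (the connection ID and env var names are placeholders):

.. code-block:: python

    import dagster as dg
    from dagster_airbyte import AirbyteWorkspace, airbyte_assets

    airbyte_workspace = AirbyteWorkspace(
        rest_api_base_url=dg.EnvVar("AIRBYTE_REST_API_BASE_URL"),
        configuration_api_base_url=dg.EnvVar("AIRBYTE_CONFIGURATION_API_BASE_URL"),
        workspace_id=dg.EnvVar("AIRBYTE_WORKSPACE_ID"),
        client_id=dg.EnvVar("AIRBYTE_CLIENT_ID"),
        client_secret=dg.EnvVar("AIRBYTE_CLIENT_SECRET"),
    )

    @airbyte_assets(connection_id="my-connection-id", workspace=airbyte_workspace)
    def my_airbyte_assets(context: dg.AssetExecutionContext, airbyte: AirbyteWorkspace):
        # Delegates to the client's sync_and_poll and re-yields the results.
        yield from airbyte.sync_and_poll(context=context)

    defs = dg.Definitions(
        assets=[my_airbyte_assets],
        resources={"airbyte": airbyte_workspace},
    )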
- class AirbyteResource(BaseAirbyteResource):
+     @contextmanager
+     def process_config_and_initialize_cm_cached(self) -> Iterator["AirbyteWorkspace"]:
+         # Hack to avoid reconstructing initialized copies of this resource, which invalidates
+         # @cached_method caches. This means that multiple calls to load_airbyte_asset_specs
+         # will not trigger multiple API calls to fetch the workspace data.
+         # Bespoke impl since @cached_method doesn't play nice with iterators; it's exhausted after
+         # the first call.
+         if hasattr(self, "_initialized"):
+             yield getattr(self, "_initialized")
+         else:
+             with self.process_config_and_initialize_cm() as initialized_workspace:
+                 initialized = initialized_workspace
+                 setattr(self, "_initialized", initialized)
+                 yield initialized
+
+
+ @beta
+ class AirbyteWorkspace(BaseAirbyteWorkspace):
      """This resource allows users to programmatically interface with the Airbyte REST API to launch
-     syncs and monitor their progress.
+     syncs and monitor their progress for a given Airbyte workspace.

      **Examples:**
+     Using OAuth client credentials:

      .. code-block:: python

-         from dagster import job, EnvVar
-         from dagster_airbyte import AirbyteResource
+         import dagster as dg
+         from dagster_airbyte import AirbyteWorkspace, build_airbyte_assets_definitions
+
+         airbyte_workspace = AirbyteWorkspace(
+             rest_api_base_url=dg.EnvVar("AIRBYTE_REST_API_BASE_URL"),
+             configuration_api_base_url=dg.EnvVar("AIRBYTE_CONFIGURATION_API_BASE_URL"),
+             workspace_id=dg.EnvVar("AIRBYTE_WORKSPACE_ID"),
+             client_id=dg.EnvVar("AIRBYTE_CLIENT_ID"),
+             client_secret=dg.EnvVar("AIRBYTE_CLIENT_SECRET"),
+         )
+
+         all_airbyte_assets = build_airbyte_assets_definitions(workspace=airbyte_workspace)
+
+         defs = dg.Definitions(
+             assets=all_airbyte_assets,
+             resources={"airbyte": airbyte_workspace},
+         )
+
+     Using basic authentication:
+
+     .. code-block:: python
+
+         import dagster as dg
+         from dagster_airbyte import AirbyteWorkspace, build_airbyte_assets_definitions
+
+         airbyte_workspace = AirbyteWorkspace(
+             rest_api_base_url=dg.EnvVar("AIRBYTE_REST_API_BASE_URL"),
+             configuration_api_base_url=dg.EnvVar("AIRBYTE_CONFIGURATION_API_BASE_URL"),
+             workspace_id=dg.EnvVar("AIRBYTE_WORKSPACE_ID"),
+             username=dg.EnvVar("AIRBYTE_USERNAME"),
+             password=dg.EnvVar("AIRBYTE_PASSWORD"),
+         )
+
+         all_airbyte_assets = build_airbyte_assets_definitions(workspace=airbyte_workspace)

-         my_airbyte_resource = AirbyteResource(
-             host=EnvVar("AIRBYTE_HOST"),
-             port=EnvVar("AIRBYTE_PORT"),
-             # If using basic auth
-             username=EnvVar("AIRBYTE_USERNAME"),
-             password=EnvVar("AIRBYTE_PASSWORD"),
+         defs = dg.Definitions(
+             assets=all_airbyte_assets,
+             resources={"airbyte": airbyte_workspace},
          )

-         airbyte_assets = build_airbyte_assets(
-             connection_id="87b7fe85-a22c-420e-8d74-b30e7ede77df",
-             destination_tables=["releases", "tags", "teams"],
+     Using no authentication:
+
+     .. code-block:: python
+
+         import dagster as dg
+         from dagster_airbyte import AirbyteWorkspace, build_airbyte_assets_definitions
+
+         airbyte_workspace = AirbyteWorkspace(
+             rest_api_base_url=dg.EnvVar("AIRBYTE_REST_API_BASE_URL"),
+             configuration_api_base_url=dg.EnvVar("AIRBYTE_CONFIGURATION_API_BASE_URL"),
+             workspace_id=dg.EnvVar("AIRBYTE_WORKSPACE_ID"),
          )

-         defs = Definitions(
-             assets=[airbyte_assets],
-             resources={"airbyte": my_airbyte_resource},
+         all_airbyte_assets = build_airbyte_assets_definitions(workspace=airbyte_workspace)
+
+         defs = dg.Definitions(
+             assets=all_airbyte_assets,
+             resources={"airbyte": airbyte_workspace},
          )
      """

-     host: str = Field(description="The Airbyte server address.")
-     port: str = Field(description="Port used for the Airbyte server.")
-     username: Optional[str] = Field(default=None, description="Username if using basic auth.")
-     password: Optional[str] = Field(default=None, description="Password if using basic auth.")
-     use_https: bool = Field(
-         default=False, description="Whether to use HTTPS to connect to the Airbyte server."
+     rest_api_base_url: str = Field(
+         ...,
+         description="The base URL for the Airbyte REST API.",
+         examples=[
+             "http://localhost:8000/api/public/v1",
+             "https://my-airbyte-server.com/api/public/v1",
+             "http://airbyte-airbyte-server-svc.airbyte.svc.cluster.local:8001/api/public/v1",
+         ],
      )
-     forward_logs: bool = Field(
-         default=True,
-         description=(
-             "Whether to forward Airbyte logs to the compute log, can be expensive for"
-             " long-running syncs."
-         ),
+     configuration_api_base_url: str = Field(
+         ...,
+         description="The base URL for the Airbyte Configuration API.",
+         examples=[
+             "http://localhost:8000/api/v1",
+             "https://my-airbyte-server.com/api/v1",
+             "http://airbyte-airbyte-server-svc.airbyte.svc.cluster.local:8001/api/v1",
+         ],
      )
-     request_additional_params: Mapping[str, Any] = Field(
-         default=dict(),
-         description=(
-             "Any additional kwargs to pass to the requests library when making requests to Airbyte."
-         ),
+     workspace_id: str = Field(..., description="The Airbyte workspace ID.")
+     client_id: Optional[str] = Field(default=None, description="The Airbyte client ID.")
+     client_secret: Optional[str] = Field(default=None, description="The Airbyte client secret.")
+     username: Optional[str] = Field(
+         default=None, description="The Airbyte username for authentication."
+     )
+     password: Optional[str] = Field(
+         default=None, description="The Airbyte password for authentication."
      )

-     @property
      @cached_method
-     def _state(self) -> AirbyteResourceState:
-         return AirbyteResourceState()
+     def get_client(self) -> AirbyteClient:
+         return AirbyteClient(
+             rest_api_base_url=self.rest_api_base_url,
+             configuration_api_base_url=self.configuration_api_base_url,
+             workspace_id=self.workspace_id,
+             client_id=self.client_id,
+             client_secret=self.client_secret,
+             username=self.username,
+             password=self.password,
+             request_max_retries=self.request_max_retries,
+             request_retry_delay=self.request_retry_delay,
+             request_timeout=self.request_timeout,
+             max_items_per_page=self.max_items_per_page,
+             poll_interval=self.poll_interval,
+             poll_timeout=self.poll_timeout,
+             cancel_on_termination=self.cancel_on_termination,
+             poll_previous_running_sync=self.poll_previous_running_sync,
+         )

-     @property
-     @cached_method
-     def _log(self) -> logging.Logger:
-         return get_dagster_logger()

-     @property
-     def api_base_url(self) -> str:
-         return (
-             ("https://" if self.use_https else "http://")
-             + (f"{self.host}:{self.port}" if self.port else self.host)
-             + "/api/v1"
-         )
+ @beta
+ class AirbyteCloudWorkspace(BaseAirbyteWorkspace):
+     """This resource allows users to programmatically interface with the Airbyte Cloud REST API to launch
+     syncs and monitor their progress for a given Airbyte Cloud workspace.

-     @property
-     def _should_forward_logs(self) -> bool:
-         return self.forward_logs
+     **Examples:**

-     @contextmanager
-     def cache_requests(self):
-         """Context manager that enables caching certain requests to the Airbyte API,
-         cleared when the context is exited.
-         """
-         self.clear_request_cache()
-         self._state.cache_enabled += 1
-         try:
-             yield
-         finally:
-             self.clear_request_cache()
-             self._state.cache_enabled -= 1
+     .. code-block:: python

-     def clear_request_cache(self) -> None:
-         self._state.request_cache = {}
+         from dagster_airbyte import AirbyteCloudWorkspace, build_airbyte_assets_definitions

-     def make_request_cached(self, endpoint: str, data: Optional[Mapping[str, object]]):
-         if not self._state.cache_enabled > 0:
-             return self.make_request(endpoint, data)
-         data_json = json.dumps(data, sort_keys=True)
-         sha = hashlib.sha1()
-         sha.update(endpoint.encode("utf-8"))
-         sha.update(data_json.encode("utf-8"))
-         digest = sha.hexdigest()
+         import dagster as dg

-         if digest not in self._state.request_cache:
-             self._state.request_cache[digest] = self.make_request(endpoint, data)
-         return self._state.request_cache[digest]
+         airbyte_workspace = AirbyteCloudWorkspace(
+             workspace_id=dg.EnvVar("AIRBYTE_CLOUD_WORKSPACE_ID"),
+             client_id=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_ID"),
+             client_secret=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_SECRET"),
+         )

-     @property
-     def all_additional_request_params(self) -> Mapping[str, Any]:
-         auth_param = (
-             {"auth": (self.username, self.password)} if self.username and self.password else {}
+         all_airbyte_assets = build_airbyte_assets_definitions(workspace=airbyte_workspace)
+
+         defs = dg.Definitions(
+             assets=all_airbyte_assets,
+             resources={"airbyte": airbyte_workspace},
          )
-         return {**auth_param, **self.request_additional_params}
+     """

-     def make_request(
-         self, endpoint: str, data: Optional[Mapping[str, object]]
-     ) -> Optional[Mapping[str, object]]:
-         """Creates and sends a request to the desired Airbyte REST API endpoint.
+     rest_api_base_url: ClassVar[str] = AIRBYTE_CLOUD_REST_API_BASE_URL
+     configuration_api_base_url: ClassVar[str] = AIRBYTE_CLOUD_CONFIGURATION_API_BASE_URL
+     workspace_id: str = Field(..., description="The Airbyte workspace ID.")
+     client_id: str = Field(..., description="The Airbyte client ID.")
+     client_secret: str = Field(..., description="The Airbyte client secret.")

-         Args:
-             endpoint (str): The Airbyte API endpoint to send this request to.
-             data (Optional[str]): JSON-formatted data string to be included in the request.
+     @cached_method
+     def get_client(self) -> AirbyteClient:
+         return AirbyteClient(
+             rest_api_base_url=self.rest_api_base_url,
+             configuration_api_base_url=self.configuration_api_base_url,
+             workspace_id=self.workspace_id,
+             client_id=self.client_id,
+             client_secret=self.client_secret,
+             request_max_retries=self.request_max_retries,
+             request_retry_delay=self.request_retry_delay,
+             request_timeout=self.request_timeout,
+             max_items_per_page=self.max_items_per_page,
+             poll_interval=self.poll_interval,
+             poll_timeout=self.poll_timeout,
+             cancel_on_termination=self.cancel_on_termination,
+             poll_previous_running_sync=self.poll_previous_running_sync,
+         )

-         Returns:
-             Optional[Dict[str, Any]]: Parsed json data from the response to this request
-         """
-         url = self.api_base_url + endpoint
-         headers = {"accept": "application/json"}

-         num_retries = 0
-         while True:
-             try:
-                 response = requests.request(
-                     **deep_merge_dicts(  # type: ignore
-                         dict(
-                             method="POST",
-                             url=url,
-                             headers=headers,
-                             json=data,
-                             timeout=self.request_timeout,
-                             auth=(
-                                 (self.username, self.password)
-                                 if self.username and self.password
-                                 else None
-                             ),
-                         ),
-                         self.request_additional_params,
-                     ),
-                 )
-                 response.raise_for_status()
-                 if response.status_code == 204:
-                     return None
-                 return response.json()
-             except RequestException as e:
-                 self._log.error("Request to Airbyte API failed: %s", e)
-                 if num_retries == self.request_max_retries:
-                     break
-                 num_retries += 1
-                 time.sleep(self.request_retry_delay)
+ @public
+ @beta
+ def load_airbyte_asset_specs(
+     workspace: BaseAirbyteWorkspace,
+     dagster_airbyte_translator: Optional[DagsterAirbyteTranslator] = None,
+     connection_selector_fn: Optional[Callable[[AirbyteConnection], bool]] = None,
+ ) -> Sequence[AssetSpec]:
+     """Returns a list of AssetSpecs representing the Airbyte content in the workspace.

-         raise Failure(f"Max retries ({self.request_max_retries}) exceeded with url: {url}.")
+     Args:
+         workspace (BaseAirbyteWorkspace): The Airbyte workspace to fetch assets from.
+         dagster_airbyte_translator (Optional[DagsterAirbyteTranslator], optional): The translator to use
+             to convert Airbyte content into :py:class:`dagster.AssetSpec`.
+             Defaults to :py:class:`DagsterAirbyteTranslator`.
+         connection_selector_fn (Optional[Callable[[AirbyteConnection], bool]]): A function that allows for filtering
+             which Airbyte connection assets are created for.

-     def cancel_job(self, job_id: int):
-         self.make_request(endpoint="/jobs/cancel", data={"id": job_id})
+     Returns:
+         List[AssetSpec]: The set of assets representing the Airbyte content in the workspace.

-     def get_default_workspace(self) -> str:
-         workspaces = cast(
-             List[Dict[str, Any]],
-             check.not_none(self.make_request_cached(endpoint="/workspaces/list", data={})).get(
-                 "workspaces", []
-             ),
-         )
-         return workspaces[0]["workspaceId"]
+     Examples:
+         Loading the asset specs for a given Airbyte workspace:

-     def get_source_definition_by_name(self, name: str) -> Optional[str]:
-         name_lower = name.lower()
-         definitions = check.not_none(
-             self.make_request_cached(endpoint="/source_definitions/list", data={})
-         )
-         source_definitions = cast(List[Dict[str, Any]], definitions["sourceDefinitions"])
-
-         return next(
-             (
-                 definition["sourceDefinitionId"]
-                 for definition in source_definitions
-                 if definition["name"].lower() == name_lower
-             ),
-             None,
-         )
+         .. code-block:: python

-     def get_destination_definition_by_name(self, name: str):
-         name_lower = name.lower()
-         definitions = cast(
-             Dict[str, List[Dict[str, str]]],
-             check.not_none(
-                 self.make_request_cached(endpoint="/destination_definitions/list", data={})
-             ),
-         )
-         return next(
-             (
-                 definition["destinationDefinitionId"]
-                 for definition in definitions["destinationDefinitions"]
-                 if definition["name"].lower() == name_lower
-             ),
-             None,
-         )
+             from dagster_airbyte import AirbyteWorkspace, load_airbyte_asset_specs

-     def get_source_catalog_id(self, source_id: str):
-         result = cast(
-             Dict[str, Any],
-             check.not_none(
-                 self.make_request(endpoint="/sources/discover_schema", data={"sourceId": source_id})
-             ),
-         )
-         return result["catalogId"]
-
-     def get_source_schema(self, source_id: str) -> Mapping[str, Any]:
-         return cast(
-             Dict[str, Any],
-             check.not_none(
-                 self.make_request(endpoint="/sources/discover_schema", data={"sourceId": source_id})
-             ),
-         )
+             import dagster as dg

-     def does_dest_support_normalization(
-         self, destination_definition_id: str, workspace_id: str
-     ) -> bool:
-         # Airbyte API changed source of truth for normalization in PR
-         # https://github.com/airbytehq/airbyte/pull/21005
-         norm_dest_def_spec: bool = cast(
-             Dict[str, Any],
-             check.not_none(
-                 self.make_request_cached(
-                     endpoint="/destination_definition_specifications/get",
-                     data={
-                         "destinationDefinitionId": destination_definition_id,
-                         "workspaceId": workspace_id,
-                     },
-                 )
-             ),
-         ).get("supportsNormalization", False)
-
-         norm_dest_def: bool = (
-             cast(
-                 Dict[str, Any],
-                 check.not_none(
-                     self.make_request_cached(
-                         endpoint="/destination_definitions/get",
-                         data={
-                             "destinationDefinitionId": destination_definition_id,
-                         },
-                     )
-                 ),
+             airbyte_workspace = AirbyteWorkspace(
+                 workspace_id=dg.EnvVar("AIRBYTE_WORKSPACE_ID"),
+                 client_id=dg.EnvVar("AIRBYTE_CLIENT_ID"),
+                 client_secret=dg.EnvVar("AIRBYTE_CLIENT_SECRET"),
              )
-             .get("normalizationConfig", {})
-             .get("supported", False)
-         )

-         return any([norm_dest_def_spec, norm_dest_def])
+             airbyte_specs = load_airbyte_asset_specs(airbyte_workspace)
+             dg.Definitions(assets=airbyte_specs)

-     def get_job_status(self, connection_id: str, job_id: int) -> Mapping[str, object]:
-         if self.forward_logs:
-             return check.not_none(self.make_request(endpoint="/jobs/get", data={"id": job_id}))
-         else:
-             # the "list all jobs" endpoint doesn't return logs, which actually makes it much more
-             # lightweight for long-running syncs with many logs
-             out = check.not_none(
-                 self.make_request(
-                     endpoint="/jobs/list",
-                     data={
-                         "configTypes": ["sync"],
-                         "configId": connection_id,
-                         # sync should be the most recent, so pageSize 5 is sufficient
-                         "pagination": {"pageSize": 5},
-                     },
+         Filter connections by name:
+
+         .. code-block:: python
+
+             from dagster_airbyte import AirbyteWorkspace, load_airbyte_asset_specs
+
+             import dagster as dg
+
+             airbyte_workspace = AirbyteWorkspace(
+                 workspace_id=dg.EnvVar("AIRBYTE_WORKSPACE_ID"),
+                 client_id=dg.EnvVar("AIRBYTE_CLIENT_ID"),
+                 client_secret=dg.EnvVar("AIRBYTE_CLIENT_SECRET"),
+             )
+
+             airbyte_specs = load_airbyte_asset_specs(
+                 workspace=airbyte_workspace,
+                 connection_selector_fn=lambda connection: connection.name in ["connection1", "connection2"]
+             )
+             dg.Definitions(assets=airbyte_specs)
+     """
+     dagster_airbyte_translator = dagster_airbyte_translator or DagsterAirbyteTranslator()
+
+     with workspace.process_config_and_initialize_cm_cached() as initialized_workspace:
+         return [
+             spec.merge_attributes(
+                 metadata={DAGSTER_AIRBYTE_TRANSLATOR_METADATA_KEY: dagster_airbyte_translator}
+             )
+             for spec in check.is_list(
+                 AirbyteWorkspaceDefsLoader(
+                     workspace=initialized_workspace,
+                     translator=dagster_airbyte_translator,
+                     connection_selector_fn=connection_selector_fn,
                  )
+                 .build_defs()
+                 .assets,
+                 AssetSpec,
              )
-         job = next((job for job in cast(List, out["jobs"]) if job["job"]["id"] == job_id), None)
+         ]

654
- return check.not_none(job)
655
960
 
656
- def start_sync(self, connection_id: str) -> Mapping[str, object]:
657
- return check.not_none(
658
- self.make_request(endpoint="/connections/sync", data={"connectionId": connection_id})
659
- )
961
+ @public
962
+ @superseded(additional_warn_text="Use load_airbyte_asset_specs instead.")
963
+ def load_airbyte_cloud_asset_specs(
964
+ workspace: AirbyteCloudWorkspace,
965
+ dagster_airbyte_translator: Optional[DagsterAirbyteTranslator] = None,
966
+ connection_selector_fn: Optional[Callable[[AirbyteConnection], bool]] = None,
967
+ ) -> Sequence[AssetSpec]:
968
+ """Returns a list of AssetSpecs representing the Airbyte content in the workspace.
660
969
 
661
- def get_connection_details(self, connection_id: str) -> Mapping[str, object]:
662
- return check.not_none(
663
- self.make_request(endpoint="/connections/get", data={"connectionId": connection_id})
664
- )
970
+ Args:
971
+ workspace (AirbyteCloudWorkspace): The Airbyte Cloud workspace to fetch assets from.
972
+ dagster_airbyte_translator (Optional[DagsterAirbyteTranslator], optional): The translator to use
973
+ to convert Airbyte content into :py:class:`dagster.AssetSpec`.
974
+ Defaults to :py:class:`DagsterAirbyteTranslator`.
975
+ connection_selector_fn (Optional[Callable[[AirbyteConnection], bool]]): A function that allows for filtering
976
+ which Airbyte connection assets are created for.
665
977
 
-    def sync_and_poll(
-        self,
-        connection_id: str,
-        poll_interval: Optional[float] = None,
-        poll_timeout: Optional[float] = None,
-    ) -> AirbyteOutput:
-        """Initializes a sync operation for the given connector, and polls until it completes.
+    Returns:
+        List[AssetSpec]: The set of assets representing the Airbyte content in the workspace.
 
-        Args:
-            connection_id (str): The Airbyte Connector ID. You can retrieve this value from the
-                "Connection" tab of a given connection in the Airbyte UI.
-            poll_interval (float): The time (in seconds) that will be waited between successive polls.
-            poll_timeout (float): The maximum time that will be waited before this operation is
-                timed out. By default, this will never time out.
+    Examples:
+        Loading the asset specs for a given Airbyte Cloud workspace:
 
-        Returns:
-            :py:class:`~AirbyteOutput`:
-                Details of the sync job.
-        """
-        connection_details = self.get_connection_details(connection_id)
-        job_details = self.start_sync(connection_id)
-        job_info = cast(Dict[str, object], job_details.get("job", {}))
-        job_id = cast(int, job_info.get("id"))
+        .. code-block:: python
 
-        self._log.info(f"Job {job_id} initialized for connection_id={connection_id}.")
-        start = time.monotonic()
-        logged_attempts = 0
-        logged_lines = 0
-        state = None
+            from dagster_airbyte import AirbyteCloudWorkspace, load_airbyte_cloud_asset_specs
 
-        try:
-            while True:
-                if poll_timeout and start + poll_timeout < time.monotonic():
-                    raise Failure(
-                        f"Timeout: Airbyte job {job_id} is not ready after the timeout"
-                        f" {poll_timeout} seconds"
-                    )
-                time.sleep(poll_interval or self.poll_interval)
-                job_details = self.get_job_status(connection_id, job_id)
-                attempts = cast(List, job_details.get("attempts", []))
-                cur_attempt = len(attempts)
-                # spit out the available Airbyte log info
-                if cur_attempt:
-                    if self.forward_logs:
-                        log_lines = attempts[logged_attempts].get("logs", {}).get("logLines", [])
-
-                        for line in log_lines[logged_lines:]:
-                            sys.stdout.write(line + "\n")
-                            sys.stdout.flush()
-                        logged_lines = len(log_lines)
-
-                    # if there's a next attempt, this one will have no more log messages
-                    if logged_attempts < cur_attempt - 1:
-                        logged_lines = 0
-                        logged_attempts += 1
-
-                job_info = cast(Dict[str, object], job_details.get("job", {}))
-                state = job_info.get("status")
-
-                if state in (AirbyteState.RUNNING, AirbyteState.PENDING, AirbyteState.INCOMPLETE):
-                    continue
-                elif state == AirbyteState.SUCCEEDED:
-                    break
-                elif state == AirbyteState.ERROR:
-                    raise Failure(f"Job failed: {job_id}")
-                elif state == AirbyteState.CANCELLED:
-                    raise Failure(f"Job was cancelled: {job_id}")
-                else:
-                    raise Failure(f"Encountered unexpected state `{state}` for job_id {job_id}")
-        finally:
-            # if Airbyte sync has not completed, make sure to cancel it so that it doesn't outlive
-            # the python process
-            if (
-                state not in (AirbyteState.SUCCEEDED, AirbyteState.ERROR, AirbyteState.CANCELLED)
-                and self.cancel_sync_on_run_termination
-            ):
-                self.cancel_job(job_id)
+            import dagster as dg

-        return AirbyteOutput(job_details=job_details, connection_details=connection_details)
+            airbyte_cloud_workspace = AirbyteCloudWorkspace(
+                workspace_id=dg.EnvVar("AIRBYTE_CLOUD_WORKSPACE_ID"),
+                client_id=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_ID"),
+                client_secret=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_SECRET"),
+            )
 
+            airbyte_cloud_specs = load_airbyte_cloud_asset_specs(airbyte_cloud_workspace)
+            dg.Definitions(assets=airbyte_cloud_specs)
 
-@dagster_maintained_resource
-@resource(config_schema=AirbyteResource.to_config_schema())
-def airbyte_resource(context) -> AirbyteResource:
-    """This resource allows users to programmatically interface with the Airbyte REST API to launch
-    syncs and monitor their progress. This currently implements only a subset of the functionality
-    exposed by the API.
+        Filter connections by name:
 
-    For a complete set of documentation on the Airbyte REST API, including expected response JSON
-    schema, see the `Airbyte API Docs <https://airbyte-public-api-docs.s3.us-east-2.amazonaws.com/rapidoc-api-docs.html#overview>`_.
+        .. code-block:: python
 
-    To configure this resource, we recommend using the `configured
-    <https://docs.dagster.io/concepts/configuration/configured>`_ method.
+            from dagster_airbyte import AirbyteCloudWorkspace, load_airbyte_cloud_asset_specs
 
-    **Examples:**
+            import dagster as dg
 
-    .. code-block:: python
+            airbyte_cloud_workspace = AirbyteCloudWorkspace(
+                workspace_id=dg.EnvVar("AIRBYTE_CLOUD_WORKSPACE_ID"),
+                client_id=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_ID"),
+                client_secret=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_SECRET"),
+            )
 
-        from dagster import job
-        from dagster_airbyte import airbyte_resource
+            airbyte_cloud_specs = load_airbyte_cloud_asset_specs(
+                workspace=airbyte_cloud_workspace,
+                connection_selector_fn=lambda connection: connection.name in ["connection1", "connection2"]
+            )
+            dg.Definitions(assets=airbyte_cloud_specs)
+    """
+    return load_airbyte_asset_specs(
+        workspace=workspace,
+        dagster_airbyte_translator=dagster_airbyte_translator,
+        connection_selector_fn=connection_selector_fn,
+    )
 
-        my_airbyte_resource = airbyte_resource.configured(
-            {
-                "host": {"env": "AIRBYTE_HOST"},
-                "port": {"env": "AIRBYTE_PORT"},
-                # If using basic auth
-                "username": {"env": "AIRBYTE_USERNAME"},
-                "password": {"env": "AIRBYTE_PASSWORD"},
-            }
-        )
 
-        @job(resource_defs={"airbyte": my_airbyte_resource})
-        def my_airbyte_job():
-            ...
+@record
+class AirbyteWorkspaceDefsLoader(StateBackedDefinitionsLoader[AirbyteWorkspaceData]):
+    workspace: Union[AirbyteWorkspace, AirbyteCloudWorkspace]
+    translator: DagsterAirbyteTranslator
+    connection_selector_fn: Optional[Callable[[AirbyteConnection], bool]]
 
-    """
-    return AirbyteResource.from_resource_context(context)
+    @property
+    def defs_key(self) -> str:
+        return f"{AIRBYTE_RECONSTRUCTION_METADATA_KEY_PREFIX}.{self.workspace.workspace_id}"
 
+    def fetch_state(self) -> AirbyteWorkspaceData:
+        return self.workspace.fetch_airbyte_workspace_data()
 
-@dagster_maintained_resource
-@resource(config_schema=infer_schema_from_config_class(AirbyteCloudResource))
-def airbyte_cloud_resource(context) -> AirbyteCloudResource:
-    """This resource allows users to programmatically interface with the Airbyte Cloud REST API to launch
-    syncs and monitor their progress. Currently, this resource may only be used with the more basic
-    `dagster-airbyte` APIs, including the ops and assets.
+    def defs_from_state(self, state: AirbyteWorkspaceData) -> Definitions:
+        all_asset_specs = [
+            self.translator.get_asset_spec(props)
+            for props in state.to_airbyte_connection_table_props_data()
+            if not self.connection_selector_fn
+            or self.connection_selector_fn(state.connections_by_id[props.connection_id])
+        ]
 
-    """
-    return AirbyteCloudResource.from_resource_context(context)
+        return Definitions(assets=all_asset_specs)
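For reference, a brief sketch of how this loader fits together, assuming `build_defs()` is inherited from `StateBackedDefinitionsLoader` and dispatches to `fetch_state` and `defs_from_state` (`load_airbyte_asset_specs` drives it exactly this way earlier in this diff); `initialized_workspace` is a placeholder for a processed workspace:

.. code-block:: python

    # initialized_workspace: a processed AirbyteWorkspace, as produced by
    # workspace.process_config_and_initialize_cm_cached() above.
    loader = AirbyteWorkspaceDefsLoader(
        workspace=initialized_workspace,
        translator=DagsterAirbyteTranslator(),
        connection_selector_fn=None,
    )

    # build_defs() either fetches fresh workspace state via fetch_state() or
    # rehydrates a snapshot keyed by defs_key, then calls defs_from_state().
    defs = loader.build_defs()
    asset_specs = defs.assets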