dagster-airbyte 0.23.7__py3-none-any.whl → 0.25.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dagster-airbyte might be problematic. Click here for more details.

@@ -4,42 +4,71 @@ import logging
4
4
  import sys
5
5
  import time
6
6
  from abc import abstractmethod
7
+ from collections.abc import Mapping, Sequence
7
8
  from contextlib import contextmanager
8
- from typing import Any, Dict, List, Mapping, Optional, cast
9
+ from datetime import datetime, timedelta
10
+ from typing import Any, Optional, cast
9
11
 
10
12
  import requests
11
13
  from dagster import (
14
+ AssetExecutionContext,
15
+ AssetMaterialization,
12
16
  ConfigurableResource,
17
+ Definitions,
13
18
  Failure,
19
+ InitResourceContext,
20
+ MaterializeResult,
14
21
  _check as check,
15
22
  get_dagster_logger,
16
23
  resource,
17
24
  )
25
+ from dagster._annotations import experimental, public
18
26
  from dagster._config.pythonic_config import infer_schema_from_config_class
27
+ from dagster._core.definitions.asset_spec import AssetSpec
28
+ from dagster._core.definitions.definitions_load_context import StateBackedDefinitionsLoader
19
29
  from dagster._core.definitions.resource_definition import dagster_maintained_resource
30
+ from dagster._model import DagsterModel
31
+ from dagster._record import record
20
32
  from dagster._utils.cached_method import cached_method
21
33
  from dagster._utils.merger import deep_merge_dicts
22
- from pydantic import Field
34
+ from pydantic import Field, PrivateAttr
23
35
  from requests.exceptions import RequestException
24
36
 
37
+ from dagster_airbyte.translator import (
38
+ AirbyteConnection,
39
+ AirbyteConnectionTableProps,
40
+ AirbyteDestination,
41
+ AirbyteJob,
42
+ AirbyteJobStatusType,
43
+ AirbyteMetadataSet,
44
+ AirbyteWorkspaceData,
45
+ DagsterAirbyteTranslator,
46
+ )
25
47
  from dagster_airbyte.types import AirbyteOutput
48
+ from dagster_airbyte.utils import (
49
+ DAGSTER_AIRBYTE_TRANSLATOR_METADATA_KEY,
50
+ get_airbyte_connection_table_name,
51
+ get_translator_from_airbyte_assets,
52
+ )
53
+
54
+ AIRBYTE_REST_API_BASE = "https://api.airbyte.com"
55
+ AIRBYTE_REST_API_VERSION = "v1"
56
+
57
+ AIRBYTE_CONFIGURATION_API_BASE = "https://cloud.airbyte.com/api"
58
+ AIRBYTE_CONFIGURATION_API_VERSION = "v1"
26
59
 
27
60
  DEFAULT_POLL_INTERVAL_SECONDS = 10
28
61
 
62
+ # The access token expires every 3 minutes in Airbyte Cloud.
63
+ # Refresh is needed after 2.5 minutes to avoid the "token expired" error message.
64
+ AIRBYTE_CLOUD_REFRESH_TIMEDELTA_SECONDS = 150
29
65
 
30
- class AirbyteState:
31
- RUNNING = "running"
32
- SUCCEEDED = "succeeded"
33
- CANCELLED = "cancelled"
34
- PENDING = "pending"
35
- FAILED = "failed"
36
- ERROR = "error"
37
- INCOMPLETE = "incomplete"
66
+ AIRBYTE_RECONSTRUCTION_METADATA_KEY_PREFIX = "dagster-airbyte/reconstruction_metadata"
38
67
 
39
68
 
40
69
  class AirbyteResourceState:
41
70
  def __init__(self) -> None:
42
- self.request_cache: Dict[str, Optional[Mapping[str, object]]] = {}
71
+ self.request_cache: dict[str, Optional[Mapping[str, object]]] = {}
43
72
  # Int in case we nest contexts
44
73
  self.cache_enabled = 0
45
74
 
@@ -94,7 +123,11 @@ class BaseAirbyteResource(ConfigurableResource):
94
123
  raise NotImplementedError()
95
124
 
96
125
  def make_request(
97
- self, endpoint: str, data: Optional[Mapping[str, object]] = None, method: str = "POST"
126
+ self,
127
+ endpoint: str,
128
+ data: Optional[Mapping[str, object]] = None,
129
+ method: str = "POST",
130
+ include_additional_request_params: bool = True,
98
131
  ) -> Optional[Mapping[str, object]]:
99
132
  """Creates and sends a request to the desired Airbyte REST API endpoint.
100
133
 
@@ -111,7 +144,7 @@ class BaseAirbyteResource(ConfigurableResource):
111
144
  num_retries = 0
112
145
  while True:
113
146
  try:
114
- request_args: Dict[str, Any] = dict(
147
+ request_args: dict[str, Any] = dict(
115
148
  method=method,
116
149
  url=url,
117
150
  headers=headers,
@@ -120,10 +153,11 @@ class BaseAirbyteResource(ConfigurableResource):
120
153
  if data:
121
154
  request_args["json"] = data
122
155
 
123
- request_args = deep_merge_dicts(
124
- request_args,
125
- self.all_additional_request_params,
126
- )
156
+ if include_additional_request_params:
157
+ request_args = deep_merge_dicts(
158
+ request_args,
159
+ self.all_additional_request_params,
160
+ )
127
161
 
128
162
  response = requests.request(
129
163
  **request_args,
@@ -183,7 +217,7 @@ class BaseAirbyteResource(ConfigurableResource):
183
217
  """
184
218
  connection_details = self.get_connection_details(connection_id)
185
219
  job_details = self.start_sync(connection_id)
186
- job_info = cast(Dict[str, object], job_details.get("job", {}))
220
+ job_info = cast(dict[str, object], job_details.get("job", {}))
187
221
  job_id = cast(int, job_info.get("id"))
188
222
 
189
223
  self._log.info(f"Job {job_id} initialized for connection_id={connection_id}.")
@@ -201,7 +235,7 @@ class BaseAirbyteResource(ConfigurableResource):
201
235
  )
202
236
  time.sleep(poll_interval or self.poll_interval)
203
237
  job_details = self.get_job_status(connection_id, job_id)
204
- attempts = cast(List, job_details.get("attempts", []))
238
+ attempts = cast(list, job_details.get("attempts", []))
205
239
  cur_attempt = len(attempts)
206
240
  # spit out the available Airbyte log info
207
241
  if cur_attempt:
@@ -218,16 +252,20 @@ class BaseAirbyteResource(ConfigurableResource):
218
252
  logged_lines = 0
219
253
  logged_attempts += 1
220
254
 
221
- job_info = cast(Dict[str, object], job_details.get("job", {}))
255
+ job_info = cast(dict[str, object], job_details.get("job", {}))
222
256
  state = job_info.get("status")
223
257
 
224
- if state in (AirbyteState.RUNNING, AirbyteState.PENDING, AirbyteState.INCOMPLETE):
258
+ if state in (
259
+ AirbyteJobStatusType.RUNNING,
260
+ AirbyteJobStatusType.PENDING,
261
+ AirbyteJobStatusType.INCOMPLETE,
262
+ ):
225
263
  continue
226
- elif state == AirbyteState.SUCCEEDED:
264
+ elif state == AirbyteJobStatusType.SUCCEEDED:
227
265
  break
228
- elif state == AirbyteState.ERROR:
266
+ elif state == AirbyteJobStatusType.ERROR:
229
267
  raise Failure(f"Job failed: {job_id}")
230
- elif state == AirbyteState.CANCELLED:
268
+ elif state == AirbyteJobStatusType.CANCELLED:
231
269
  raise Failure(f"Job was cancelled: {job_id}")
232
270
  else:
233
271
  raise Failure(f"Encountered unexpected state `{state}` for job_id {job_id}")
@@ -235,7 +273,12 @@ class BaseAirbyteResource(ConfigurableResource):
235
273
  # if Airbyte sync has not completed, make sure to cancel it so that it doesn't outlive
236
274
  # the python process
237
275
  if (
238
- state not in (AirbyteState.SUCCEEDED, AirbyteState.ERROR, AirbyteState.CANCELLED)
276
+ state
277
+ not in (
278
+ AirbyteJobStatusType.SUCCEEDED,
279
+ AirbyteJobStatusType.ERROR,
280
+ AirbyteJobStatusType.CANCELLED,
281
+ )
239
282
  and self.cancel_sync_on_run_termination
240
283
  ):
241
284
  self.cancel_job(job_id)
@@ -244,7 +287,7 @@ class BaseAirbyteResource(ConfigurableResource):
244
287
 
245
288
 
246
289
  class AirbyteCloudResource(BaseAirbyteResource):
247
- """This resource allows users to programatically interface with the Airbyte Cloud API to launch
290
+ """This resource allows users to programmatically interface with the Airbyte Cloud API to launch
248
291
  syncs and monitor their progress.
249
292
 
250
293
  **Examples:**
@@ -255,7 +298,8 @@ class AirbyteCloudResource(BaseAirbyteResource):
255
298
  from dagster_airbyte import AirbyteResource
256
299
 
257
300
  my_airbyte_resource = AirbyteCloudResource(
258
- api_key=EnvVar("AIRBYTE_API_KEY"),
301
+ client_id=EnvVar("AIRBYTE_CLIENT_ID"),
302
+ client_secret=EnvVar("AIRBYTE_CLIENT_SECRET"),
259
303
  )
260
304
 
261
305
  airbyte_assets = build_airbyte_assets(
@@ -269,7 +313,15 @@ class AirbyteCloudResource(BaseAirbyteResource):
269
313
  )
270
314
  """
271
315
 
272
- api_key: str = Field(..., description="The Airbyte Cloud API key.")
316
+ client_id: str = Field(..., description="The Airbyte Cloud client ID.")
317
+ client_secret: str = Field(..., description="The Airbyte Cloud client secret.")
318
+
319
+ _access_token_value: Optional[str] = PrivateAttr(default=None)
320
+ _access_token_timestamp: Optional[float] = PrivateAttr(default=None)
321
+
322
+ def setup_for_execution(self, context: InitResourceContext) -> None:
323
+ # Refresh access token when the resource is initialized
324
+ self._refresh_access_token()
273
325
 
274
326
  @property
275
327
  def api_base_url(self) -> str:
@@ -277,7 +329,32 @@ class AirbyteCloudResource(BaseAirbyteResource):
277
329
 
278
330
  @property
279
331
  def all_additional_request_params(self) -> Mapping[str, Any]:
280
- return {"headers": {"Authorization": f"Bearer {self.api_key}", "User-Agent": "dagster"}}
332
+ # Make sure the access token is refreshed before using it when calling the API.
333
+ if self._needs_refreshed_access_token():
334
+ self._refresh_access_token()
335
+ return {
336
+ "headers": {
337
+ "Authorization": f"Bearer {self._access_token_value}",
338
+ "User-Agent": "dagster",
339
+ }
340
+ }
341
+
342
+ def make_request(
343
+ self,
344
+ endpoint: str,
345
+ data: Optional[Mapping[str, object]] = None,
346
+ method: str = "POST",
347
+ include_additional_request_params: bool = True,
348
+ ) -> Optional[Mapping[str, object]]:
349
+ # Make sure the access token is refreshed before using it when calling the API.
350
+ if include_additional_request_params and self._needs_refreshed_access_token():
351
+ self._refresh_access_token()
352
+ return super().make_request(
353
+ endpoint=endpoint,
354
+ data=data,
355
+ method=method,
356
+ include_additional_request_params=include_additional_request_params,
357
+ )
281
358
 
282
359
  def start_sync(self, connection_id: str) -> Mapping[str, object]:
283
360
  job_sync = check.not_none(
@@ -306,6 +383,31 @@ class AirbyteCloudResource(BaseAirbyteResource):
306
383
  # Airbyte Cloud does not support streaming logs yet
307
384
  return False
308
385
 
386
+ def _refresh_access_token(self) -> None:
387
+ response = check.not_none(
388
+ self.make_request(
389
+ endpoint="/applications/token",
390
+ data={
391
+ "client_id": self.client_id,
392
+ "client_secret": self.client_secret,
393
+ },
394
+ # Must not pass the bearer access token when refreshing it.
395
+ include_additional_request_params=False,
396
+ )
397
+ )
398
+ self._access_token_value = str(response["access_token"])
399
+ self._access_token_timestamp = datetime.now().timestamp()
400
+
401
+ def _needs_refreshed_access_token(self) -> bool:
402
+ return (
403
+ not self._access_token_value
404
+ or not self._access_token_timestamp
405
+ or self._access_token_timestamp
406
+ <= datetime.timestamp(
407
+ datetime.now() - timedelta(seconds=AIRBYTE_CLOUD_REFRESH_TIMEDELTA_SECONDS)
408
+ )
409
+ )
410
+
309
411
 
310
412
  class AirbyteResource(BaseAirbyteResource):
311
413
  """This resource allows users to programatically interface with the Airbyte REST API to launch
@@ -469,7 +571,7 @@ class AirbyteResource(BaseAirbyteResource):
469
571
 
470
572
  def get_default_workspace(self) -> str:
471
573
  workspaces = cast(
472
- List[Dict[str, Any]],
574
+ list[dict[str, Any]],
473
575
  check.not_none(self.make_request_cached(endpoint="/workspaces/list", data={})).get(
474
576
  "workspaces", []
475
577
  ),
@@ -481,7 +583,7 @@ class AirbyteResource(BaseAirbyteResource):
481
583
  definitions = check.not_none(
482
584
  self.make_request_cached(endpoint="/source_definitions/list", data={})
483
585
  )
484
- source_definitions = cast(List[Dict[str, Any]], definitions["sourceDefinitions"])
586
+ source_definitions = cast(list[dict[str, Any]], definitions["sourceDefinitions"])
485
587
 
486
588
  return next(
487
589
  (
@@ -495,7 +597,7 @@ class AirbyteResource(BaseAirbyteResource):
495
597
  def get_destination_definition_by_name(self, name: str):
496
598
  name_lower = name.lower()
497
599
  definitions = cast(
498
- Dict[str, List[Dict[str, str]]],
600
+ dict[str, list[dict[str, str]]],
499
601
  check.not_none(
500
602
  self.make_request_cached(endpoint="/destination_definitions/list", data={})
501
603
  ),
@@ -511,7 +613,7 @@ class AirbyteResource(BaseAirbyteResource):
511
613
 
512
614
  def get_source_catalog_id(self, source_id: str):
513
615
  result = cast(
514
- Dict[str, Any],
616
+ dict[str, Any],
515
617
  check.not_none(
516
618
  self.make_request(endpoint="/sources/discover_schema", data={"sourceId": source_id})
517
619
  ),
@@ -520,7 +622,7 @@ class AirbyteResource(BaseAirbyteResource):
520
622
 
521
623
  def get_source_schema(self, source_id: str) -> Mapping[str, Any]:
522
624
  return cast(
523
- Dict[str, Any],
625
+ dict[str, Any],
524
626
  check.not_none(
525
627
  self.make_request(endpoint="/sources/discover_schema", data={"sourceId": source_id})
526
628
  ),
@@ -532,7 +634,7 @@ class AirbyteResource(BaseAirbyteResource):
532
634
  # Airbyte API changed source of truth for normalization in PR
533
635
  # https://github.com/airbytehq/airbyte/pull/21005
534
636
  norm_dest_def_spec: bool = cast(
535
- Dict[str, Any],
637
+ dict[str, Any],
536
638
  check.not_none(
537
639
  self.make_request_cached(
538
640
  endpoint="/destination_definition_specifications/get",
@@ -546,7 +648,7 @@ class AirbyteResource(BaseAirbyteResource):
546
648
 
547
649
  norm_dest_def: bool = (
548
650
  cast(
549
- Dict[str, Any],
651
+ dict[str, Any],
550
652
  check.not_none(
551
653
  self.make_request_cached(
552
654
  endpoint="/destination_definitions/get",
@@ -579,7 +681,7 @@ class AirbyteResource(BaseAirbyteResource):
579
681
  },
580
682
  )
581
683
  )
582
- job = next((job for job in cast(List, out["jobs"]) if job["job"]["id"] == job_id), None)
684
+ job = next((job for job in cast(list, out["jobs"]) if job["job"]["id"] == job_id), None)
583
685
 
584
686
  return check.not_none(job)
585
687
 
@@ -614,7 +716,7 @@ class AirbyteResource(BaseAirbyteResource):
614
716
  """
615
717
  connection_details = self.get_connection_details(connection_id)
616
718
  job_details = self.start_sync(connection_id)
617
- job_info = cast(Dict[str, object], job_details.get("job", {}))
719
+ job_info = cast(dict[str, object], job_details.get("job", {}))
618
720
  job_id = cast(int, job_info.get("id"))
619
721
 
620
722
  self._log.info(f"Job {job_id} initialized for connection_id={connection_id}.")
@@ -632,7 +734,7 @@ class AirbyteResource(BaseAirbyteResource):
632
734
  )
633
735
  time.sleep(poll_interval or self.poll_interval)
634
736
  job_details = self.get_job_status(connection_id, job_id)
635
- attempts = cast(List, job_details.get("attempts", []))
737
+ attempts = cast(list, job_details.get("attempts", []))
636
738
  cur_attempt = len(attempts)
637
739
  # spit out the available Airbyte log info
638
740
  if cur_attempt:
@@ -649,16 +751,20 @@ class AirbyteResource(BaseAirbyteResource):
649
751
  logged_lines = 0
650
752
  logged_attempts += 1
651
753
 
652
- job_info = cast(Dict[str, object], job_details.get("job", {}))
754
+ job_info = cast(dict[str, object], job_details.get("job", {}))
653
755
  state = job_info.get("status")
654
756
 
655
- if state in (AirbyteState.RUNNING, AirbyteState.PENDING, AirbyteState.INCOMPLETE):
757
+ if state in (
758
+ AirbyteJobStatusType.RUNNING,
759
+ AirbyteJobStatusType.PENDING,
760
+ AirbyteJobStatusType.INCOMPLETE,
761
+ ):
656
762
  continue
657
- elif state == AirbyteState.SUCCEEDED:
763
+ elif state == AirbyteJobStatusType.SUCCEEDED:
658
764
  break
659
- elif state == AirbyteState.ERROR:
765
+ elif state == AirbyteJobStatusType.ERROR:
660
766
  raise Failure(f"Job failed: {job_id}")
661
- elif state == AirbyteState.CANCELLED:
767
+ elif state == AirbyteJobStatusType.CANCELLED:
662
768
  raise Failure(f"Job was cancelled: {job_id}")
663
769
  else:
664
770
  raise Failure(f"Encountered unexpected state `{state}` for job_id {job_id}")
@@ -666,7 +772,12 @@ class AirbyteResource(BaseAirbyteResource):
666
772
  # if Airbyte sync has not completed, make sure to cancel it so that it doesn't outlive
667
773
  # the python process
668
774
  if (
669
- state not in (AirbyteState.SUCCEEDED, AirbyteState.ERROR, AirbyteState.CANCELLED)
775
+ state
776
+ not in (
777
+ AirbyteJobStatusType.SUCCEEDED,
778
+ AirbyteJobStatusType.ERROR,
779
+ AirbyteJobStatusType.CANCELLED,
780
+ )
670
781
  and self.cancel_sync_on_run_termination
671
782
  ):
672
783
  self.cancel_job(job_id)
@@ -721,3 +832,552 @@ def airbyte_cloud_resource(context) -> AirbyteCloudResource:
721
832
 
722
833
  """
723
834
  return AirbyteCloudResource.from_resource_context(context)
835
+
836
+
837
+ # -------------
838
+ # Resources v2
839
+ # -------------
840
+
841
+
842
+ @experimental
843
+ class AirbyteCloudClient(DagsterModel):
844
+ """This class exposes methods on top of the Airbyte APIs for Airbyte Cloud."""
845
+
846
+ workspace_id: str = Field(..., description="The Airbyte workspace ID")
847
+ client_id: str = Field(..., description="The Airbyte client ID.")
848
+ client_secret: str = Field(..., description="The Airbyte client secret.")
849
+ request_max_retries: int = Field(
850
+ ...,
851
+ description=(
852
+ "The maximum number of times requests to the Airbyte API should be retried "
853
+ "before failing."
854
+ ),
855
+ )
856
+ request_retry_delay: float = Field(
857
+ ...,
858
+ description="Time (in seconds) to wait between each request retry.",
859
+ )
860
+ request_timeout: int = Field(
861
+ ...,
862
+ description="Time (in seconds) after which the requests to Airbyte are declared timed out.",
863
+ )
864
+
865
+ _access_token_value: Optional[str] = PrivateAttr(default=None)
866
+ _access_token_timestamp: Optional[float] = PrivateAttr(default=None)
867
+
868
+ @property
869
+ @cached_method
870
+ def _log(self) -> logging.Logger:
871
+ return get_dagster_logger()
872
+
873
+ @property
874
+ def rest_api_base_url(self) -> str:
875
+ return f"{AIRBYTE_REST_API_BASE}/{AIRBYTE_REST_API_VERSION}"
876
+
877
+ @property
878
+ def configuration_api_base_url(self) -> str:
879
+ return f"{AIRBYTE_CONFIGURATION_API_BASE}/{AIRBYTE_CONFIGURATION_API_VERSION}"
880
+
881
+ @property
882
+ def all_additional_request_params(self) -> Mapping[str, Any]:
883
+ return {**self.authorization_request_params, **self.user_agent_request_params}
884
+
885
+ @property
886
+ def authorization_request_params(self) -> Mapping[str, Any]:
887
+ # Make sure the access token is refreshed before using it when calling the API.
888
+ if self._needs_refreshed_access_token():
889
+ self._refresh_access_token()
890
+ return {
891
+ "Authorization": f"Bearer {self._access_token_value}",
892
+ }
893
+
894
+ @property
895
+ def user_agent_request_params(self) -> Mapping[str, Any]:
896
+ return {
897
+ "User-Agent": "dagster",
898
+ }
899
+
900
+ def _refresh_access_token(self) -> None:
901
+ response = check.not_none(
902
+ self._make_request(
903
+ method="POST",
904
+ endpoint="applications/token",
905
+ base_url=self.rest_api_base_url,
906
+ data={
907
+ "client_id": self.client_id,
908
+ "client_secret": self.client_secret,
909
+ },
910
+ # Must not pass the bearer access token when refreshing it.
911
+ include_additional_request_params=False,
912
+ )
913
+ )
914
+ self._access_token_value = str(response["access_token"])
915
+ self._access_token_timestamp = datetime.now().timestamp()
916
+
917
+ def _needs_refreshed_access_token(self) -> bool:
918
+ return (
919
+ not self._access_token_value
920
+ or not self._access_token_timestamp
921
+ or self._access_token_timestamp
922
+ <= (
923
+ datetime.now() - timedelta(seconds=AIRBYTE_CLOUD_REFRESH_TIMEDELTA_SECONDS)
924
+ ).timestamp()
925
+ )
926
+
927
+ def _get_session(self, include_additional_request_params: bool) -> requests.Session:
928
+ headers = {"accept": "application/json"}
929
+ if include_additional_request_params:
930
+ headers = {
931
+ **headers,
932
+ **self.all_additional_request_params,
933
+ }
934
+ session = requests.Session()
935
+ session.headers.update(headers)
936
+ return session
937
+
938
+ def _make_request(
939
+ self,
940
+ method: str,
941
+ endpoint: str,
942
+ base_url: str,
943
+ data: Optional[Mapping[str, Any]] = None,
944
+ params: Optional[Mapping[str, Any]] = None,
945
+ include_additional_request_params: bool = True,
946
+ ) -> Mapping[str, Any]:
947
+ """Creates and sends a request to the desired Airbyte REST API endpoint.
948
+
949
+ Args:
950
+ method (str): The http method to use for this request (e.g. "POST", "GET", "PATCH").
951
+ endpoint (str): The Airbyte API endpoint to send this request to.
952
+ base_url (str): The base url to the Airbyte API to use.
953
+ data (Optional[Dict[str, Any]]): JSON-formatted data string to be included in the request.
954
+ params (Optional[Dict[str, Any]]): JSON-formatted query params to be included in the request.
955
+ include_additional_request_params (bool): Whether to include authorization and user-agent headers
956
+ to the request parameters. Defaults to True.
957
+
958
+ Returns:
959
+ Dict[str, Any]: Parsed json data from the response to this request
960
+ """
961
+ url = f"{base_url}/{endpoint}"
962
+
963
+ num_retries = 0
964
+ while True:
965
+ try:
966
+ session = self._get_session(
967
+ include_additional_request_params=include_additional_request_params
968
+ )
969
+ response = session.request(
970
+ method=method, url=url, json=data, params=params, timeout=self.request_timeout
971
+ )
972
+ response.raise_for_status()
973
+ return response.json()
974
+ except RequestException as e:
975
+ self._log.error(
976
+ f"Request to Airbyte API failed for url {url} with method {method} : {e}"
977
+ )
978
+ if num_retries == self.request_max_retries:
979
+ break
980
+ num_retries += 1
981
+ time.sleep(self.request_retry_delay)
982
+
983
+ raise Failure(f"Max retries ({self.request_max_retries}) exceeded with url: {url}.")
984
+
985
+ def get_connections(self) -> Mapping[str, Any]:
986
+ """Fetches all connections of an Airbyte workspace from the Airbyte REST API."""
987
+ return self._make_request(
988
+ method="GET",
989
+ endpoint="connections",
990
+ base_url=self.rest_api_base_url,
991
+ params={"workspaceIds": self.workspace_id},
992
+ )
993
+
994
+ def get_connection_details(self, connection_id) -> Mapping[str, Any]:
995
+ """Fetches details about a given connection from the Airbyte Configuration API.
996
+ The Airbyte Configuration API is an internal and may change in the future.
997
+ """
998
+ # Using the Airbyte Configuration API to get the connection details, including streams and their configs.
999
+ # https://airbyte-public-api-docs.s3.us-east-2.amazonaws.com/rapidoc-api-docs.html#post-/v1/connections/get
1000
+ # https://github.com/airbytehq/airbyte-platform/blob/v1.0.0/airbyte-api/server-api/src/main/openapi/config.yaml
1001
+ return self._make_request(
1002
+ method="POST",
1003
+ endpoint="connections/get",
1004
+ base_url=self.configuration_api_base_url,
1005
+ data={"connectionId": connection_id},
1006
+ )
1007
+
1008
+ def get_destination_details(self, destination_id: str) -> Mapping[str, Any]:
1009
+ """Fetches details about a given destination from the Airbyte REST API."""
1010
+ return self._make_request(
1011
+ method="GET",
1012
+ endpoint=f"destinations/{destination_id}",
1013
+ base_url=self.rest_api_base_url,
1014
+ )
1015
+
1016
+ def start_sync_job(self, connection_id: str) -> Mapping[str, Any]:
1017
+ return self._make_request(
1018
+ method="POST",
1019
+ endpoint="jobs",
1020
+ base_url=self.rest_api_base_url,
1021
+ data={
1022
+ "connectionId": connection_id,
1023
+ "jobType": "sync",
1024
+ },
1025
+ )
1026
+
1027
+ def get_job_details(self, job_id: int) -> Mapping[str, Any]:
1028
+ return self._make_request(
1029
+ method="GET", endpoint=f"jobs/{job_id}", base_url=self.rest_api_base_url
1030
+ )
1031
+
1032
+ def cancel_job(self, job_id: int) -> Mapping[str, Any]:
1033
+ return self._make_request(
1034
+ method="DELETE", endpoint=f"jobs/{job_id}", base_url=self.rest_api_base_url
1035
+ )
1036
+
1037
+ def sync_and_poll(
1038
+ self,
1039
+ connection_id: str,
1040
+ poll_interval: Optional[float] = None,
1041
+ poll_timeout: Optional[float] = None,
1042
+ cancel_on_termination: bool = True,
1043
+ ) -> AirbyteOutput:
1044
+ """Initializes a sync operation for the given connection, and polls until it completes.
1045
+
1046
+ Args:
1047
+ connection_id (str): The Airbyte Connection ID. You can retrieve this value from the
1048
+ "Connection" tab of a given connection in the Airbyte UI.
1049
+ poll_interval (float): The time (in seconds) that will be waited between successive polls.
1050
+ poll_timeout (float): The maximum time that will wait before this operation is timed
1051
+ out. By default, this will never time out.
1052
+ cancel_on_termination (bool): Whether to cancel a sync in Airbyte if the Dagster runner is terminated.
1053
+ This may be useful to disable if using Airbyte sources that cannot be cancelled and
1054
+ resumed easily, or if your Dagster deployment may experience runner interruptions
1055
+ that do not impact your Airbyte deployment.
1056
+
1057
+ Returns:
1058
+ :py:class:`~AirbyteOutput`:
1059
+ Details of the sync job.
1060
+ """
1061
+ connection_details = self.get_connection_details(connection_id)
1062
+ start_job_details = self.start_sync_job(connection_id)
1063
+ job = AirbyteJob.from_job_details(job_details=start_job_details)
1064
+
1065
+ self._log.info(f"Job {job.id} initialized for connection_id={connection_id}.")
1066
+ poll_start = datetime.now()
1067
+ poll_interval = (
1068
+ poll_interval if poll_interval is not None else DEFAULT_POLL_INTERVAL_SECONDS
1069
+ )
1070
+ try:
1071
+ while True:
1072
+ if poll_timeout and datetime.now() > poll_start + timedelta(seconds=poll_timeout):
1073
+ raise Failure(
1074
+ f"Timeout: Airbyte job {job.id} is not ready after the timeout"
1075
+ f" {poll_timeout} seconds"
1076
+ )
1077
+
1078
+ time.sleep(poll_interval)
1079
+ # We return these job details in the AirbyteOutput when the job succeeds
1080
+ poll_job_details = self.get_job_details(job.id)
1081
+ job = AirbyteJob.from_job_details(job_details=poll_job_details)
1082
+ if job.status in (
1083
+ AirbyteJobStatusType.RUNNING,
1084
+ AirbyteJobStatusType.PENDING,
1085
+ AirbyteJobStatusType.INCOMPLETE,
1086
+ ):
1087
+ continue
1088
+ elif job.status == AirbyteJobStatusType.SUCCEEDED:
1089
+ break
1090
+ elif job.status in [AirbyteJobStatusType.ERROR, AirbyteJobStatusType.FAILED]:
1091
+ raise Failure(f"Job failed: {job.id}")
1092
+ elif job.status == AirbyteJobStatusType.CANCELLED:
1093
+ raise Failure(f"Job was cancelled: {job.id}")
1094
+ else:
1095
+ raise Failure(
1096
+ f"Encountered unexpected state `{job.status}` for job_id {job.id}"
1097
+ )
1098
+ finally:
1099
+ # if Airbyte sync has not completed, make sure to cancel it so that it doesn't outlive
1100
+ # the python process
1101
+ if cancel_on_termination and job.status not in (
1102
+ AirbyteJobStatusType.SUCCEEDED,
1103
+ AirbyteJobStatusType.ERROR,
1104
+ AirbyteJobStatusType.CANCELLED,
1105
+ AirbyteJobStatusType.FAILED,
1106
+ ):
1107
+ self.cancel_job(job.id)
1108
+
1109
+ return AirbyteOutput(job_details=poll_job_details, connection_details=connection_details)
1110
+
1111
+
1112
+ @experimental
1113
+ class AirbyteCloudWorkspace(ConfigurableResource):
1114
+ """This class represents a Airbyte Cloud workspace and provides utilities
1115
+ to interact with Airbyte APIs.
1116
+ """
1117
+
1118
+ workspace_id: str = Field(..., description="The Airbyte Cloud workspace ID")
1119
+ client_id: str = Field(..., description="The Airbyte Cloud client ID.")
1120
+ client_secret: str = Field(..., description="The Airbyte Cloud client secret.")
1121
+ request_max_retries: int = Field(
1122
+ default=3,
1123
+ description=(
1124
+ "The maximum number of times requests to the Airbyte API should be retried "
1125
+ "before failing."
1126
+ ),
1127
+ )
1128
+ request_retry_delay: float = Field(
1129
+ default=0.25,
1130
+ description="Time (in seconds) to wait between each request retry.",
1131
+ )
1132
+ request_timeout: int = Field(
1133
+ default=15,
1134
+ description="Time (in seconds) after which the requests to Airbyte are declared timed out.",
1135
+ )
1136
+
1137
+ _client: AirbyteCloudClient = PrivateAttr(default=None) # type: ignore
1138
+
1139
+ @cached_method
1140
+ def get_client(self) -> AirbyteCloudClient:
1141
+ return AirbyteCloudClient(
1142
+ workspace_id=self.workspace_id,
1143
+ client_id=self.client_id,
1144
+ client_secret=self.client_secret,
1145
+ request_max_retries=self.request_max_retries,
1146
+ request_retry_delay=self.request_retry_delay,
1147
+ request_timeout=self.request_timeout,
1148
+ )
1149
+
1150
+ def fetch_airbyte_workspace_data(
1151
+ self,
1152
+ ) -> AirbyteWorkspaceData:
1153
+ """Retrieves all Airbyte content from the workspace and returns it as a AirbyteWorkspaceData object.
1154
+
1155
+ Returns:
1156
+ AirbyteWorkspaceData: A snapshot of the Airbyte workspace's content.
1157
+ """
1158
+ connections_by_id = {}
1159
+ destinations_by_id = {}
1160
+
1161
+ client = self.get_client()
1162
+ connections = client.get_connections()["data"]
1163
+
1164
+ for partial_connection_details in connections:
1165
+ full_connection_details = client.get_connection_details(
1166
+ connection_id=partial_connection_details["connectionId"]
1167
+ )
1168
+ connection = AirbyteConnection.from_connection_details(
1169
+ connection_details=full_connection_details
1170
+ )
1171
+ connections_by_id[connection.id] = connection
1172
+
1173
+ destination_details = client.get_destination_details(
1174
+ destination_id=connection.destination_id
1175
+ )
1176
+ destination = AirbyteDestination.from_destination_details(
1177
+ destination_details=destination_details
1178
+ )
1179
+ destinations_by_id[destination.id] = destination
1180
+
1181
+ return AirbyteWorkspaceData(
1182
+ connections_by_id=connections_by_id,
1183
+ destinations_by_id=destinations_by_id,
1184
+ )
1185
+
1186
+ @cached_method
1187
+ def load_asset_specs(
1188
+ self,
1189
+ dagster_airbyte_translator: Optional[DagsterAirbyteTranslator] = None,
1190
+ ) -> Sequence[AssetSpec]:
1191
+ """Returns a list of AssetSpecs representing the Airbyte content in the workspace.
1192
+
1193
+ Args:
1194
+ dagster_airbyte_translator (Optional[DagsterAirbyteTranslator], optional): The translator to use
1195
+ to convert Airbyte content into :py:class:`dagster.AssetSpec`.
1196
+ Defaults to :py:class:`DagsterAirbyteTranslator`.
1197
+
1198
+ Returns:
1199
+ List[AssetSpec]: The set of assets representing the Airbyte content in the workspace.
1200
+
1201
+ Examples:
1202
+ Loading the asset specs for a given Airbyte workspace:
1203
+ .. code-block:: python
1204
+
1205
+ from dagster_airbyte import AirbyteCloudWorkspace
1206
+
1207
+ import dagster as dg
1208
+
1209
+ airbyte_workspace = AirbyteCloudWorkspace(
1210
+ workspace_id=dg.EnvVar("AIRBYTE_CLOUD_WORKSPACE_ID"),
1211
+ client_id=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_ID"),
1212
+ client_secret=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_SECRET"),
1213
+ )
1214
+
1215
+ airbyte_specs = airbyte_workspace.load_asset_specs()
1216
+ defs = dg.Definitions(assets=airbyte_specs, resources={"airbyte": airbyte_workspace}
1217
+ """
1218
+ dagster_airbyte_translator = dagster_airbyte_translator or DagsterAirbyteTranslator()
1219
+
1220
+ return load_airbyte_cloud_asset_specs(
1221
+ workspace=self, dagster_airbyte_translator=dagster_airbyte_translator
1222
+ )
1223
+
1224
def _generate_materialization(
    self,
    airbyte_output: AirbyteOutput,
    dagster_airbyte_translator: DagsterAirbyteTranslator,
):
    """Yield an ``AssetMaterialization`` for every selected stream of a synced connection.

    Args:
        airbyte_output (AirbyteOutput): Sync output; only its ``connection_details``
            attribute is read, to rebuild the connection model.
        dagster_airbyte_translator (DagsterAirbyteTranslator): Translator used to build
            the asset spec whose key and metadata are attached to each event.

    Yields:
        AssetMaterialization: One event per stream whose ``selected`` flag is truthy.
    """
    synced_connection = AirbyteConnection.from_connection_details(
        connection_details=airbyte_output.connection_details
    )

    for table_stream in synced_connection.streams.values():
        # Streams that are not selected produce no event.
        if not table_stream.selected:
            continue

        table_name = get_airbyte_connection_table_name(
            stream_prefix=synced_connection.stream_prefix,
            stream_name=table_stream.name,
        )
        # Destination type, database and schema are passed as None at this call site.
        table_props = AirbyteConnectionTableProps(
            table_name=table_name,
            stream_prefix=synced_connection.stream_prefix,
            stream_name=table_stream.name,
            json_schema=table_stream.json_schema,
            connection_id=synced_connection.id,
            connection_name=synced_connection.name,
            destination_type=None,
            database=None,
            schema=None,
        )
        table_spec = dagster_airbyte_translator.get_asset_spec(props=table_props)

        yield AssetMaterialization(
            asset_key=table_spec.key,
            description=(
                f"Table generated via Airbyte Cloud sync "
                f"for connection {synced_connection.name}: {table_name}"
            ),
            metadata=table_spec.metadata,
        )
1261
+
1262
@public
@experimental
def sync_and_poll(self, context: AssetExecutionContext):
    """Run a sync for the executing Airbyte Cloud assets and emit the resulting events.

    This method can only be used in the context of an asset execution.

    The connection id is taken from the metadata of the first spec in
    ``context.assets_def``. After the sync, every generated materialization whose
    asset key is part of ``context.selected_asset_keys`` is emitted as a
    ``MaterializeResult``; any other one is logged as unexpected and emitted as a
    plain ``AssetMaterialization``. Selected keys that never showed up are logged
    in a final warning.

    Args:
        context (AssetExecutionContext): The execution context
            from within `@airbyte_assets`.

    Returns:
        Iterator[Union[AssetMaterialization, MaterializeResult]]: An iterator of MaterializeResult
        or AssetMaterialization.
    """
    airbyte_assets_def = context.assets_def
    translator = get_translator_from_airbyte_assets(airbyte_assets_def)
    # Only the first spec is consulted; its metadata must carry a connection id.
    connection_id = next(
        check.not_none(AirbyteMetadataSet.extract(spec.metadata).connection_id)
        for spec in airbyte_assets_def.specs
    )

    airbyte_output = self.get_client().sync_and_poll(connection_id=connection_id)

    reported_keys = set()
    for event in self._generate_materialization(
        airbyte_output=airbyte_output, dagster_airbyte_translator=translator
    ):
        # Tables that were synced but not selected are still surfaced, as runtime
        # AssetMaterialization events, after a warning.
        if event.asset_key not in context.selected_asset_keys:
            context.log.warning(
                f"An unexpected asset was materialized: {event.asset_key}. "
                "Yielding a materialization event."
            )
            yield event
            continue

        # Expected tables are reported as MaterializeResult and remembered.
        yield MaterializeResult(asset_key=event.asset_key, metadata=event.metadata)
        reported_keys.add(event.asset_key)

    missing_keys = context.selected_asset_keys - reported_keys
    if missing_keys:
        context.log.warning(f"Assets were not materialized: {missing_keys}")
1309
+
1310
+
1311
@experimental
def load_airbyte_cloud_asset_specs(
    workspace: AirbyteCloudWorkspace,
    dagster_airbyte_translator: Optional[DagsterAirbyteTranslator] = None,
) -> Sequence[AssetSpec]:
    """Build the list of AssetSpecs describing the Airbyte content of a workspace.

    Each returned spec has the translator merged into its metadata under
    ``DAGSTER_AIRBYTE_TRANSLATOR_METADATA_KEY``.

    Args:
        workspace (AirbyteCloudWorkspace): The Airbyte Cloud workspace to fetch assets from.
        dagster_airbyte_translator (Optional[DagsterAirbyteTranslator], optional): The translator to use
            to convert Airbyte content into :py:class:`dagster.AssetSpec`.
            Defaults to :py:class:`DagsterAirbyteTranslator`.

    Returns:
        List[AssetSpec]: The set of assets representing the Airbyte content in the workspace.

    Examples:
        Loading the asset specs for a given Airbyte Cloud workspace:

        .. code-block:: python

            from dagster_airbyte import AirbyteCloudWorkspace, load_airbyte_cloud_asset_specs

            import dagster as dg

            airbyte_cloud_workspace = AirbyteCloudWorkspace(
                workspace_id=dg.EnvVar("AIRBYTE_CLOUD_WORKSPACE_ID"),
                client_id=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_ID"),
                client_secret=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_SECRET"),
            )


            airbyte_cloud_specs = load_airbyte_cloud_asset_specs(airbyte_cloud_workspace)
            defs = dg.Definitions(assets=airbyte_cloud_specs)
    """
    translator = dagster_airbyte_translator or DagsterAirbyteTranslator()

    with workspace.process_config_and_initialize_cm() as initialized_workspace:
        defs_loader = AirbyteCloudWorkspaceDefsLoader(
            workspace=initialized_workspace,
            translator=translator,
        )
        # Every loaded asset must be an AssetSpec; check.is_list enforces that.
        loaded_specs = check.is_list(defs_loader.build_defs().assets, AssetSpec)

        tagged_specs = []
        for loaded_spec in loaded_specs:
            # A fresh metadata dict is built for each spec.
            tagged_specs.append(
                loaded_spec.merge_attributes(
                    metadata={DAGSTER_AIRBYTE_TRANSLATOR_METADATA_KEY: translator}
                )
            )
        return tagged_specs
1363
+
1364
+
1365
@record
class AirbyteCloudWorkspaceDefsLoader(StateBackedDefinitionsLoader[AirbyteWorkspaceData]):
    """State-backed definitions loader for an Airbyte Cloud workspace.

    The state is the ``AirbyteWorkspaceData`` fetched from the workspace; definitions
    are built by translating each table props entry of that state into an asset spec.
    The state type parameter matches what ``fetch_state`` returns and what
    ``defs_from_state`` accepts.
    """

    # Workspace used to fetch the state and whose id is part of ``defs_key``.
    workspace: AirbyteCloudWorkspace
    # Translator that converts each table props entry into an AssetSpec.
    translator: DagsterAirbyteTranslator

    @property
    def defs_key(self) -> str:
        """Key combining ``AIRBYTE_RECONSTRUCTION_METADATA_KEY_PREFIX`` with the workspace id."""
        return f"{AIRBYTE_RECONSTRUCTION_METADATA_KEY_PREFIX}/{self.workspace.workspace_id}"

    def fetch_state(self) -> AirbyteWorkspaceData:
        """Fetch the workspace data by delegating to the workspace resource."""
        return self.workspace.fetch_airbyte_workspace_data()

    def defs_from_state(self, state: AirbyteWorkspaceData) -> Definitions:
        """Build ``Definitions`` holding one asset spec per table props entry in ``state``.

        Args:
            state (AirbyteWorkspaceData): Workspace data, as returned by ``fetch_state``.

        Returns:
            Definitions: Definitions whose ``assets`` are the translated asset specs.
        """
        all_asset_specs = [
            self.translator.get_asset_spec(props)
            for props in state.to_airbyte_connection_table_props_data()
        ]

        return Definitions(assets=all_asset_specs)