dagster-airbyte 0.25.1__py3-none-any.whl → 0.25.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dagster-airbyte might be problematic. Click here for more details.

@@ -4,27 +4,58 @@ import logging
4
4
  import sys
5
5
  import time
6
6
  from abc import abstractmethod
7
+ from collections.abc import Mapping, Sequence
7
8
  from contextlib import contextmanager
8
9
  from datetime import datetime, timedelta
9
- from typing import Any, Dict, List, Mapping, Optional, cast
10
+ from typing import Any, Optional, cast
10
11
 
11
12
  import requests
12
13
  from dagster import (
14
+ AssetExecutionContext,
15
+ AssetMaterialization,
13
16
  ConfigurableResource,
17
+ Definitions,
14
18
  Failure,
15
19
  InitResourceContext,
20
+ MaterializeResult,
16
21
  _check as check,
17
22
  get_dagster_logger,
18
23
  resource,
19
24
  )
25
+ from dagster._annotations import experimental, public
20
26
  from dagster._config.pythonic_config import infer_schema_from_config_class
27
+ from dagster._core.definitions.asset_spec import AssetSpec
28
+ from dagster._core.definitions.definitions_load_context import StateBackedDefinitionsLoader
21
29
  from dagster._core.definitions.resource_definition import dagster_maintained_resource
30
+ from dagster._model import DagsterModel
31
+ from dagster._record import record
22
32
  from dagster._utils.cached_method import cached_method
23
33
  from dagster._utils.merger import deep_merge_dicts
24
34
  from pydantic import Field, PrivateAttr
25
35
  from requests.exceptions import RequestException
26
36
 
37
+ from dagster_airbyte.translator import (
38
+ AirbyteConnection,
39
+ AirbyteConnectionTableProps,
40
+ AirbyteDestination,
41
+ AirbyteJob,
42
+ AirbyteJobStatusType,
43
+ AirbyteMetadataSet,
44
+ AirbyteWorkspaceData,
45
+ DagsterAirbyteTranslator,
46
+ )
27
47
  from dagster_airbyte.types import AirbyteOutput
48
+ from dagster_airbyte.utils import (
49
+ DAGSTER_AIRBYTE_TRANSLATOR_METADATA_KEY,
50
+ get_airbyte_connection_table_name,
51
+ get_translator_from_airbyte_assets,
52
+ )
53
+
54
+ AIRBYTE_REST_API_BASE = "https://api.airbyte.com"
55
+ AIRBYTE_REST_API_VERSION = "v1"
56
+
57
+ AIRBYTE_CONFIGURATION_API_BASE = "https://cloud.airbyte.com/api"
58
+ AIRBYTE_CONFIGURATION_API_VERSION = "v1"
28
59
 
29
60
  DEFAULT_POLL_INTERVAL_SECONDS = 10
30
61
 
@@ -32,20 +63,12 @@ DEFAULT_POLL_INTERVAL_SECONDS = 10
32
63
  # Refresh is needed after 2.5 minutes to avoid the "token expired" error message.
33
64
  AIRBYTE_CLOUD_REFRESH_TIMEDELTA_SECONDS = 150
34
65
 
35
-
36
- class AirbyteState:
37
- RUNNING = "running"
38
- SUCCEEDED = "succeeded"
39
- CANCELLED = "cancelled"
40
- PENDING = "pending"
41
- FAILED = "failed"
42
- ERROR = "error"
43
- INCOMPLETE = "incomplete"
66
+ AIRBYTE_RECONSTRUCTION_METADATA_KEY_PREFIX = "dagster-airbyte/reconstruction_metadata"
44
67
 
45
68
 
46
69
  class AirbyteResourceState:
47
70
  def __init__(self) -> None:
48
- self.request_cache: Dict[str, Optional[Mapping[str, object]]] = {}
71
+ self.request_cache: dict[str, Optional[Mapping[str, object]]] = {}
49
72
  # Int in case we nest contexts
50
73
  self.cache_enabled = 0
51
74
 
@@ -121,7 +144,7 @@ class BaseAirbyteResource(ConfigurableResource):
121
144
  num_retries = 0
122
145
  while True:
123
146
  try:
124
- request_args: Dict[str, Any] = dict(
147
+ request_args: dict[str, Any] = dict(
125
148
  method=method,
126
149
  url=url,
127
150
  headers=headers,
@@ -194,7 +217,7 @@ class BaseAirbyteResource(ConfigurableResource):
194
217
  """
195
218
  connection_details = self.get_connection_details(connection_id)
196
219
  job_details = self.start_sync(connection_id)
197
- job_info = cast(Dict[str, object], job_details.get("job", {}))
220
+ job_info = cast(dict[str, object], job_details.get("job", {}))
198
221
  job_id = cast(int, job_info.get("id"))
199
222
 
200
223
  self._log.info(f"Job {job_id} initialized for connection_id={connection_id}.")
@@ -212,7 +235,7 @@ class BaseAirbyteResource(ConfigurableResource):
212
235
  )
213
236
  time.sleep(poll_interval or self.poll_interval)
214
237
  job_details = self.get_job_status(connection_id, job_id)
215
- attempts = cast(List, job_details.get("attempts", []))
238
+ attempts = cast(list, job_details.get("attempts", []))
216
239
  cur_attempt = len(attempts)
217
240
  # spit out the available Airbyte log info
218
241
  if cur_attempt:
@@ -229,16 +252,20 @@ class BaseAirbyteResource(ConfigurableResource):
229
252
  logged_lines = 0
230
253
  logged_attempts += 1
231
254
 
232
- job_info = cast(Dict[str, object], job_details.get("job", {}))
255
+ job_info = cast(dict[str, object], job_details.get("job", {}))
233
256
  state = job_info.get("status")
234
257
 
235
- if state in (AirbyteState.RUNNING, AirbyteState.PENDING, AirbyteState.INCOMPLETE):
258
+ if state in (
259
+ AirbyteJobStatusType.RUNNING,
260
+ AirbyteJobStatusType.PENDING,
261
+ AirbyteJobStatusType.INCOMPLETE,
262
+ ):
236
263
  continue
237
- elif state == AirbyteState.SUCCEEDED:
264
+ elif state == AirbyteJobStatusType.SUCCEEDED:
238
265
  break
239
- elif state == AirbyteState.ERROR:
266
+ elif state == AirbyteJobStatusType.ERROR:
240
267
  raise Failure(f"Job failed: {job_id}")
241
- elif state == AirbyteState.CANCELLED:
268
+ elif state == AirbyteJobStatusType.CANCELLED:
242
269
  raise Failure(f"Job was cancelled: {job_id}")
243
270
  else:
244
271
  raise Failure(f"Encountered unexpected state `{state}` for job_id {job_id}")
@@ -246,7 +273,12 @@ class BaseAirbyteResource(ConfigurableResource):
246
273
  # if Airbyte sync has not completed, make sure to cancel it so that it doesn't outlive
247
274
  # the python process
248
275
  if (
249
- state not in (AirbyteState.SUCCEEDED, AirbyteState.ERROR, AirbyteState.CANCELLED)
276
+ state
277
+ not in (
278
+ AirbyteJobStatusType.SUCCEEDED,
279
+ AirbyteJobStatusType.ERROR,
280
+ AirbyteJobStatusType.CANCELLED,
281
+ )
250
282
  and self.cancel_sync_on_run_termination
251
283
  ):
252
284
  self.cancel_job(job_id)
@@ -539,7 +571,7 @@ class AirbyteResource(BaseAirbyteResource):
539
571
 
540
572
  def get_default_workspace(self) -> str:
541
573
  workspaces = cast(
542
- List[Dict[str, Any]],
574
+ list[dict[str, Any]],
543
575
  check.not_none(self.make_request_cached(endpoint="/workspaces/list", data={})).get(
544
576
  "workspaces", []
545
577
  ),
@@ -551,7 +583,7 @@ class AirbyteResource(BaseAirbyteResource):
551
583
  definitions = check.not_none(
552
584
  self.make_request_cached(endpoint="/source_definitions/list", data={})
553
585
  )
554
- source_definitions = cast(List[Dict[str, Any]], definitions["sourceDefinitions"])
586
+ source_definitions = cast(list[dict[str, Any]], definitions["sourceDefinitions"])
555
587
 
556
588
  return next(
557
589
  (
@@ -565,7 +597,7 @@ class AirbyteResource(BaseAirbyteResource):
565
597
  def get_destination_definition_by_name(self, name: str):
566
598
  name_lower = name.lower()
567
599
  definitions = cast(
568
- Dict[str, List[Dict[str, str]]],
600
+ dict[str, list[dict[str, str]]],
569
601
  check.not_none(
570
602
  self.make_request_cached(endpoint="/destination_definitions/list", data={})
571
603
  ),
@@ -581,7 +613,7 @@ class AirbyteResource(BaseAirbyteResource):
581
613
 
582
614
  def get_source_catalog_id(self, source_id: str):
583
615
  result = cast(
584
- Dict[str, Any],
616
+ dict[str, Any],
585
617
  check.not_none(
586
618
  self.make_request(endpoint="/sources/discover_schema", data={"sourceId": source_id})
587
619
  ),
@@ -590,7 +622,7 @@ class AirbyteResource(BaseAirbyteResource):
590
622
 
591
623
  def get_source_schema(self, source_id: str) -> Mapping[str, Any]:
592
624
  return cast(
593
- Dict[str, Any],
625
+ dict[str, Any],
594
626
  check.not_none(
595
627
  self.make_request(endpoint="/sources/discover_schema", data={"sourceId": source_id})
596
628
  ),
@@ -602,7 +634,7 @@ class AirbyteResource(BaseAirbyteResource):
602
634
  # Airbyte API changed source of truth for normalization in PR
603
635
  # https://github.com/airbytehq/airbyte/pull/21005
604
636
  norm_dest_def_spec: bool = cast(
605
- Dict[str, Any],
637
+ dict[str, Any],
606
638
  check.not_none(
607
639
  self.make_request_cached(
608
640
  endpoint="/destination_definition_specifications/get",
@@ -616,7 +648,7 @@ class AirbyteResource(BaseAirbyteResource):
616
648
 
617
649
  norm_dest_def: bool = (
618
650
  cast(
619
- Dict[str, Any],
651
+ dict[str, Any],
620
652
  check.not_none(
621
653
  self.make_request_cached(
622
654
  endpoint="/destination_definitions/get",
@@ -649,7 +681,7 @@ class AirbyteResource(BaseAirbyteResource):
649
681
  },
650
682
  )
651
683
  )
652
- job = next((job for job in cast(List, out["jobs"]) if job["job"]["id"] == job_id), None)
684
+ job = next((job for job in cast(list, out["jobs"]) if job["job"]["id"] == job_id), None)
653
685
 
654
686
  return check.not_none(job)
655
687
 
@@ -684,7 +716,7 @@ class AirbyteResource(BaseAirbyteResource):
684
716
  """
685
717
  connection_details = self.get_connection_details(connection_id)
686
718
  job_details = self.start_sync(connection_id)
687
- job_info = cast(Dict[str, object], job_details.get("job", {}))
719
+ job_info = cast(dict[str, object], job_details.get("job", {}))
688
720
  job_id = cast(int, job_info.get("id"))
689
721
 
690
722
  self._log.info(f"Job {job_id} initialized for connection_id={connection_id}.")
@@ -702,7 +734,7 @@ class AirbyteResource(BaseAirbyteResource):
702
734
  )
703
735
  time.sleep(poll_interval or self.poll_interval)
704
736
  job_details = self.get_job_status(connection_id, job_id)
705
- attempts = cast(List, job_details.get("attempts", []))
737
+ attempts = cast(list, job_details.get("attempts", []))
706
738
  cur_attempt = len(attempts)
707
739
  # spit out the available Airbyte log info
708
740
  if cur_attempt:
@@ -719,16 +751,20 @@ class AirbyteResource(BaseAirbyteResource):
719
751
  logged_lines = 0
720
752
  logged_attempts += 1
721
753
 
722
- job_info = cast(Dict[str, object], job_details.get("job", {}))
754
+ job_info = cast(dict[str, object], job_details.get("job", {}))
723
755
  state = job_info.get("status")
724
756
 
725
- if state in (AirbyteState.RUNNING, AirbyteState.PENDING, AirbyteState.INCOMPLETE):
757
+ if state in (
758
+ AirbyteJobStatusType.RUNNING,
759
+ AirbyteJobStatusType.PENDING,
760
+ AirbyteJobStatusType.INCOMPLETE,
761
+ ):
726
762
  continue
727
- elif state == AirbyteState.SUCCEEDED:
763
+ elif state == AirbyteJobStatusType.SUCCEEDED:
728
764
  break
729
- elif state == AirbyteState.ERROR:
765
+ elif state == AirbyteJobStatusType.ERROR:
730
766
  raise Failure(f"Job failed: {job_id}")
731
- elif state == AirbyteState.CANCELLED:
767
+ elif state == AirbyteJobStatusType.CANCELLED:
732
768
  raise Failure(f"Job was cancelled: {job_id}")
733
769
  else:
734
770
  raise Failure(f"Encountered unexpected state `{state}` for job_id {job_id}")
@@ -736,7 +772,12 @@ class AirbyteResource(BaseAirbyteResource):
736
772
  # if Airbyte sync has not completed, make sure to cancel it so that it doesn't outlive
737
773
  # the python process
738
774
  if (
739
- state not in (AirbyteState.SUCCEEDED, AirbyteState.ERROR, AirbyteState.CANCELLED)
775
+ state
776
+ not in (
777
+ AirbyteJobStatusType.SUCCEEDED,
778
+ AirbyteJobStatusType.ERROR,
779
+ AirbyteJobStatusType.CANCELLED,
780
+ )
740
781
  and self.cancel_sync_on_run_termination
741
782
  ):
742
783
  self.cancel_job(job_id)
@@ -791,3 +832,552 @@ def airbyte_cloud_resource(context) -> AirbyteCloudResource:
791
832
 
792
833
  """
793
834
  return AirbyteCloudResource.from_resource_context(context)
835
+
836
+
837
+ # -------------
838
+ # Resources v2
839
+ # -------------
840
+
841
+
842
+ @experimental
843
+ class AirbyteCloudClient(DagsterModel):
844
+ """This class exposes methods on top of the Airbyte APIs for Airbyte Cloud."""
845
+
846
+ workspace_id: str = Field(..., description="The Airbyte workspace ID")
847
+ client_id: str = Field(..., description="The Airbyte client ID.")
848
+ client_secret: str = Field(..., description="The Airbyte client secret.")
849
+ request_max_retries: int = Field(
850
+ ...,
851
+ description=(
852
+ "The maximum number of times requests to the Airbyte API should be retried "
853
+ "before failing."
854
+ ),
855
+ )
856
+ request_retry_delay: float = Field(
857
+ ...,
858
+ description="Time (in seconds) to wait between each request retry.",
859
+ )
860
+ request_timeout: int = Field(
861
+ ...,
862
+ description="Time (in seconds) after which the requests to Airbyte are declared timed out.",
863
+ )
864
+
865
+ _access_token_value: Optional[str] = PrivateAttr(default=None)
866
+ _access_token_timestamp: Optional[float] = PrivateAttr(default=None)
867
+
868
+ @property
869
+ @cached_method
870
+ def _log(self) -> logging.Logger:
871
+ return get_dagster_logger()
872
+
873
+ @property
874
+ def rest_api_base_url(self) -> str:
875
+ return f"{AIRBYTE_REST_API_BASE}/{AIRBYTE_REST_API_VERSION}"
876
+
877
+ @property
878
+ def configuration_api_base_url(self) -> str:
879
+ return f"{AIRBYTE_CONFIGURATION_API_BASE}/{AIRBYTE_CONFIGURATION_API_VERSION}"
880
+
881
+ @property
882
+ def all_additional_request_params(self) -> Mapping[str, Any]:
883
+ return {**self.authorization_request_params, **self.user_agent_request_params}
884
+
885
+ @property
886
+ def authorization_request_params(self) -> Mapping[str, Any]:
887
+ # Make sure the access token is refreshed before using it when calling the API.
888
+ if self._needs_refreshed_access_token():
889
+ self._refresh_access_token()
890
+ return {
891
+ "Authorization": f"Bearer {self._access_token_value}",
892
+ }
893
+
894
+ @property
895
+ def user_agent_request_params(self) -> Mapping[str, Any]:
896
+ return {
897
+ "User-Agent": "dagster",
898
+ }
899
+
900
+ def _refresh_access_token(self) -> None:
901
+ response = check.not_none(
902
+ self._make_request(
903
+ method="POST",
904
+ endpoint="applications/token",
905
+ base_url=self.rest_api_base_url,
906
+ data={
907
+ "client_id": self.client_id,
908
+ "client_secret": self.client_secret,
909
+ },
910
+ # Must not pass the bearer access token when refreshing it.
911
+ include_additional_request_params=False,
912
+ )
913
+ )
914
+ self._access_token_value = str(response["access_token"])
915
+ self._access_token_timestamp = datetime.now().timestamp()
916
+
917
+ def _needs_refreshed_access_token(self) -> bool:
918
+ return (
919
+ not self._access_token_value
920
+ or not self._access_token_timestamp
921
+ or self._access_token_timestamp
922
+ <= (
923
+ datetime.now() - timedelta(seconds=AIRBYTE_CLOUD_REFRESH_TIMEDELTA_SECONDS)
924
+ ).timestamp()
925
+ )
926
+
927
+ def _get_session(self, include_additional_request_params: bool) -> requests.Session:
928
+ headers = {"accept": "application/json"}
929
+ if include_additional_request_params:
930
+ headers = {
931
+ **headers,
932
+ **self.all_additional_request_params,
933
+ }
934
+ session = requests.Session()
935
+ session.headers.update(headers)
936
+ return session
937
+
938
+ def _make_request(
939
+ self,
940
+ method: str,
941
+ endpoint: str,
942
+ base_url: str,
943
+ data: Optional[Mapping[str, Any]] = None,
944
+ params: Optional[Mapping[str, Any]] = None,
945
+ include_additional_request_params: bool = True,
946
+ ) -> Mapping[str, Any]:
947
+ """Creates and sends a request to the desired Airbyte REST API endpoint.
948
+
949
+ Args:
950
+ method (str): The http method to use for this request (e.g. "POST", "GET", "PATCH").
951
+ endpoint (str): The Airbyte API endpoint to send this request to.
952
+ base_url (str): The base url to the Airbyte API to use.
953
+ data (Optional[Mapping[str, Any]]): JSON-serializable data to be included in the request body.
954
+ params (Optional[Mapping[str, Any]]): Query params to be included in the request.
955
+ include_additional_request_params (bool): Whether to include authorization and user-agent headers
956
+ to the request parameters. Defaults to True.
957
+
958
+ Returns:
959
+ Dict[str, Any]: Parsed json data from the response to this request
960
+ """
961
+ url = f"{base_url}/{endpoint}"
962
+
963
+ num_retries = 0
964
+ while True:
965
+ try:
966
+ session = self._get_session(
967
+ include_additional_request_params=include_additional_request_params
968
+ )
969
+ response = session.request(
970
+ method=method, url=url, json=data, params=params, timeout=self.request_timeout
971
+ )
972
+ response.raise_for_status()
973
+ return response.json()
974
+ except RequestException as e:
975
+ self._log.error(
976
+ f"Request to Airbyte API failed for url {url} with method {method} : {e}"
977
+ )
978
+ if num_retries == self.request_max_retries:
979
+ break
980
+ num_retries += 1
981
+ time.sleep(self.request_retry_delay)
982
+
983
+ raise Failure(f"Max retries ({self.request_max_retries}) exceeded with url: {url}.")
984
+
985
+ def get_connections(self) -> Mapping[str, Any]:
986
+ """Fetches all connections of an Airbyte workspace from the Airbyte REST API."""
987
+ return self._make_request(
988
+ method="GET",
989
+ endpoint="connections",
990
+ base_url=self.rest_api_base_url,
991
+ params={"workspaceIds": self.workspace_id},
992
+ )
993
+
994
+ def get_connection_details(self, connection_id) -> Mapping[str, Any]:
995
+ """Fetches details about a given connection from the Airbyte Configuration API.
996
+ The Airbyte Configuration API is an internal API and may change in the future.
997
+ """
998
+ # Using the Airbyte Configuration API to get the connection details, including streams and their configs.
999
+ # https://airbyte-public-api-docs.s3.us-east-2.amazonaws.com/rapidoc-api-docs.html#post-/v1/connections/get
1000
+ # https://github.com/airbytehq/airbyte-platform/blob/v1.0.0/airbyte-api/server-api/src/main/openapi/config.yaml
1001
+ return self._make_request(
1002
+ method="POST",
1003
+ endpoint="connections/get",
1004
+ base_url=self.configuration_api_base_url,
1005
+ data={"connectionId": connection_id},
1006
+ )
1007
+
1008
+ def get_destination_details(self, destination_id: str) -> Mapping[str, Any]:
1009
+ """Fetches details about a given destination from the Airbyte REST API."""
1010
+ return self._make_request(
1011
+ method="GET",
1012
+ endpoint=f"destinations/{destination_id}",
1013
+ base_url=self.rest_api_base_url,
1014
+ )
1015
+
1016
+ def start_sync_job(self, connection_id: str) -> Mapping[str, Any]:
1017
+ return self._make_request(
1018
+ method="POST",
1019
+ endpoint="jobs",
1020
+ base_url=self.rest_api_base_url,
1021
+ data={
1022
+ "connectionId": connection_id,
1023
+ "jobType": "sync",
1024
+ },
1025
+ )
1026
+
1027
+ def get_job_details(self, job_id: int) -> Mapping[str, Any]:
1028
+ return self._make_request(
1029
+ method="GET", endpoint=f"jobs/{job_id}", base_url=self.rest_api_base_url
1030
+ )
1031
+
1032
+ def cancel_job(self, job_id: int) -> Mapping[str, Any]:
1033
+ return self._make_request(
1034
+ method="DELETE", endpoint=f"jobs/{job_id}", base_url=self.rest_api_base_url
1035
+ )
1036
+
1037
+ def sync_and_poll(
1038
+ self,
1039
+ connection_id: str,
1040
+ poll_interval: Optional[float] = None,
1041
+ poll_timeout: Optional[float] = None,
1042
+ cancel_on_termination: bool = True,
1043
+ ) -> AirbyteOutput:
1044
+ """Initializes a sync operation for the given connection, and polls until it completes.
1045
+
1046
+ Args:
1047
+ connection_id (str): The Airbyte Connection ID. You can retrieve this value from the
1048
+ "Connection" tab of a given connection in the Airbyte UI.
1049
+ poll_interval (float): The time (in seconds) that will be waited between successive polls.
1050
+ poll_timeout (float): The maximum time (in seconds) to wait before this operation is timed
1051
+ out. By default, this will never time out.
1052
+ cancel_on_termination (bool): Whether to cancel a sync in Airbyte if the Dagster runner is terminated.
1053
+ This may be useful to disable if using Airbyte sources that cannot be cancelled and
1054
+ resumed easily, or if your Dagster deployment may experience runner interruptions
1055
+ that do not impact your Airbyte deployment.
1056
+
1057
+ Returns:
1058
+ :py:class:`~AirbyteOutput`:
1059
+ Details of the sync job.
1060
+ """
1061
+ connection_details = self.get_connection_details(connection_id)
1062
+ start_job_details = self.start_sync_job(connection_id)
1063
+ job = AirbyteJob.from_job_details(job_details=start_job_details)
1064
+
1065
+ self._log.info(f"Job {job.id} initialized for connection_id={connection_id}.")
1066
+ poll_start = datetime.now()
1067
+ poll_interval = (
1068
+ poll_interval if poll_interval is not None else DEFAULT_POLL_INTERVAL_SECONDS
1069
+ )
1070
+ try:
1071
+ while True:
1072
+ if poll_timeout and datetime.now() > poll_start + timedelta(seconds=poll_timeout):
1073
+ raise Failure(
1074
+ f"Timeout: Airbyte job {job.id} is not ready after the timeout"
1075
+ f" {poll_timeout} seconds"
1076
+ )
1077
+
1078
+ time.sleep(poll_interval)
1079
+ # We return these job details in the AirbyteOutput when the job succeeds
1080
+ poll_job_details = self.get_job_details(job.id)
1081
+ job = AirbyteJob.from_job_details(job_details=poll_job_details)
1082
+ if job.status in (
1083
+ AirbyteJobStatusType.RUNNING,
1084
+ AirbyteJobStatusType.PENDING,
1085
+ AirbyteJobStatusType.INCOMPLETE,
1086
+ ):
1087
+ continue
1088
+ elif job.status == AirbyteJobStatusType.SUCCEEDED:
1089
+ break
1090
+ elif job.status in [AirbyteJobStatusType.ERROR, AirbyteJobStatusType.FAILED]:
1091
+ raise Failure(f"Job failed: {job.id}")
1092
+ elif job.status == AirbyteJobStatusType.CANCELLED:
1093
+ raise Failure(f"Job was cancelled: {job.id}")
1094
+ else:
1095
+ raise Failure(
1096
+ f"Encountered unexpected state `{job.status}` for job_id {job.id}"
1097
+ )
1098
+ finally:
1099
+ # if Airbyte sync has not completed, make sure to cancel it so that it doesn't outlive
1100
+ # the python process
1101
+ if cancel_on_termination and job.status not in (
1102
+ AirbyteJobStatusType.SUCCEEDED,
1103
+ AirbyteJobStatusType.ERROR,
1104
+ AirbyteJobStatusType.CANCELLED,
1105
+ AirbyteJobStatusType.FAILED,
1106
+ ):
1107
+ self.cancel_job(job.id)
1108
+
1109
+ return AirbyteOutput(job_details=poll_job_details, connection_details=connection_details)
1110
+
1111
+
1112
+ @experimental
1113
+ class AirbyteCloudWorkspace(ConfigurableResource):
1114
+ """This class represents an Airbyte Cloud workspace and provides utilities
1115
+ to interact with Airbyte APIs.
1116
+ """
1117
+
1118
+ workspace_id: str = Field(..., description="The Airbyte Cloud workspace ID")
1119
+ client_id: str = Field(..., description="The Airbyte Cloud client ID.")
1120
+ client_secret: str = Field(..., description="The Airbyte Cloud client secret.")
1121
+ request_max_retries: int = Field(
1122
+ default=3,
1123
+ description=(
1124
+ "The maximum number of times requests to the Airbyte API should be retried "
1125
+ "before failing."
1126
+ ),
1127
+ )
1128
+ request_retry_delay: float = Field(
1129
+ default=0.25,
1130
+ description="Time (in seconds) to wait between each request retry.",
1131
+ )
1132
+ request_timeout: int = Field(
1133
+ default=15,
1134
+ description="Time (in seconds) after which the requests to Airbyte are declared timed out.",
1135
+ )
1136
+
1137
+ _client: AirbyteCloudClient = PrivateAttr(default=None) # type: ignore
1138
+
1139
+ @cached_method
1140
+ def get_client(self) -> AirbyteCloudClient:
1141
+ return AirbyteCloudClient(
1142
+ workspace_id=self.workspace_id,
1143
+ client_id=self.client_id,
1144
+ client_secret=self.client_secret,
1145
+ request_max_retries=self.request_max_retries,
1146
+ request_retry_delay=self.request_retry_delay,
1147
+ request_timeout=self.request_timeout,
1148
+ )
1149
+
1150
+ def fetch_airbyte_workspace_data(
1151
+ self,
1152
+ ) -> AirbyteWorkspaceData:
1153
+ """Retrieves all Airbyte content from the workspace and returns it as an AirbyteWorkspaceData object.
1154
+
1155
+ Returns:
1156
+ AirbyteWorkspaceData: A snapshot of the Airbyte workspace's content.
1157
+ """
1158
+ connections_by_id = {}
1159
+ destinations_by_id = {}
1160
+
1161
+ client = self.get_client()
1162
+ connections = client.get_connections()["data"]
1163
+
1164
+ for partial_connection_details in connections:
1165
+ full_connection_details = client.get_connection_details(
1166
+ connection_id=partial_connection_details["connectionId"]
1167
+ )
1168
+ connection = AirbyteConnection.from_connection_details(
1169
+ connection_details=full_connection_details
1170
+ )
1171
+ connections_by_id[connection.id] = connection
1172
+
1173
+ destination_details = client.get_destination_details(
1174
+ destination_id=connection.destination_id
1175
+ )
1176
+ destination = AirbyteDestination.from_destination_details(
1177
+ destination_details=destination_details
1178
+ )
1179
+ destinations_by_id[destination.id] = destination
1180
+
1181
+ return AirbyteWorkspaceData(
1182
+ connections_by_id=connections_by_id,
1183
+ destinations_by_id=destinations_by_id,
1184
+ )
1185
+
1186
+ @cached_method
1187
+ def load_asset_specs(
1188
+ self,
1189
+ dagster_airbyte_translator: Optional[DagsterAirbyteTranslator] = None,
1190
+ ) -> Sequence[AssetSpec]:
1191
+ """Returns a list of AssetSpecs representing the Airbyte content in the workspace.
1192
+
1193
+ Args:
1194
+ dagster_airbyte_translator (Optional[DagsterAirbyteTranslator], optional): The translator to use
1195
+ to convert Airbyte content into :py:class:`dagster.AssetSpec`.
1196
+ Defaults to :py:class:`DagsterAirbyteTranslator`.
1197
+
1198
+ Returns:
1199
+ List[AssetSpec]: The set of assets representing the Airbyte content in the workspace.
1200
+
1201
+ Examples:
1202
+ Loading the asset specs for a given Airbyte workspace:
1203
+ .. code-block:: python
1204
+
1205
+ from dagster_airbyte import AirbyteCloudWorkspace
1206
+
1207
+ import dagster as dg
1208
+
1209
+ airbyte_workspace = AirbyteCloudWorkspace(
1210
+ workspace_id=dg.EnvVar("AIRBYTE_CLOUD_WORKSPACE_ID"),
1211
+ client_id=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_ID"),
1212
+ client_secret=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_SECRET"),
1213
+ )
1214
+
1215
+ airbyte_specs = airbyte_workspace.load_asset_specs()
1216
+ defs = dg.Definitions(assets=airbyte_specs, resources={"airbyte": airbyte_workspace})
1217
+ """
1218
+ dagster_airbyte_translator = dagster_airbyte_translator or DagsterAirbyteTranslator()
1219
+
1220
+ return load_airbyte_cloud_asset_specs(
1221
+ workspace=self, dagster_airbyte_translator=dagster_airbyte_translator
1222
+ )
1223
+
1224
+ def _generate_materialization(
1225
+ self,
1226
+ airbyte_output: AirbyteOutput,
1227
+ dagster_airbyte_translator: DagsterAirbyteTranslator,
1228
+ ):
1229
+ connection = AirbyteConnection.from_connection_details(
1230
+ connection_details=airbyte_output.connection_details
1231
+ )
1232
+
1233
+ for stream in connection.streams.values():
1234
+ if stream.selected:
1235
+ connection_table_name = get_airbyte_connection_table_name(
1236
+ stream_prefix=connection.stream_prefix,
1237
+ stream_name=stream.name,
1238
+ )
1239
+ stream_asset_spec = dagster_airbyte_translator.get_asset_spec(
1240
+ props=AirbyteConnectionTableProps(
1241
+ table_name=connection_table_name,
1242
+ stream_prefix=connection.stream_prefix,
1243
+ stream_name=stream.name,
1244
+ json_schema=stream.json_schema,
1245
+ connection_id=connection.id,
1246
+ connection_name=connection.name,
1247
+ destination_type=None,
1248
+ database=None,
1249
+ schema=None,
1250
+ )
1251
+ )
1252
+
1253
+ yield AssetMaterialization(
1254
+ asset_key=stream_asset_spec.key,
1255
+ description=(
1256
+ f"Table generated via Airbyte Cloud sync "
1257
+ f"for connection {connection.name}: {connection_table_name}"
1258
+ ),
1259
+ metadata=stream_asset_spec.metadata,
1260
+ )
1261
+
1262
+ @public
1263
+ @experimental
1264
+ def sync_and_poll(self, context: AssetExecutionContext):
1265
+ """Executes a sync and poll process to materialize Airbyte Cloud assets.
1266
+ This method can only be used in the context of an asset execution.
1267
+
1268
+ Args:
1269
+ context (AssetExecutionContext): The execution context
1270
+ from within `@airbyte_assets`.
1271
+
1272
+ Returns:
1273
+ Iterator[Union[AssetMaterialization, MaterializeResult]]: An iterator of MaterializeResult
1274
+ or AssetMaterialization.
1275
+ """
1276
+ assets_def = context.assets_def
1277
+ dagster_airbyte_translator = get_translator_from_airbyte_assets(assets_def)
1278
+ connection_id = next(
1279
+ check.not_none(AirbyteMetadataSet.extract(spec.metadata).connection_id)
1280
+ for spec in assets_def.specs
1281
+ )
1282
+
1283
+ client = self.get_client()
1284
+ airbyte_output = client.sync_and_poll(
1285
+ connection_id=connection_id,
1286
+ )
1287
+
1288
+ materialized_asset_keys = set()
1289
+ for materialization in self._generate_materialization(
1290
+ airbyte_output=airbyte_output, dagster_airbyte_translator=dagster_airbyte_translator
1291
+ ):
1292
+ # Scan through all tables actually created, if it was expected then emit a MaterializeResult.
1293
+ # Otherwise, emit a runtime AssetMaterialization.
1294
+ if materialization.asset_key in context.selected_asset_keys:
1295
+ yield MaterializeResult(
1296
+ asset_key=materialization.asset_key, metadata=materialization.metadata
1297
+ )
1298
+ materialized_asset_keys.add(materialization.asset_key)
1299
+ else:
1300
+ context.log.warning(
1301
+ f"An unexpected asset was materialized: {materialization.asset_key}. "
1302
+ f"Yielding a materialization event."
1303
+ )
1304
+ yield materialization
1305
+
1306
+ unmaterialized_asset_keys = context.selected_asset_keys - materialized_asset_keys
1307
+ if unmaterialized_asset_keys:
1308
+ context.log.warning(f"Assets were not materialized: {unmaterialized_asset_keys}")
1309
+
1310
+
1311
+ @experimental
1312
+ def load_airbyte_cloud_asset_specs(
1313
+ workspace: AirbyteCloudWorkspace,
1314
+ dagster_airbyte_translator: Optional[DagsterAirbyteTranslator] = None,
1315
+ ) -> Sequence[AssetSpec]:
1316
+ """Returns a list of AssetSpecs representing the Airbyte content in the workspace.
1317
+
1318
+ Args:
1319
+ workspace (AirbyteCloudWorkspace): The Airbyte Cloud workspace to fetch assets from.
1320
+ dagster_airbyte_translator (Optional[DagsterAirbyteTranslator], optional): The translator to use
1321
+ to convert Airbyte content into :py:class:`dagster.AssetSpec`.
1322
+ Defaults to :py:class:`DagsterAirbyteTranslator`.
1323
+
1324
+ Returns:
1325
+ List[AssetSpec]: The set of assets representing the Airbyte content in the workspace.
1326
+
1327
+ Examples:
1328
+ Loading the asset specs for a given Airbyte Cloud workspace:
1329
+
1330
+ .. code-block:: python
1331
+
1332
+ from dagster_airbyte import AirbyteCloudWorkspace, load_airbyte_cloud_asset_specs
1333
+
1334
+ import dagster as dg
1335
+
1336
+ airbyte_cloud_workspace = AirbyteCloudWorkspace(
1337
+ workspace_id=dg.EnvVar("AIRBYTE_CLOUD_WORKSPACE_ID"),
1338
+ client_id=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_ID"),
1339
+ client_secret=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_SECRET"),
1340
+ )
1341
+
1342
+
1343
+ airbyte_cloud_specs = load_airbyte_cloud_asset_specs(airbyte_cloud_workspace)
1344
+ defs = dg.Definitions(assets=airbyte_cloud_specs)
1345
+ """
1346
+ dagster_airbyte_translator = dagster_airbyte_translator or DagsterAirbyteTranslator()
1347
+
1348
+ with workspace.process_config_and_initialize_cm() as initialized_workspace:
1349
+ return [
1350
+ spec.merge_attributes(
1351
+ metadata={DAGSTER_AIRBYTE_TRANSLATOR_METADATA_KEY: dagster_airbyte_translator}
1352
+ )
1353
+ for spec in check.is_list(
1354
+ AirbyteCloudWorkspaceDefsLoader(
1355
+ workspace=initialized_workspace,
1356
+ translator=dagster_airbyte_translator,
1357
+ )
1358
+ .build_defs()
1359
+ .assets,
1360
+ AssetSpec,
1361
+ )
1362
+ ]
1363
+
1364
+
1365
+ @record
1366
+ class AirbyteCloudWorkspaceDefsLoader(StateBackedDefinitionsLoader[Mapping[str, Any]]):
1367
+ workspace: AirbyteCloudWorkspace
1368
+ translator: DagsterAirbyteTranslator
1369
+
1370
+ @property
1371
+ def defs_key(self) -> str:
1372
+ return f"{AIRBYTE_RECONSTRUCTION_METADATA_KEY_PREFIX}/{self.workspace.workspace_id}"
1373
+
1374
+ def fetch_state(self) -> AirbyteWorkspaceData:
1375
+ return self.workspace.fetch_airbyte_workspace_data()
1376
+
1377
+ def defs_from_state(self, state: AirbyteWorkspaceData) -> Definitions:
1378
+ all_asset_specs = [
1379
+ self.translator.get_asset_spec(props)
1380
+ for props in state.to_airbyte_connection_table_props_data()
1381
+ ]
1382
+
1383
+ return Definitions(assets=all_asset_specs)