dagster-airbyte 0.26.18__py3-none-any.whl → 0.28.3__py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to a supported public registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in their respective public registries.
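The central change in this version range is the removal of the legacy resources (AirbyteResource, AirbyteCloudResource, airbyte_resource, airbyte_cloud_resource) in favor of a single AirbyteWorkspace resource backed by the new AirbyteClient, which supports Airbyte Cloud as well as self-managed deployments via OAuth client credentials, basic auth, or no auth. A minimal sketch of the new entry point, adapted from the docstrings added in the diff below (the environment variable names are illustrative, not required):

    import dagster as dg
    from dagster_airbyte import AirbyteWorkspace, build_airbyte_assets_definitions

    # OAuth client credentials, as used for Airbyte Cloud. Self-managed servers
    # can instead pass username/password, or omit credentials entirely.
    airbyte_workspace = AirbyteWorkspace(
        rest_api_base_url=dg.EnvVar("AIRBYTE_REST_API_BASE_URL"),
        configuration_api_base_url=dg.EnvVar("AIRBYTE_CONFIGURATION_API_BASE_URL"),
        workspace_id=dg.EnvVar("AIRBYTE_WORKSPACE_ID"),
        client_id=dg.EnvVar("AIRBYTE_CLIENT_ID"),
        client_secret=dg.EnvVar("AIRBYTE_CLIENT_SECRET"),
    )

    # Builds one assets definition per Airbyte connection in the workspace.
    all_airbyte_assets = build_airbyte_assets_definitions(workspace=airbyte_workspace)

    defs = dg.Definitions(
        assets=all_airbyte_assets,
        resources={"airbyte": airbyte_workspace},
    )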
@@ -1,37 +1,30 @@
- import hashlib
- import json
  import logging
- import sys
  import time
- from abc import abstractmethod
- from collections.abc import Mapping, Sequence
+ from collections.abc import Callable, Iterator, Mapping, Sequence
  from contextlib import contextmanager
  from datetime import datetime, timedelta
- from typing import Any, Optional, cast
+ from typing import Any, ClassVar, Optional, Union
+ from urllib.parse import parse_qsl, urlparse

  import requests
  from dagster import (
      AssetExecutionContext,
      AssetMaterialization,
+     AssetSpec,
      ConfigurableResource,
      Definitions,
      Failure,
-     InitResourceContext,
      MaterializeResult,
      _check as check,
      get_dagster_logger,
-     resource,
  )
- from dagster._annotations import beta, public, superseded
- from dagster._config.pythonic_config import infer_schema_from_config_class
- from dagster._core.definitions.asset_spec import AssetSpec
+ from dagster._annotations import superseded
  from dagster._core.definitions.definitions_load_context import StateBackedDefinitionsLoader
- from dagster._core.definitions.resource_definition import dagster_maintained_resource
- from dagster._record import record
- from dagster._utils.cached_method import cached_method
- from dagster._utils.merger import deep_merge_dicts
+ from dagster._symbol_annotations import beta, public
  from dagster_shared.dagster_model import DagsterModel
- from pydantic import Field, PrivateAttr
+ from dagster_shared.record import record
+ from dagster_shared.utils.cached_method import cached_method
+ from pydantic import Field, PrivateAttr, model_validator
  from requests.exceptions import RequestException

  from dagster_airbyte.translator import (
@@ -51,810 +44,55 @@ from dagster_airbyte.utils import (
      get_translator_from_airbyte_assets,
  )

- AIRBYTE_REST_API_BASE = "https://api.airbyte.com"
- AIRBYTE_REST_API_VERSION = "v1"
-
- AIRBYTE_CONFIGURATION_API_BASE = "https://cloud.airbyte.com/api"
- AIRBYTE_CONFIGURATION_API_VERSION = "v1"
+ AIRBYTE_CLOUD_REST_API_BASE = "https://api.airbyte.com"
+ AIRBYTE_CLOUD_REST_API_VERSION = "v1"
+ AIRBYTE_CLOUD_REST_API_BASE_URL = f"{AIRBYTE_CLOUD_REST_API_BASE}/{AIRBYTE_CLOUD_REST_API_VERSION}"
+ AIRBYTE_CLOUD_CONFIGURATION_API_BASE = "https://cloud.airbyte.com/api"
+ AIRBYTE_CLOUD_CONFIGURATION_API_VERSION = "v1"
+ AIRBYTE_CLOUD_CONFIGURATION_API_BASE_URL = (
+     f"{AIRBYTE_CLOUD_CONFIGURATION_API_BASE}/{AIRBYTE_CLOUD_CONFIGURATION_API_VERSION}"
+ )

  DEFAULT_POLL_INTERVAL_SECONDS = 10

  # The access token expires every 3 minutes in Airbyte Cloud.
  # Refresh is needed after 2.5 minutes to avoid the "token expired" error message.
- AIRBYTE_CLOUD_REFRESH_TIMEDELTA_SECONDS = 150
+ AIRBYTE_REFRESH_TIMEDELTA_SECONDS = 150

  AIRBYTE_RECONSTRUCTION_METADATA_KEY_PREFIX = "dagster-airbyte/reconstruction_metadata"


- class AirbyteResourceState:
-     def __init__(self) -> None:
-         self.request_cache: dict[str, Optional[Mapping[str, object]]] = {}
-         # Int in case we nest contexts
-         self.cache_enabled = 0
-
-
- class BaseAirbyteResource(ConfigurableResource):
-     request_max_retries: int = Field(
-         default=3,
-         description=(
-             "The maximum number of times requests to the Airbyte API should be retried "
-             "before failing."
-         ),
-     )
-     request_retry_delay: float = Field(
-         default=0.25,
-         description="Time (in seconds) to wait between each request retry.",
-     )
-     request_timeout: int = Field(
-         default=15,
-         description="Time (in seconds) after which the requests to Airbyte are declared timed out.",
-     )
-     cancel_sync_on_run_termination: bool = Field(
-         default=True,
-         description=(
-             "Whether to cancel a sync in Airbyte if the Dagster runner is terminated. This may"
-             " be useful to disable if using Airbyte sources that cannot be cancelled and"
-             " resumed easily, or if your Dagster deployment may experience runner interruptions"
-             " that do not impact your Airbyte deployment."
-         ),
-     )
-     poll_interval: float = Field(
-         default=DEFAULT_POLL_INTERVAL_SECONDS,
-         description="Time (in seconds) to wait between checking a sync's status.",
-     )
-
-     @classmethod
-     def _is_dagster_maintained(cls) -> bool:
-         return True
-
-     @property
-     @cached_method
-     def _log(self) -> logging.Logger:
-         return get_dagster_logger()
-
-     @property
-     @abstractmethod
-     def api_base_url(self) -> str:
-         raise NotImplementedError()
-
-     @property
-     @abstractmethod
-     def all_additional_request_params(self) -> Mapping[str, Any]:
-         raise NotImplementedError()
-
-     def make_request(
-         self,
-         endpoint: str,
-         data: Optional[Mapping[str, object]] = None,
-         method: str = "POST",
-         include_additional_request_params: bool = True,
-     ) -> Optional[Mapping[str, object]]:
-         """Creates and sends a request to the desired Airbyte REST API endpoint.
-
-         Args:
-             endpoint (str): The Airbyte API endpoint to send this request to.
-             data (Optional[str]): JSON-formatted data string to be included in the request.
-
-         Returns:
-             Optional[Dict[str, Any]]: Parsed json data from the response to this request
-         """
-         url = self.api_base_url + endpoint
-         headers = {"accept": "application/json"}
-
-         num_retries = 0
-         while True:
-             try:
-                 request_args: dict[str, Any] = dict(
-                     method=method,
-                     url=url,
-                     headers=headers,
-                     timeout=self.request_timeout,
-                 )
-                 if data:
-                     request_args["json"] = data
-
-                 if include_additional_request_params:
-                     request_args = deep_merge_dicts(
-                         request_args,
-                         self.all_additional_request_params,
-                     )
-
-                 response = requests.request(
-                     **request_args,
-                 )
-                 response.raise_for_status()
-                 if response.status_code == 204:
-                     return None
-                 return response.json()
-             except RequestException as e:
-                 self._log.error("Request to Airbyte API failed: %s", e)
-                 if num_retries == self.request_max_retries:
-                     break
-                 num_retries += 1
-                 time.sleep(self.request_retry_delay)
-
-         raise Failure(f"Max retries ({self.request_max_retries}) exceeded with url: {url}.")
-
-     @abstractmethod
-     def start_sync(self, connection_id: str) -> Mapping[str, object]:
-         raise NotImplementedError()
-
-     @abstractmethod
-     def get_connection_details(self, connection_id: str) -> Mapping[str, object]:
-         raise NotImplementedError()
-
-     @abstractmethod
-     def get_job_status(self, connection_id: str, job_id: int) -> Mapping[str, object]:
-         raise NotImplementedError()
-
-     @abstractmethod
-     def cancel_job(self, job_id: int):
-         raise NotImplementedError()
-
-     @property
-     @abstractmethod
-     def _should_forward_logs(self) -> bool:
-         raise NotImplementedError()
-
-     def sync_and_poll(
-         self,
-         connection_id: str,
-         poll_interval: Optional[float] = None,
-         poll_timeout: Optional[float] = None,
-     ) -> AirbyteOutput:
-         """Initializes a sync operation for the given connector, and polls until it completes.
-
-         Args:
-             connection_id (str): The Airbyte Connector ID. You can retrieve this value from the
-                 "Connection" tab of a given connection in the Arbyte UI.
-             poll_interval (float): The time (in seconds) that will be waited between successive polls.
-             poll_timeout (float): The maximum time that will waited before this operation is timed
-                 out. By default, this will never time out.
-
-         Returns:
-             :py:class:`~AirbyteOutput`:
-                 Details of the sync job.
-         """
-         connection_details = self.get_connection_details(connection_id)
-         job_details = self.start_sync(connection_id)
-         job_info = cast("dict[str, object]", job_details.get("job", {}))
-         job_id = cast("int", job_info.get("id"))
-
-         self._log.info(f"Job {job_id} initialized for connection_id={connection_id}.")
-         start = time.monotonic()
-         logged_attempts = 0
-         logged_lines = 0
-         state = None
-
-         try:
-             while True:
-                 if poll_timeout and start + poll_timeout < time.monotonic():
-                     raise Failure(
-                         f"Timeout: Airbyte job {job_id} is not ready after the timeout"
-                         f" {poll_timeout} seconds"
-                     )
-                 time.sleep(poll_interval or self.poll_interval)
-                 job_details = self.get_job_status(connection_id, job_id)
-                 attempts = cast("list", job_details.get("attempts", []))
-                 cur_attempt = len(attempts)
-                 # spit out the available Airbyte log info
-                 if cur_attempt:
-                     if self._should_forward_logs:
-                         log_lines = attempts[logged_attempts].get("logs", {}).get("logLines", [])
-
-                         for line in log_lines[logged_lines:]:
-                             sys.stdout.write(line + "\n")
-                             sys.stdout.flush()
-                         logged_lines = len(log_lines)
-
-                     # if there's a next attempt, this one will have no more log messages
-                     if logged_attempts < cur_attempt - 1:
-                         logged_lines = 0
-                         logged_attempts += 1
-
-                 job_info = cast("dict[str, object]", job_details.get("job", {}))
-                 state = job_info.get("status")
-
-                 if state in (
-                     AirbyteJobStatusType.RUNNING,
-                     AirbyteJobStatusType.PENDING,
-                     AirbyteJobStatusType.INCOMPLETE,
-                 ):
-                     continue
-                 elif state == AirbyteJobStatusType.SUCCEEDED:
-                     break
-                 elif state == AirbyteJobStatusType.ERROR:
-                     raise Failure(f"Job failed: {job_id}")
-                 elif state == AirbyteJobStatusType.CANCELLED:
-                     raise Failure(f"Job was cancelled: {job_id}")
-                 else:
-                     raise Failure(f"Encountered unexpected state `{state}` for job_id {job_id}")
-         finally:
-             # if Airbyte sync has not completed, make sure to cancel it so that it doesn't outlive
-             # the python process
-             if (
-                 state
-                 not in (
-                     AirbyteJobStatusType.SUCCEEDED,
-                     AirbyteJobStatusType.ERROR,
-                     AirbyteJobStatusType.CANCELLED,
-                 )
-                 and self.cancel_sync_on_run_termination
-             ):
-                 self.cancel_job(job_id)
-
-         return AirbyteOutput(job_details=job_details, connection_details=connection_details)
-
-
- @superseded(
-     additional_warn_text=(
-         "Using `AirbyteCloudResource` with `build_airbyte_assets`is no longer best practice. "
-         "Use `AirbyteCloudWorkspace` with `build_airbyte_assets_definitions` instead."
-     )
- )
- class AirbyteCloudResource(BaseAirbyteResource):
-     """This resource allows users to programmatically interface with the Airbyte Cloud API to launch
-     syncs and monitor their progress.
-
-     **Examples:**
-
-     .. code-block:: python
-
-         from dagster import job, EnvVar
-         from dagster_airbyte import AirbyteResource
-
-         my_airbyte_resource = AirbyteCloudResource(
-             client_id=EnvVar("AIRBYTE_CLIENT_ID"),
-             client_secret=EnvVar("AIRBYTE_CLIENT_SECRET"),
-         )
-
-         airbyte_assets = build_airbyte_assets(
-             connection_id="87b7fe85-a22c-420e-8d74-b30e7ede77df",
-             destination_tables=["releases", "tags", "teams"],
-         )
-
-         defs = Definitions(
-             assets=[airbyte_assets],
-             resources={"airbyte": my_airbyte_resource},
-         )
-     """
-
-     client_id: str = Field(..., description="The Airbyte Cloud client ID.")
-     client_secret: str = Field(..., description="The Airbyte Cloud client secret.")
-
-     _access_token_value: Optional[str] = PrivateAttr(default=None)
-     _access_token_timestamp: Optional[float] = PrivateAttr(default=None)
-
-     def setup_for_execution(self, context: InitResourceContext) -> None:
-         # Refresh access token when the resource is initialized
-         self._refresh_access_token()
-
-     @property
-     def api_base_url(self) -> str:
-         return "https://api.airbyte.com/v1"
-
-     @property
-     def all_additional_request_params(self) -> Mapping[str, Any]:
-         # Make sure the access token is refreshed before using it when calling the API.
-         if self._needs_refreshed_access_token():
-             self._refresh_access_token()
-         return {
-             "headers": {
-                 "Authorization": f"Bearer {self._access_token_value}",
-                 "User-Agent": "dagster",
-             }
-         }
-
-     def make_request(
-         self,
-         endpoint: str,
-         data: Optional[Mapping[str, object]] = None,
-         method: str = "POST",
-         include_additional_request_params: bool = True,
-     ) -> Optional[Mapping[str, object]]:
-         # Make sure the access token is refreshed before using it when calling the API.
-         if include_additional_request_params and self._needs_refreshed_access_token():
-             self._refresh_access_token()
-         return super().make_request(
-             endpoint=endpoint,
-             data=data,
-             method=method,
-             include_additional_request_params=include_additional_request_params,
-         )
-
-     def start_sync(self, connection_id: str) -> Mapping[str, object]:
-         job_sync = check.not_none(
-             self.make_request(
-                 endpoint="/jobs",
-                 data={
-                     "connectionId": connection_id,
-                     "jobType": "sync",
-                 },
-             )
-         )
-         return {"job": {"id": job_sync["jobId"], "status": job_sync["status"]}}
-
-     def get_connection_details(self, connection_id: str) -> Mapping[str, object]:
-         return {}
-
-     def get_job_status(self, connection_id: str, job_id: int) -> Mapping[str, object]:
-         job_status = check.not_none(self.make_request(endpoint=f"/jobs/{job_id}", method="GET"))
-         return {"job": {"id": job_status["jobId"], "status": job_status["status"]}}
-
-     def cancel_job(self, job_id: int):
-         self.make_request(endpoint=f"/jobs/{job_id}", method="DELETE")
-
-     @property
-     def _should_forward_logs(self) -> bool:
-         # Airbyte Cloud does not support streaming logs yet
-         return False
-
-     def _refresh_access_token(self) -> None:
-         response = check.not_none(
-             self.make_request(
-                 endpoint="/applications/token",
-                 data={
-                     "client_id": self.client_id,
-                     "client_secret": self.client_secret,
-                 },
-                 # Must not pass the bearer access token when refreshing it.
-                 include_additional_request_params=False,
-             )
-         )
-         self._access_token_value = str(response["access_token"])
-         self._access_token_timestamp = datetime.now().timestamp()
-
-     def _needs_refreshed_access_token(self) -> bool:
-         return (
-             not self._access_token_value
-             or not self._access_token_timestamp
-             or self._access_token_timestamp
-             <= datetime.timestamp(
-                 datetime.now() - timedelta(seconds=AIRBYTE_CLOUD_REFRESH_TIMEDELTA_SECONDS)
-             )
-         )
-
-
- class AirbyteResource(BaseAirbyteResource):
-     """This resource allows users to programatically interface with the Airbyte REST API to launch
-     syncs and monitor their progress.
-
-     **Examples:**
-
-     .. code-block:: python
-
-         from dagster import job, EnvVar
-         from dagster_airbyte import AirbyteResource
-
-         my_airbyte_resource = AirbyteResource(
-             host=EnvVar("AIRBYTE_HOST"),
-             port=EnvVar("AIRBYTE_PORT"),
-             # If using basic auth
-             username=EnvVar("AIRBYTE_USERNAME"),
-             password=EnvVar("AIRBYTE_PASSWORD"),
-         )
-
-         airbyte_assets = build_airbyte_assets(
-             connection_id="87b7fe85-a22c-420e-8d74-b30e7ede77df",
-             destination_tables=["releases", "tags", "teams"],
-         )
-
-         defs = Definitions(
-             assets=[airbyte_assets],
-             resources={"airbyte": my_airbyte_resource},
-         )
-     """
+ @beta
+ class AirbyteClient(DagsterModel):
+     """This class exposes methods on top of the Airbyte APIs for Airbyte."""

-     host: str = Field(description="The Airbyte server address.")
-     port: str = Field(description="Port used for the Airbyte server.")
-     username: Optional[str] = Field(default=None, description="Username if using basic auth.")
-     password: Optional[str] = Field(default=None, description="Password if using basic auth.")
-     use_https: bool = Field(
-         default=False, description="Whether to use HTTPS to connect to the Airbyte server."
-     )
-     forward_logs: bool = Field(
-         default=True,
+     rest_api_base_url: str = Field(
+         default=AIRBYTE_CLOUD_REST_API_BASE_URL,
          description=(
-             "Whether to forward Airbyte logs to the compute log, can be expensive for"
-             " long-running syncs."
+             "The base URL for the Airbyte REST API. "
+             "For Airbyte Cloud, leave this as the default. "
+             "For self-managed Airbyte, this is usually <your Airbyte host>/api/public/v1."
+         ),
+     )
+     configuration_api_base_url: str = Field(
+         default=AIRBYTE_CLOUD_CONFIGURATION_API_BASE_URL,
+         description=(
+             "The base URL for the Airbyte Configuration API. "
+             "For Airbyte Cloud, leave this as the default. "
+             "For self-managed Airbyte, this is usually <your Airbyte host>/api/v1."
          ),
      )
-     request_additional_params: Mapping[str, Any] = Field(
-         default=dict(),
-         description=(
-             "Any additional kwargs to pass to the requests library when making requests to Airbyte."
-         ),
-     )
-
-     @property
-     @cached_method
-     def _state(self) -> AirbyteResourceState:
-         return AirbyteResourceState()
-
-     @property
-     @cached_method
-     def _log(self) -> logging.Logger:
-         return get_dagster_logger()
-
-     @property
-     def api_base_url(self) -> str:
-         return (
-             ("https://" if self.use_https else "http://")
-             + (f"{self.host}:{self.port}" if self.port else self.host)
-             + "/api/v1"
-         )
-
-     @property
-     def _should_forward_logs(self) -> bool:
-         return self.forward_logs
-
-     @contextmanager
-     def cache_requests(self):
-         """Context manager that enables caching certain requests to the Airbyte API,
-         cleared when the context is exited.
-         """
-         self.clear_request_cache()
-         self._state.cache_enabled += 1
-         try:
-             yield
-         finally:
-             self.clear_request_cache()
-             self._state.cache_enabled -= 1
-
-     def clear_request_cache(self) -> None:
-         self._state.request_cache = {}
-
-     def make_request_cached(self, endpoint: str, data: Optional[Mapping[str, object]]):
-         if not self._state.cache_enabled > 0:
-             return self.make_request(endpoint, data)
-         data_json = json.dumps(data, sort_keys=True)
-         sha = hashlib.sha1()
-         sha.update(endpoint.encode("utf-8"))
-         sha.update(data_json.encode("utf-8"))
-         digest = sha.hexdigest()
-
-         if digest not in self._state.request_cache:
-             self._state.request_cache[digest] = self.make_request(endpoint, data)
-         return self._state.request_cache[digest]
-
-     @property
-     def all_additional_request_params(self) -> Mapping[str, Any]:
-         auth_param = (
-             {"auth": (self.username, self.password)} if self.username and self.password else {}
-         )
-         return {**auth_param, **self.request_additional_params}
-
-     def make_request(  # pyright: ignore[reportIncompatibleMethodOverride]
-         self, endpoint: str, data: Optional[Mapping[str, object]]
-     ) -> Optional[Mapping[str, object]]:
-         """Creates and sends a request to the desired Airbyte REST API endpoint.
-
-         Args:
-             endpoint (str): The Airbyte API endpoint to send this request to.
-             data (Optional[str]): JSON-formatted data string to be included in the request.
-
-         Returns:
-             Optional[Dict[str, Any]]: Parsed json data from the response to this request
-         """
-         url = self.api_base_url + endpoint
-         headers = {"accept": "application/json"}
-
-         num_retries = 0
-         while True:
-             try:
-                 response = requests.request(
-                     **deep_merge_dicts(  # type: ignore
-                         dict(
-                             method="POST",
-                             url=url,
-                             headers=headers,
-                             json=data,
-                             timeout=self.request_timeout,
-                             auth=(
-                                 (self.username, self.password)
-                                 if self.username and self.password
-                                 else None
-                             ),
-                         ),
-                         self.request_additional_params,
-                     ),
-                 )
-                 response.raise_for_status()
-                 if response.status_code == 204:
-                     return None
-                 return response.json()
-             except RequestException as e:
-                 self._log.error("Request to Airbyte API failed: %s", e)
-                 if num_retries == self.request_max_retries:
-                     break
-                 num_retries += 1
-                 time.sleep(self.request_retry_delay)
-
-         raise Failure(f"Max retries ({self.request_max_retries}) exceeded with url: {url}.")
-
-     def cancel_job(self, job_id: int):
-         self.make_request(endpoint="/jobs/cancel", data={"id": job_id})
-
-     def get_default_workspace(self) -> str:
-         workspaces = cast(
-             "list[dict[str, Any]]",
-             check.not_none(self.make_request_cached(endpoint="/workspaces/list", data={})).get(
-                 "workspaces", []
-             ),
-         )
-         return workspaces[0]["workspaceId"]
-
-     def get_source_definition_by_name(self, name: str) -> Optional[str]:
-         name_lower = name.lower()
-         definitions = check.not_none(
-             self.make_request_cached(endpoint="/source_definitions/list", data={})
-         )
-         source_definitions = cast("list[dict[str, Any]]", definitions["sourceDefinitions"])
-
-         return next(
-             (
-                 definition["sourceDefinitionId"]
-                 for definition in source_definitions
-                 if definition["name"].lower() == name_lower
-             ),
-             None,
-         )
-
-     def get_destination_definition_by_name(self, name: str):
-         name_lower = name.lower()
-         definitions = cast(
-             "dict[str, list[dict[str, str]]]",
-             check.not_none(
-                 self.make_request_cached(endpoint="/destination_definitions/list", data={})
-             ),
-         )
-         return next(
-             (
-                 definition["destinationDefinitionId"]
-                 for definition in definitions["destinationDefinitions"]
-                 if definition["name"].lower() == name_lower
-             ),
-             None,
-         )
-
-     def get_source_catalog_id(self, source_id: str):
-         result = cast(
-             "dict[str, Any]",
-             check.not_none(
-                 self.make_request(endpoint="/sources/discover_schema", data={"sourceId": source_id})
-             ),
-         )
-         return result["catalogId"]
-
-     def get_source_schema(self, source_id: str) -> Mapping[str, Any]:
-         return cast(
-             "dict[str, Any]",
-             check.not_none(
-                 self.make_request(endpoint="/sources/discover_schema", data={"sourceId": source_id})
-             ),
-         )
-
-     def does_dest_support_normalization(
-         self, destination_definition_id: str, workspace_id: str
-     ) -> bool:
-         # Airbyte API changed source of truth for normalization in PR
-         # https://github.com/airbytehq/airbyte/pull/21005
-         norm_dest_def_spec: bool = cast(
-             "dict[str, Any]",
-             check.not_none(
-                 self.make_request_cached(
-                     endpoint="/destination_definition_specifications/get",
-                     data={
-                         "destinationDefinitionId": destination_definition_id,
-                         "workspaceId": workspace_id,
-                     },
-                 )
-             ),
-         ).get("supportsNormalization", False)
-
-         norm_dest_def: bool = (
-             cast(
-                 "dict[str, Any]",
-                 check.not_none(
-                     self.make_request_cached(
-                         endpoint="/destination_definitions/get",
-                         data={
-                             "destinationDefinitionId": destination_definition_id,
-                         },
-                     )
-                 ),
-             )
-             .get("normalizationConfig", {})
-             .get("supported", False)
-         )
-
-         return any([norm_dest_def_spec, norm_dest_def])
-
-     def get_job_status(self, connection_id: str, job_id: int) -> Mapping[str, object]:
-         if self.forward_logs:
-             return check.not_none(self.make_request(endpoint="/jobs/get", data={"id": job_id}))
-         else:
-             # the "list all jobs" endpoint doesn't return logs, which actually makes it much more
-             # lightweight for long-running syncs with many logs
-             out = check.not_none(
-                 self.make_request(
-                     endpoint="/jobs/list",
-                     data={
-                         "configTypes": ["sync"],
-                         "configId": connection_id,
-                         # sync should be the most recent, so pageSize 5 is sufficient
-                         "pagination": {"pageSize": 5},
-                     },
-                 )
-             )
-             job = next(
-                 (job for job in cast("list", out["jobs"]) if job["job"]["id"] == job_id), None
-             )
-
-             return check.not_none(job)
-
-     def start_sync(self, connection_id: str) -> Mapping[str, object]:
-         return check.not_none(
-             self.make_request(endpoint="/connections/sync", data={"connectionId": connection_id})
-         )
-
-     def get_connection_details(self, connection_id: str) -> Mapping[str, object]:
-         return check.not_none(
-             self.make_request(endpoint="/connections/get", data={"connectionId": connection_id})
-         )
-
-     def sync_and_poll(
-         self,
-         connection_id: str,
-         poll_interval: Optional[float] = None,
-         poll_timeout: Optional[float] = None,
-     ) -> AirbyteOutput:
-         """Initializes a sync operation for the given connector, and polls until it completes.
-
-         Args:
-             connection_id (str): The Airbyte Connector ID. You can retrieve this value from the
-                 "Connection" tab of a given connection in the Arbyte UI.
-             poll_interval (float): The time (in seconds) that will be waited between successive polls.
-             poll_timeout (float): The maximum time that will waited before this operation is timed
-                 out. By default, this will never time out.
-
-         Returns:
-             :py:class:`~AirbyteOutput`:
-                 Details of the sync job.
-         """
-         connection_details = self.get_connection_details(connection_id)
-         job_details = self.start_sync(connection_id)
-         job_info = cast("dict[str, object]", job_details.get("job", {}))
-         job_id = cast("int", job_info.get("id"))
-
-         self._log.info(f"Job {job_id} initialized for connection_id={connection_id}.")
-         start = time.monotonic()
-         logged_attempts = 0
-         logged_lines = 0
-         state = None
-
-         try:
-             while True:
-                 if poll_timeout and start + poll_timeout < time.monotonic():
-                     raise Failure(
-                         f"Timeout: Airbyte job {job_id} is not ready after the timeout"
-                         f" {poll_timeout} seconds"
-                     )
-                 time.sleep(poll_interval or self.poll_interval)
-                 job_details = self.get_job_status(connection_id, job_id)
-                 attempts = cast("list", job_details.get("attempts", []))
-                 cur_attempt = len(attempts)
-                 # spit out the available Airbyte log info
-                 if cur_attempt:
-                     if self.forward_logs:
-                         log_lines = attempts[logged_attempts].get("logs", {}).get("logLines", [])
-
-                         for line in log_lines[logged_lines:]:
-                             sys.stdout.write(line + "\n")
-                             sys.stdout.flush()
-                         logged_lines = len(log_lines)
-
-                     # if there's a next attempt, this one will have no more log messages
-                     if logged_attempts < cur_attempt - 1:
-                         logged_lines = 0
-                         logged_attempts += 1
-
-                 job_info = cast("dict[str, object]", job_details.get("job", {}))
-                 state = job_info.get("status")
-
-                 if state in (
-                     AirbyteJobStatusType.RUNNING,
-                     AirbyteJobStatusType.PENDING,
-                     AirbyteJobStatusType.INCOMPLETE,
-                 ):
-                     continue
-                 elif state == AirbyteJobStatusType.SUCCEEDED:
-                     break
-                 elif state == AirbyteJobStatusType.ERROR:
-                     raise Failure(f"Job failed: {job_id}")
-                 elif state == AirbyteJobStatusType.CANCELLED:
-                     raise Failure(f"Job was cancelled: {job_id}")
-                 else:
-                     raise Failure(f"Encountered unexpected state `{state}` for job_id {job_id}")
-         finally:
-             # if Airbyte sync has not completed, make sure to cancel it so that it doesn't outlive
-             # the python process
-             if (
-                 state
-                 not in (
-                     AirbyteJobStatusType.SUCCEEDED,
-                     AirbyteJobStatusType.ERROR,
-                     AirbyteJobStatusType.CANCELLED,
-                 )
-                 and self.cancel_sync_on_run_termination
-             ):
-                 self.cancel_job(job_id)
-
-         return AirbyteOutput(job_details=job_details, connection_details=connection_details)
-
-
- @dagster_maintained_resource
- @resource(config_schema=AirbyteResource.to_config_schema())
- def airbyte_resource(context) -> AirbyteResource:
-     """This resource allows users to programatically interface with the Airbyte REST API to launch
-     syncs and monitor their progress. This currently implements only a subset of the functionality
-     exposed by the API.
-
-     For a complete set of documentation on the Airbyte REST API, including expected response JSON
-     schema, see the `Airbyte API Docs <https://airbyte-public-api-docs.s3.us-east-2.amazonaws.com/rapidoc-api-docs.html#overview>`_.
-
-     To configure this resource, we recommend using the `configured
-     <https://legacy-docs.dagster.io/concepts/configuration/configured>`_ method.
-
-     **Examples:**
-
-     .. code-block:: python
-
-         from dagster import job
-         from dagster_airbyte import airbyte_resource
-
-         my_airbyte_resource = airbyte_resource.configured(
-             {
-                 "host": {"env": "AIRBYTE_HOST"},
-                 "port": {"env": "AIRBYTE_PORT"},
-                 # If using basic auth
-                 "username": {"env": "AIRBYTE_USERNAME"},
-                 "password": {"env": "AIRBYTE_PASSWORD"},
-             }
-         )
-
-         @job(resource_defs={"airbyte":my_airbyte_resource})
-         def my_airbyte_job():
-             ...
-
-     """
-     return AirbyteResource.from_resource_context(context)
-
-
- @superseded(additional_warn_text=("Use `AirbyteCloudWorkspace` instead."))
- @dagster_maintained_resource
- @resource(config_schema=infer_schema_from_config_class(AirbyteCloudResource))
- def airbyte_cloud_resource(context) -> AirbyteCloudResource:
-     """This resource allows users to programatically interface with the Airbyte Cloud REST API to launch
-     syncs and monitor their progress. Currently, this resource may only be used with the more basic
-     `dagster-airbyte` APIs, including the ops and assets.
-
-     """
-     return AirbyteCloudResource.from_resource_context(context)
-
-
- # -------------
- # Resources v2
- # -------------
-
-
- @beta
- class AirbyteCloudClient(DagsterModel):
-     """This class exposes methods on top of the Airbyte APIs for Airbyte Cloud."""
-
      workspace_id: str = Field(..., description="The Airbyte workspace ID")
-     client_id: str = Field(..., description="The Airbyte client ID.")
-     client_secret: str = Field(..., description="The Airbyte client secret.")
+     client_id: Optional[str] = Field(default=None, description="The Airbyte client ID.")
+     client_secret: Optional[str] = Field(default=None, description="The Airbyte client secret.")
+     username: Optional[str] = Field(
+         default=None,
+         description="The Airbyte username for authentication. Used for self-managed Airbyte with basic auth.",
+     )
+     password: Optional[str] = Field(
+         default=None,
+         description="The Airbyte password for authentication. Used for self-managed Airbyte with basic auth.",
+     )
      request_max_retries: int = Field(
          ...,
          description=(
@@ -870,30 +108,82 @@ class AirbyteCloudClient(DagsterModel):
          ...,
          description="Time (in seconds) after which the requests to Airbyte are declared timed out.",
      )
+     max_items_per_page: int = Field(
+         default=100,
+         description=(
+             "The maximum number of items per page. "
+             "Used for paginated resources like connections, destinations, etc. "
+         ),
+     )
+     poll_interval: float = Field(
+         default=DEFAULT_POLL_INTERVAL_SECONDS,
+         description="The time (in seconds) that will be waited between successive polls.",
+     )
+     poll_timeout: Optional[float] = Field(
+         default=None,
+         description=(
+             "The maximum time to wait before this operation is timed "
+             "out. By default, this will never time out."
+         ),
+     )
+     cancel_on_termination: bool = Field(
+         default=True,
+         description=(
+             "Whether to cancel a sync in Airbyte if the Dagster runner is terminated. "
+             "This may be useful to disable if using Airbyte sources that cannot be cancelled and "
+             "resumed easily, or if your Dagster deployment may experience runner interruptions "
+             "that do not impact your Airbyte deployment."
+         ),
+     )
+     poll_previous_running_sync: bool = Field(
+         default=False,
+         description=(
+             "If set to True, Dagster will check for a previous running sync for the same connection "
+             "and begin polling it instead of starting a new sync."
+         ),
+     )

      _access_token_value: Optional[str] = PrivateAttr(default=None)
      _access_token_timestamp: Optional[float] = PrivateAttr(default=None)

+     @model_validator(mode="before")
+     def validate_authentication(cls, values):
+         has_client_id = values.get("client_id") is not None
+         has_client_secret = values.get("client_secret") is not None
+         has_username = values.get("username") is not None
+         has_password = values.get("password") is not None
+
+         check.invariant(
+             has_username == has_password,
+             "Missing config: both username and password are required for Airbyte authentication.",
+         )
+
+         check.invariant(
+             has_client_id == has_client_secret,
+             "Missing config: both client_id and client_secret are required for Airbyte authentication.",
+         )
+
+         check.invariant(
+             not ((has_client_id or has_client_secret) and (has_username or has_password)),
+             "Invalid config: cannot provide both client_id/client_secret and username/password for Airbyte authentication.",
+         )
+         return values
+
      @property
      @cached_method
      def _log(self) -> logging.Logger:
          return get_dagster_logger()

      @property
-     def rest_api_base_url(self) -> str:
-         return f"{AIRBYTE_REST_API_BASE}/{AIRBYTE_REST_API_VERSION}"
-
-     @property
-     def configuration_api_base_url(self) -> str:
-         return f"{AIRBYTE_CONFIGURATION_API_BASE}/{AIRBYTE_CONFIGURATION_API_VERSION}"
+     def all_additional_request_headers(self) -> Mapping[str, Any]:
+         return {**self.authorization_request_headers, **self.user_agent_request_headers}

      @property
-     def all_additional_request_params(self) -> Mapping[str, Any]:
-         return {**self.authorization_request_params, **self.user_agent_request_params}
-
-     @property
-     def authorization_request_params(self) -> Mapping[str, Any]:
+     def authorization_request_headers(self) -> Mapping[str, Any]:
          # Make sure the access token is refreshed before using it when calling the API.
+         if not (self.client_id and self.client_secret):
+             return {}
+
          if self._needs_refreshed_access_token():
              self._refresh_access_token()
          return {
@@ -901,23 +191,22 @@ class AirbyteCloudClient(DagsterModel):
          }

      @property
-     def user_agent_request_params(self) -> Mapping[str, Any]:
+     def user_agent_request_headers(self) -> Mapping[str, Any]:
          return {
              "User-Agent": "dagster",
          }

      def _refresh_access_token(self) -> None:
          response = check.not_none(
-             self._make_request(
+             self._single_request(
                  method="POST",
-                 endpoint="applications/token",
-                 base_url=self.rest_api_base_url,
+                 url=f"{self.rest_api_base_url}/applications/token",
                  data={
                      "client_id": self.client_id,
                      "client_secret": self.client_secret,
                  },
                  # Must not pass the bearer access token when refreshing it.
-                 include_additional_request_params=False,
+                 include_additional_request_headers=False,
              )
          )
          self._access_token_value = str(response["access_token"])
@@ -928,52 +217,38 @@ class AirbyteCloudClient(DagsterModel):
              not self._access_token_value
              or not self._access_token_timestamp
              or self._access_token_timestamp
-             <= (
-                 datetime.now() - timedelta(seconds=AIRBYTE_CLOUD_REFRESH_TIMEDELTA_SECONDS)
-             ).timestamp()
+             <= (datetime.now() - timedelta(seconds=AIRBYTE_REFRESH_TIMEDELTA_SECONDS)).timestamp()
          )

-     def _get_session(self, include_additional_request_params: bool) -> requests.Session:
+     def _get_session(self, include_additional_request_headers: bool) -> requests.Session:
          headers = {"accept": "application/json"}
-         if include_additional_request_params:
+         if include_additional_request_headers:
              headers = {
                  **headers,
-                 **self.all_additional_request_params,
+                 **self.all_additional_request_headers,
              }
          session = requests.Session()
          session.headers.update(headers)
+
+         if self.username and self.password:
+             session.auth = (self.username, self.password)
+
          return session

-     def _make_request(
+     def _single_request(
          self,
          method: str,
-         endpoint: str,
-         base_url: str,
+         url: str,
          data: Optional[Mapping[str, Any]] = None,
          params: Optional[Mapping[str, Any]] = None,
-         include_additional_request_params: bool = True,
+         include_additional_request_headers: bool = True,
      ) -> Mapping[str, Any]:
-         """Creates and sends a request to the desired Airbyte REST API endpoint.
-
-         Args:
-             method (str): The http method to use for this request (e.g. "POST", "GET", "PATCH").
-             endpoint (str): The Airbyte API endpoint to send this request to.
-             base_url (str): The base url to the Airbyte API to use.
-             data (Optional[Dict[str, Any]]): JSON-formatted data string to be included in the request.
-             params (Optional[Dict[str, Any]]): JSON-formatted query params to be included in the request.
-             include_additional_request_params (bool): Whether to include authorization and user-agent headers
-                 to the request parameters. Defaults to True.
-
-         Returns:
-             Dict[str, Any]: Parsed json data from the response to this request
-         """
-         url = f"{base_url}/{endpoint}"
-
+         """Execute a single HTTP request with retry logic."""
          num_retries = 0
          while True:
              try:
                  session = self._get_session(
-                     include_additional_request_params=include_additional_request_params
+                     include_additional_request_headers=include_additional_request_headers
                  )
                  response = session.request(
                      method=method, url=url, json=data, params=params, timeout=self.request_timeout
@@ -989,17 +264,75 @@ class AirbyteCloudClient(DagsterModel):
                  num_retries += 1
                  time.sleep(self.request_retry_delay)

-         raise Failure(f"Max retries ({self.request_max_retries}) exceeded with url: {url}.")
+         raise Failure(f"Max retries ({self.request_max_retries}) exceeded with url: {url}.")
+
+         return {}
+
+     def _paginated_request(
+         self,
+         method: str,
+         url: str,
+         params: dict[str, Any],
+         data: Optional[Mapping[str, Any]] = None,
+         include_additional_request_params: bool = True,
+     ) -> Sequence[Mapping[str, Any]]:
+         """Execute paginated requests and yield all items."""
+         result_data = []
+         params = {"limit": self.max_items_per_page, **params}
+         while True:
+             response = self._single_request(
+                 method=method,
+                 url=url,
+                 data=data,
+                 params=params,
+                 include_additional_request_headers=include_additional_request_params,
+             )
+
+             # Handle different response structures
+             result_data.extend(response.get("data", []))
+             next_url = response.get("next", "")
+             if not next_url:
+                 break
+
+             # Parse the query string for the next page
+             next_params = parse_qsl(urlparse(next_url).query)
+             # Overwrite the pagination params with the ones for the next page
+             params.update(dict(next_params))
+
+         return result_data

-     def get_connections(self) -> Mapping[str, Any]:
+     def validate_workspace_id(self) -> None:
+         """Fetches workspace details. This is used to validate that the workspace exists."""
+         self._single_request(
+             method="GET",
+             url=f"{self.rest_api_base_url}/workspaces/{self.workspace_id}",
+         )
+
+     def get_connections(self) -> Sequence[Mapping[str, Any]]:
          """Fetches all connections of an Airbyte workspace from the Airbyte REST API."""
-         return self._make_request(
+         return self._paginated_request(
              method="GET",
-             endpoint="connections",
-             base_url=self.rest_api_base_url,
+             url=f"{self.rest_api_base_url}/connections",
              params={"workspaceIds": self.workspace_id},
          )

+     def get_jobs_for_connection(
+         self, connection_id: str, created_after: datetime | None = None
+     ) -> Sequence[AirbyteJob]:
+         """Fetches all jobs for a specific connection of an Airbyte workspace from the Airbyte REST API."""
+         params = {"workspaceIds": self.workspace_id, "connectionId": connection_id}
+         if created_after:
+             params["createdAtStart"] = created_after.strftime("%Y-%m-%dT%H:%M:%SZ")
+
+         return [
+             AirbyteJob.from_job_details(job_details=job_details)
+             for job_details in self._paginated_request(
+                 method="GET",
+                 url=f"{self.rest_api_base_url}/jobs",
+                 params=params,
+             )
+         ]
+
      def get_connection_details(self, connection_id) -> Mapping[str, Any]:
          """Fetches details about a given connection from the Airbyte Configuration API.
          The Airbyte Configuration API is internal and may change in the future.
@@ -1007,26 +340,23 @@ class AirbyteCloudClient(DagsterModel):
          # Using the Airbyte Configuration API to get the connection details, including streams and their configs.
          # https://airbyte-public-api-docs.s3.us-east-2.amazonaws.com/rapidoc-api-docs.html#post-/v1/connections/get
          # https://github.com/airbytehq/airbyte-platform/blob/v1.0.0/airbyte-api/server-api/src/main/openapi/config.yaml
-         return self._make_request(
+         return self._single_request(
              method="POST",
-             endpoint="connections/get",
-             base_url=self.configuration_api_base_url,
+             url=f"{self.configuration_api_base_url}/connections/get",
              data={"connectionId": connection_id},
          )

      def get_destination_details(self, destination_id: str) -> Mapping[str, Any]:
          """Fetches details about a given destination from the Airbyte REST API."""
-         return self._make_request(
+         return self._single_request(
              method="GET",
-             endpoint=f"destinations/{destination_id}",
-             base_url=self.rest_api_base_url,
+             url=f"{self.rest_api_base_url}/destinations/{destination_id}",
          )

      def start_sync_job(self, connection_id: str) -> Mapping[str, Any]:
-         return self._make_request(
+         return self._single_request(
              method="POST",
-             endpoint="jobs",
-             base_url=self.rest_api_base_url,
+             url=f"{self.rest_api_base_url}/jobs",
              data={
                  "connectionId": connection_id,
                  "jobType": "sync",
@@ -1034,59 +364,76 @@ class AirbyteCloudClient(DagsterModel):
          )

      def get_job_details(self, job_id: int) -> Mapping[str, Any]:
-         return self._make_request(
-             method="GET", endpoint=f"jobs/{job_id}", base_url=self.rest_api_base_url
+         return self._single_request(
+             method="GET",
+             url=f"{self.rest_api_base_url}/jobs/{job_id}",
          )

      def cancel_job(self, job_id: int) -> Mapping[str, Any]:
-         return self._make_request(
-             method="DELETE", endpoint=f"jobs/{job_id}", base_url=self.rest_api_base_url
+         return self._single_request(
+             method="DELETE",
+             url=f"{self.rest_api_base_url}/jobs/{job_id}",
          )

-     def sync_and_poll(
-         self,
-         connection_id: str,
-         poll_interval: Optional[float] = None,
-         poll_timeout: Optional[float] = None,
-         cancel_on_termination: bool = True,
-     ) -> AirbyteOutput:
+     def sync_and_poll(self, connection_id: str) -> AirbyteOutput:
          """Initializes a sync operation for the given connection, and polls until it completes.

          Args:
              connection_id (str): The Airbyte Connection ID. You can retrieve this value from the
                  "Connection" tab of a given connection in the Airbyte UI.
-             poll_interval (float): The time (in seconds) that will be waited between successive polls.
-             poll_timeout (float): The maximum time that will wait before this operation is timed
-                 out. By default, this will never time out.
-             cancel_on_termination (bool): Whether to cancel a sync in Airbyte if the Dagster runner is terminated.
-                 This may be useful to disable if using Airbyte sources that cannot be cancelled and
-                 resumed easily, or if your Dagster deployment may experience runner interruptions
-                 that do not impact your Airbyte deployment.

          Returns:
              :py:class:`~AirbyteOutput`:
                  Details of the sync job.
          """
          connection_details = self.get_connection_details(connection_id)
-         start_job_details = self.start_sync_job(connection_id)
-         job = AirbyteJob.from_job_details(job_details=start_job_details)

-         self._log.info(f"Job {job.id} initialized for connection_id={connection_id}.")
+         existing_jobs = [
+             job
+             for job in self.get_jobs_for_connection(
+                 connection_id=connection_id,
+                 created_after=datetime.now() - timedelta(days=2),
+             )
+             if job.status
+             in (
+                 AirbyteJobStatusType.RUNNING,
+                 AirbyteJobStatusType.PENDING,
+                 AirbyteJobStatusType.INCOMPLETE,
+             )
+         ]
+
+         if not existing_jobs:
+             start_job_details = self.start_sync_job(connection_id)
+             job = AirbyteJob.from_job_details(job_details=start_job_details)
+             self._log.info(f"Job {job.id} initialized for connection_id={connection_id}.")
+         else:
+             if self.poll_previous_running_sync:
+                 if len(existing_jobs) == 1:
+                     job = existing_jobs[0]
+                     self._log.info(
+                         f"Job {job.id} already running for connection_id={connection_id}. Resume polling."
+                     )
+                 else:
+                     raise Failure(f"Found multiple running jobs for connection_id={connection_id}.")
+             else:
+                 raise Failure(f"Found sync job for connection_id={connection_id} already running.")
+
          poll_start = datetime.now()
-         poll_interval = (
-             poll_interval if poll_interval is not None else DEFAULT_POLL_INTERVAL_SECONDS
-         )
+
          try:
              while True:
-                 if poll_timeout and datetime.now() > poll_start + timedelta(seconds=poll_timeout):
+                 if self.poll_timeout and datetime.now() > poll_start + timedelta(
+                     seconds=self.poll_timeout
+                 ):
                      raise Failure(
                          f"Timeout: Airbyte job {job.id} is not ready after the timeout"
-                         f" {poll_timeout} seconds"
+                         f" {self.poll_timeout} seconds"
                      )

-                 time.sleep(poll_interval)
+                 time.sleep(self.poll_interval)
                  # We return these job details in the AirbyteOutput when the job succeeds
                  poll_job_details = self.get_job_details(job.id)
+                 self._log.debug(poll_job_details)
                  job = AirbyteJob.from_job_details(job_details=poll_job_details)
                  if job.status in (
                      AirbyteJobStatusType.RUNNING,
@@ -1107,7 +454,7 @@ class AirbyteCloudClient(DagsterModel):
          finally:
              # if Airbyte sync has not completed, make sure to cancel it so that it doesn't outlive
              # the python process
-             if cancel_on_termination and job.status not in (
+             if self.cancel_on_termination and job.status not in (
                  AirbyteJobStatusType.SUCCEEDED,
                  AirbyteJobStatusType.ERROR,
                  AirbyteJobStatusType.CANCELLED,
@@ -1119,14 +466,11 @@ class AirbyteCloudClient(DagsterModel):


  @beta
- class AirbyteCloudWorkspace(ConfigurableResource):
-     """This class represents a Airbyte Cloud workspace and provides utilities
+ class BaseAirbyteWorkspace(ConfigurableResource):
+     """This class represents an Airbyte workspace and provides utilities
      to interact with Airbyte APIs.
      """

-     workspace_id: str = Field(..., description="The Airbyte Cloud workspace ID")
-     client_id: str = Field(..., description="The Airbyte Cloud client ID.")
-     client_secret: str = Field(..., description="The Airbyte Cloud client secret.")
      request_max_retries: int = Field(
          default=3,
          description=(
@@ -1142,19 +486,42 @@ class AirbyteCloudWorkspace(ConfigurableResource):
          default=15,
          description="Time (in seconds) after which the requests to Airbyte are declared timed out.",
      )
+     max_items_per_page: int = Field(
+         default=100,
+         description=(
+             "The maximum number of items per page. "
+             "Used for paginated resources like connections, destinations, etc. "
+         ),
+     )
+     poll_interval: float = Field(
+         default=DEFAULT_POLL_INTERVAL_SECONDS,
+         description="The time (in seconds) that will be waited between successive polls.",
+     )
+     poll_timeout: Optional[float] = Field(
+         default=None,
+         description=(
+             "The maximum time to wait before this operation is timed "
+             "out. By default, this will never time out."
+         ),
+     )
+     cancel_on_termination: bool = Field(
+         default=True,
+         description=(
+             "Whether to cancel a sync in Airbyte if the Dagster runner is terminated. "
+             "This may be useful to disable if using Airbyte sources that cannot be cancelled and "
+             "resumed easily, or if your Dagster deployment may experience runner interruptions "
+             "that do not impact your Airbyte deployment."
+         ),
+     )
+     poll_previous_running_sync: bool = Field(
+         default=False,
+         description=(
+             "If set to True, Dagster will check for a previous running sync for the same connection "
+             "and begin polling it instead of starting a new sync."
+         ),
+     )

-     _client: AirbyteCloudClient = PrivateAttr(default=None)  # type: ignore
-
-     @cached_method
-     def get_client(self) -> AirbyteCloudClient:
-         return AirbyteCloudClient(
-             workspace_id=self.workspace_id,
-             client_id=self.client_id,
-             client_secret=self.client_secret,
-             request_max_retries=self.request_max_retries,
-             request_retry_delay=self.request_retry_delay,
-             request_timeout=self.request_timeout,
-         )
+     _client: AirbyteClient = PrivateAttr(default=None)  # type: ignore

      @cached_method
      def fetch_airbyte_workspace_data(
@@ -1169,7 +536,10 @@ class AirbyteCloudWorkspace(ConfigurableResource):
          destinations_by_id = {}

          client = self.get_client()
-         connections = client.get_connections()["data"]
+
+         client.validate_workspace_id()
+
+         connections = client.get_connections()

          for partial_connection_details in connections:
              full_connection_details = client.get_connection_details(
@@ -1197,6 +567,7 @@ class AirbyteCloudWorkspace(ConfigurableResource):
      def load_asset_specs(
          self,
          dagster_airbyte_translator: Optional[DagsterAirbyteTranslator] = None,
+         connection_selector_fn: Optional[Callable[[AirbyteConnection], bool]] = None,
      ) -> Sequence[AssetSpec]:
          """Returns a list of AssetSpecs representing the Airbyte content in the workspace.

@@ -1204,6 +575,8 @@ class AirbyteCloudWorkspace(ConfigurableResource):
              dagster_airbyte_translator (Optional[DagsterAirbyteTranslator], optional): The translator to use
                  to convert Airbyte content into :py:class:`dagster.AssetSpec`.
                  Defaults to :py:class:`DagsterAirbyteTranslator`.
+             connection_selector_fn (Optional[Callable[[AirbyteConnection], bool]]): A function that allows for filtering
+                 which Airbyte connection assets are created for.

          Returns:
              List[AssetSpec]: The set of assets representing the Airbyte content in the workspace.
@@ -1212,23 +585,25 @@ class AirbyteCloudWorkspace(ConfigurableResource):
              Loading the asset specs for a given Airbyte workspace:
              .. code-block:: python

-                 from dagster_airbyte import AirbyteCloudWorkspace
+                 from dagster_airbyte import AirbyteWorkspace

                  import dagster as dg

-                 airbyte_workspace = AirbyteCloudWorkspace(
-                     workspace_id=dg.EnvVar("AIRBYTE_CLOUD_WORKSPACE_ID"),
-                     client_id=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_ID"),
-                     client_secret=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_SECRET"),
+                 airbyte_workspace = AirbyteWorkspace(
+                     workspace_id=dg.EnvVar("AIRBYTE_WORKSPACE_ID"),
+                     client_id=dg.EnvVar("AIRBYTE_CLIENT_ID"),
+                     client_secret=dg.EnvVar("AIRBYTE_CLIENT_SECRET"),
                  )

                  airbyte_specs = airbyte_workspace.load_asset_specs()
-                 defs = dg.Definitions(assets=airbyte_specs, resources={"airbyte": airbyte_workspace}
+                 dg.Definitions(assets=airbyte_specs, resources={"airbyte": airbyte_workspace})
          """
          dagster_airbyte_translator = dagster_airbyte_translator or DagsterAirbyteTranslator()

-         return load_airbyte_cloud_asset_specs(
-             workspace=self, dagster_airbyte_translator=dagster_airbyte_translator
+         return load_airbyte_asset_specs(
+             workspace=self,
+             dagster_airbyte_translator=dagster_airbyte_translator,
+             connection_selector_fn=connection_selector_fn,
          )

      def _generate_materialization(
@@ -1263,7 +638,7 @@ class AirbyteCloudWorkspace(ConfigurableResource):
              yield AssetMaterialization(
                  asset_key=stream_asset_spec.key,
                  description=(
-                     f"Table generated via Airbyte Cloud sync "
+                     f"Table generated via Airbyte sync "
                      f"for connection {connection.name}: {connection_table_name}"
                  ),
                  metadata=stream_asset_spec.metadata,
@@ -1272,7 +647,7 @@ class AirbyteCloudWorkspace(ConfigurableResource):
      @public
      @beta
      def sync_and_poll(self, context: AssetExecutionContext):
-         """Executes a sync and poll process to materialize Airbyte Cloud assets.
+         """Executes a sync and poll process to materialize Airbyte assets.
          This method can only be used in the context of an asset execution.

          Args:
@@ -1317,53 +692,264 @@ class AirbyteCloudWorkspace(ConfigurableResource):
1317
692
  if unmaterialized_asset_keys:
1318
693
  context.log.warning(f"Assets were not materialized: {unmaterialized_asset_keys}")
1319
694
 
695
+ @contextmanager
696
+ def process_config_and_initialize_cm_cached(self) -> Iterator["AirbyteWorkspace"]:
697
+ # Hack to avoid reconstructing initialized copies of this resource, which invalidates
698
+ # @cached_method caches. This means that multiple calls to load_airbyte_asset_specs
699
+ # will not trigger multiple API calls to fetch the workspace data.
700
+ # Bespoke impl since @cached_method doesn't play nice with iterators; it's exhausted after
701
+ # the first call.
702
+ if hasattr(self, "_initialized"):
703
+ yield getattr(self, "_initialized")
704
+ else:
705
+ with self.process_config_and_initialize_cm() as initialized_workspace:
706
+ initialized = initialized_workspace
707
+ setattr(self, "_initialized", initialized)
708
+ yield initialized
709
+

  @beta
- def load_airbyte_cloud_asset_specs(
-     workspace: AirbyteCloudWorkspace,
+ class AirbyteWorkspace(BaseAirbyteWorkspace):
+     """This resource allows users to programmatically interface with the Airbyte REST API to launch
+     syncs and monitor their progress for a given Airbyte workspace.
+
+     **Examples:**
+
+     Using OAuth client credentials:
+
+     .. code-block:: python
+
+         import dagster as dg
+         from dagster_airbyte import AirbyteWorkspace, build_airbyte_assets_definitions
+
+         airbyte_workspace = AirbyteWorkspace(
+             rest_api_base_url=dg.EnvVar("AIRBYTE_REST_API_BASE_URL"),
+             configuration_api_base_url=dg.EnvVar("AIRBYTE_CONFIGURATION_API_BASE_URL"),
+             workspace_id=dg.EnvVar("AIRBYTE_WORKSPACE_ID"),
+             client_id=dg.EnvVar("AIRBYTE_CLIENT_ID"),
+             client_secret=dg.EnvVar("AIRBYTE_CLIENT_SECRET"),
+         )
+
+         all_airbyte_assets = build_airbyte_assets_definitions(workspace=airbyte_workspace)
+
+         defs = dg.Definitions(
+             assets=all_airbyte_assets,
+             resources={"airbyte": airbyte_workspace},
+         )
+
+     Using basic authentication:
+
+     .. code-block:: python
+
+         import dagster as dg
+         from dagster_airbyte import AirbyteWorkspace, build_airbyte_assets_definitions
+
+         airbyte_workspace = AirbyteWorkspace(
+             rest_api_base_url=dg.EnvVar("AIRBYTE_REST_API_BASE_URL"),
+             configuration_api_base_url=dg.EnvVar("AIRBYTE_CONFIGURATION_API_BASE_URL"),
+             workspace_id=dg.EnvVar("AIRBYTE_WORKSPACE_ID"),
+             username=dg.EnvVar("AIRBYTE_USERNAME"),
+             password=dg.EnvVar("AIRBYTE_PASSWORD"),
+         )
+
+         all_airbyte_assets = build_airbyte_assets_definitions(workspace=airbyte_workspace)
+
+         defs = dg.Definitions(
+             assets=all_airbyte_assets,
+             resources={"airbyte": airbyte_workspace},
+         )
+
+     Using no authentication:
+
+     .. code-block:: python
+
+         import dagster as dg
+         from dagster_airbyte import AirbyteWorkspace, build_airbyte_assets_definitions
+
+         airbyte_workspace = AirbyteWorkspace(
+             rest_api_base_url=dg.EnvVar("AIRBYTE_REST_API_BASE_URL"),
+             configuration_api_base_url=dg.EnvVar("AIRBYTE_CONFIGURATION_API_BASE_URL"),
+             workspace_id=dg.EnvVar("AIRBYTE_WORKSPACE_ID"),
+         )
+
+         all_airbyte_assets = build_airbyte_assets_definitions(workspace=airbyte_workspace)
+
+         defs = dg.Definitions(
+             assets=all_airbyte_assets,
+             resources={"airbyte": airbyte_workspace},
+         )
+     """
+
+     rest_api_base_url: str = Field(
+         ...,
+         description="The base URL for the Airbyte REST API.",
+         examples=[
+             "http://localhost:8000/api/public/v1",
+             "https://my-airbyte-server.com/api/public/v1",
+             "http://airbyte-airbyte-server-svc.airbyte.svc.cluster.local:8001/api/public/v1",
+         ],
+     )
+     configuration_api_base_url: str = Field(
+         ...,
+         description="The base URL for the Airbyte Configuration API.",
+         examples=[
+             "http://localhost:8000/api/v1",
+             "https://my-airbyte-server.com/api/v1",
+             "http://airbyte-airbyte-server-svc.airbyte.svc.cluster.local:8001/api/v1",
+         ],
+     )
+     workspace_id: str = Field(..., description="The Airbyte workspace ID")
+     client_id: Optional[str] = Field(default=None, description="The Airbyte client ID.")
+     client_secret: Optional[str] = Field(default=None, description="The Airbyte client secret.")
+     username: Optional[str] = Field(
+         default=None, description="The Airbyte username for authentication."
+     )
+     password: Optional[str] = Field(
+         default=None, description="The Airbyte password for authentication."
+     )
+
+     @cached_method
+     def get_client(self) -> AirbyteClient:
+         return AirbyteClient(
+             rest_api_base_url=self.rest_api_base_url,
+             configuration_api_base_url=self.configuration_api_base_url,
+             workspace_id=self.workspace_id,
+             client_id=self.client_id,
+             client_secret=self.client_secret,
+             username=self.username,
+             password=self.password,
+             request_max_retries=self.request_max_retries,
+             request_retry_delay=self.request_retry_delay,
+             request_timeout=self.request_timeout,
+             max_items_per_page=self.max_items_per_page,
+             poll_interval=self.poll_interval,
+             poll_timeout=self.poll_timeout,
+             cancel_on_termination=self.cancel_on_termination,
+             poll_previous_running_sync=self.poll_previous_running_sync,
+         )
+
+
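With the resource defined, syncs are typically launched from an asset body via sync_and_poll (shown in an earlier hunk). A sketch of that wiring, assuming the package's airbyte_assets decorator and a placeholder connection ID:

    import dagster as dg
    from dagster_airbyte import AirbyteWorkspace, airbyte_assets

    airbyte_workspace = AirbyteWorkspace(
        rest_api_base_url=dg.EnvVar("AIRBYTE_REST_API_BASE_URL"),
        configuration_api_base_url=dg.EnvVar("AIRBYTE_CONFIGURATION_API_BASE_URL"),
        workspace_id=dg.EnvVar("AIRBYTE_WORKSPACE_ID"),
    )

    @airbyte_assets(
        connection_id="your_connection_id",  # placeholder: a real Airbyte connection ID
        workspace=airbyte_workspace,
    )
    def my_airbyte_assets(context: dg.AssetExecutionContext, airbyte: AirbyteWorkspace):
        # sync_and_poll launches the sync, polls until it finishes, and yields one
        # materialization event per synced stream.
        yield from airbyte.sync_and_poll(context=context)

    defs = dg.Definitions(
        assets=[my_airbyte_assets],
        resources={"airbyte": airbyte_workspace},
    )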
+ @beta
+ class AirbyteCloudWorkspace(BaseAirbyteWorkspace):
+     """This resource allows users to programmatically interface with the Airbyte Cloud REST API to launch
+     syncs and monitor their progress for a given Airbyte Cloud workspace.
+
+     **Examples:**
+
+     .. code-block:: python
+
+         from dagster_airbyte import AirbyteCloudWorkspace, build_airbyte_assets_definitions
+
+         import dagster as dg
+
+         airbyte_workspace = AirbyteCloudWorkspace(
+             workspace_id=dg.EnvVar("AIRBYTE_CLOUD_WORKSPACE_ID"),
+             client_id=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_ID"),
+             client_secret=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_SECRET"),
+         )
+
+         all_airbyte_assets = build_airbyte_assets_definitions(workspace=airbyte_workspace)
+
+         defs = dg.Definitions(
+             assets=all_airbyte_assets,
+             resources={"airbyte": airbyte_workspace},
+         )
+     """
+
+     rest_api_base_url: ClassVar[str] = AIRBYTE_CLOUD_REST_API_BASE_URL
+     configuration_api_base_url: ClassVar[str] = AIRBYTE_CLOUD_CONFIGURATION_API_BASE_URL
+     workspace_id: str = Field(..., description="The Airbyte workspace ID")
+     client_id: str = Field(..., description="The Airbyte client ID.")
+     client_secret: str = Field(..., description="The Airbyte client secret.")
+
+     @cached_method
+     def get_client(self) -> AirbyteClient:
+         return AirbyteClient(
+             rest_api_base_url=self.rest_api_base_url,
+             configuration_api_base_url=self.configuration_api_base_url,
+             workspace_id=self.workspace_id,
+             client_id=self.client_id,
+             client_secret=self.client_secret,
+             request_max_retries=self.request_max_retries,
+             request_retry_delay=self.request_retry_delay,
+             request_timeout=self.request_timeout,
+             max_items_per_page=self.max_items_per_page,
+             poll_interval=self.poll_interval,
+             poll_timeout=self.poll_timeout,
+             cancel_on_termination=self.cancel_on_termination,
+             poll_previous_running_sync=self.poll_previous_running_sync,
+         )
+
+
+ @public
+ @beta
+ def load_airbyte_asset_specs(
+     workspace: BaseAirbyteWorkspace,
      dagster_airbyte_translator: Optional[DagsterAirbyteTranslator] = None,
+     connection_selector_fn: Optional[Callable[[AirbyteConnection], bool]] = None,
  ) -> Sequence[AssetSpec]:
      """Returns a list of AssetSpecs representing the Airbyte content in the workspace.

      Args:
-         workspace (AirbyteCloudWorkspace): The Airbyte Cloud workspace to fetch assets from.
+         workspace (BaseAirbyteWorkspace): The Airbyte workspace to fetch assets from.
          dagster_airbyte_translator (Optional[DagsterAirbyteTranslator], optional): The translator to use
              to convert Airbyte content into :py:class:`dagster.AssetSpec`.
              Defaults to :py:class:`DagsterAirbyteTranslator`.
+         connection_selector_fn (Optional[Callable[[AirbyteConnection], bool]]): A function that filters
+             the Airbyte connections for which assets are created.

      Returns:
          List[AssetSpec]: The set of assets representing the Airbyte content in the workspace.

      Examples:
-         Loading the asset specs for a given Airbyte Cloud workspace:
+         Loading the asset specs for a given Airbyte workspace:

          .. code-block:: python

-             from dagster_airbyte import AirbyteCloudWorkspace, load_airbyte_cloud_asset_specs
+             from dagster_airbyte import AirbyteWorkspace, load_airbyte_asset_specs

              import dagster as dg

-             airbyte_cloud_workspace = AirbyteCloudWorkspace(
-                 workspace_id=dg.EnvVar("AIRBYTE_CLOUD_WORKSPACE_ID"),
-                 client_id=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_ID"),
-                 client_secret=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_SECRET"),
+             airbyte_workspace = AirbyteWorkspace(
+                 workspace_id=dg.EnvVar("AIRBYTE_WORKSPACE_ID"),
+                 client_id=dg.EnvVar("AIRBYTE_CLIENT_ID"),
+                 client_secret=dg.EnvVar("AIRBYTE_CLIENT_SECRET"),
              )

+             airbyte_specs = load_airbyte_asset_specs(airbyte_workspace)
+             dg.Definitions(assets=airbyte_specs)

-             airbyte_cloud_specs = load_airbyte_cloud_asset_specs(airbyte_cloud_workspace)
-             defs = dg.Definitions(assets=airbyte_cloud_specs)
+         Filter connections by name:
+
+         .. code-block:: python
+
+             from dagster_airbyte import AirbyteWorkspace, load_airbyte_asset_specs
+
+             import dagster as dg
+
+             airbyte_workspace = AirbyteWorkspace(
+                 workspace_id=dg.EnvVar("AIRBYTE_WORKSPACE_ID"),
+                 client_id=dg.EnvVar("AIRBYTE_CLIENT_ID"),
+                 client_secret=dg.EnvVar("AIRBYTE_CLIENT_SECRET"),
+             )
+
+             airbyte_specs = load_airbyte_asset_specs(
+                 workspace=airbyte_workspace,
+                 connection_selector_fn=lambda connection: connection.name in ["connection1", "connection2"]
+             )
+             dg.Definitions(assets=airbyte_specs)
      """
      dagster_airbyte_translator = dagster_airbyte_translator or DagsterAirbyteTranslator()

-     with workspace.process_config_and_initialize_cm() as initialized_workspace:
+     with workspace.process_config_and_initialize_cm_cached() as initialized_workspace:
          return [
              spec.merge_attributes(
                  metadata={DAGSTER_AIRBYTE_TRANSLATOR_METADATA_KEY: dagster_airbyte_translator}
              )
              for spec in check.is_list(
-                 AirbyteCloudWorkspaceDefsLoader(
+                 AirbyteWorkspaceDefsLoader(
                      workspace=initialized_workspace,
                      translator=dagster_airbyte_translator,
+                     connection_selector_fn=connection_selector_fn,
                  )
                  .build_defs()
                  .assets,
@@ -1372,22 +958,90 @@ def load_airbyte_cloud_asset_specs(
      ]

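connection_selector_fn accepts any predicate over AirbyteConnection, so selection is not limited to the exact-name matching shown in the docstring. A sketch using a name prefix instead ("prod_" is an assumed naming convention, not anything the API enforces):

    import dagster as dg
    from dagster_airbyte import AirbyteWorkspace, load_airbyte_asset_specs

    airbyte_workspace = AirbyteWorkspace(
        rest_api_base_url=dg.EnvVar("AIRBYTE_REST_API_BASE_URL"),
        configuration_api_base_url=dg.EnvVar("AIRBYTE_CONFIGURATION_API_BASE_URL"),
        workspace_id=dg.EnvVar("AIRBYTE_WORKSPACE_ID"),
    )

    # Keep only connections whose name marks them as production syncs.
    prod_specs = load_airbyte_asset_specs(
        workspace=airbyte_workspace,
        connection_selector_fn=lambda connection: connection.name.startswith("prod_"),
    )
    dg.Definitions(assets=prod_specs)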
+ @public
+ @superseded(additional_warn_text="Use load_airbyte_asset_specs instead.")
+ def load_airbyte_cloud_asset_specs(
+     workspace: AirbyteCloudWorkspace,
+     dagster_airbyte_translator: Optional[DagsterAirbyteTranslator] = None,
+     connection_selector_fn: Optional[Callable[[AirbyteConnection], bool]] = None,
+ ) -> Sequence[AssetSpec]:
+     """Returns a list of AssetSpecs representing the Airbyte content in the workspace.
+
+     Args:
+         workspace (AirbyteCloudWorkspace): The Airbyte Cloud workspace to fetch assets from.
+         dagster_airbyte_translator (Optional[DagsterAirbyteTranslator], optional): The translator to use
+             to convert Airbyte content into :py:class:`dagster.AssetSpec`.
+             Defaults to :py:class:`DagsterAirbyteTranslator`.
+         connection_selector_fn (Optional[Callable[[AirbyteConnection], bool]]): A function that filters
+             the Airbyte connections for which assets are created.
+
+     Returns:
+         List[AssetSpec]: The set of assets representing the Airbyte content in the workspace.
+
+     Examples:
+         Loading the asset specs for a given Airbyte Cloud workspace:
+
+         .. code-block:: python
+
+             from dagster_airbyte import AirbyteCloudWorkspace, load_airbyte_cloud_asset_specs
+
+             import dagster as dg
+
+             airbyte_cloud_workspace = AirbyteCloudWorkspace(
+                 workspace_id=dg.EnvVar("AIRBYTE_CLOUD_WORKSPACE_ID"),
+                 client_id=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_ID"),
+                 client_secret=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_SECRET"),
+             )
+
+             airbyte_cloud_specs = load_airbyte_cloud_asset_specs(airbyte_cloud_workspace)
+             dg.Definitions(assets=airbyte_cloud_specs)
+
+         Filter connections by name:
+
+         .. code-block:: python
+
+             from dagster_airbyte import AirbyteCloudWorkspace, load_airbyte_cloud_asset_specs
+
+             import dagster as dg
+
+             airbyte_cloud_workspace = AirbyteCloudWorkspace(
+                 workspace_id=dg.EnvVar("AIRBYTE_CLOUD_WORKSPACE_ID"),
+                 client_id=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_ID"),
+                 client_secret=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_SECRET"),
+             )
+
+             airbyte_cloud_specs = load_airbyte_cloud_asset_specs(
+                 workspace=airbyte_cloud_workspace,
+                 connection_selector_fn=lambda connection: connection.name in ["connection1", "connection2"]
+             )
+             dg.Definitions(assets=airbyte_cloud_specs)
+     """
+     return load_airbyte_asset_specs(
+         workspace=workspace,
+         dagster_airbyte_translator=dagster_airbyte_translator,
+         connection_selector_fn=connection_selector_fn,
+     )
+
+
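Because the superseded wrapper above only forwards its arguments, existing Airbyte Cloud call sites can migrate by swapping the function name; a before-and-after sketch:

    import dagster as dg
    from dagster_airbyte import AirbyteCloudWorkspace, load_airbyte_asset_specs

    airbyte_cloud_workspace = AirbyteCloudWorkspace(
        workspace_id=dg.EnvVar("AIRBYTE_CLOUD_WORKSPACE_ID"),
        client_id=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_ID"),
        client_secret=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_SECRET"),
    )

    # Before: specs = load_airbyte_cloud_asset_specs(airbyte_cloud_workspace)
    # After: the generic loader accepts any workspace resource, Cloud included.
    specs = load_airbyte_asset_specs(workspace=airbyte_cloud_workspace)
    dg.Definitions(assets=specs)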
  @record
- class AirbyteCloudWorkspaceDefsLoader(StateBackedDefinitionsLoader[Mapping[str, Any]]):
-     workspace: AirbyteCloudWorkspace
+ class AirbyteWorkspaceDefsLoader(StateBackedDefinitionsLoader[AirbyteWorkspaceData]):
+     workspace: Union[AirbyteWorkspace, AirbyteCloudWorkspace]
      translator: DagsterAirbyteTranslator
+     connection_selector_fn: Optional[Callable[[AirbyteConnection], bool]]

      @property
      def defs_key(self) -> str:
-         return f"{AIRBYTE_RECONSTRUCTION_METADATA_KEY_PREFIX}/{self.workspace.workspace_id}"
+         return f"{AIRBYTE_RECONSTRUCTION_METADATA_KEY_PREFIX}.{self.workspace.workspace_id}"

-     def fetch_state(self) -> AirbyteWorkspaceData:  # pyright: ignore[reportIncompatibleMethodOverride]
+     def fetch_state(self) -> AirbyteWorkspaceData:
          return self.workspace.fetch_airbyte_workspace_data()

-     def defs_from_state(self, state: AirbyteWorkspaceData) -> Definitions:  # pyright: ignore[reportIncompatibleMethodOverride]
+     def defs_from_state(self, state: AirbyteWorkspaceData) -> Definitions:
          all_asset_specs = [
              self.translator.get_asset_spec(props)
              for props in state.to_airbyte_connection_table_props_data()
+             if not self.connection_selector_fn
+             or self.connection_selector_fn(state.connections_by_id[props.connection_id])
          ]

          return Definitions(assets=all_asset_specs)
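Finally, since AirbyteWorkspaceDefsLoader builds every spec through translator.get_asset_spec, asset metadata can be customized by subclassing DagsterAirbyteTranslator. A minimal sketch; the props argument is left untyped because the concrete props class lives outside this diff, and the group name and metadata values are illustrative:

    import dagster as dg
    from dagster_airbyte import AirbyteWorkspace, DagsterAirbyteTranslator, load_airbyte_asset_specs

    class CustomAirbyteTranslator(DagsterAirbyteTranslator):
        def get_asset_spec(self, props) -> dg.AssetSpec:
            # Start from the default spec, then adjust presentation-level attributes.
            default_spec = super().get_asset_spec(props)
            return default_spec.replace_attributes(group_name="airbyte").merge_attributes(
                metadata={"team": "data-platform"}
            )

    airbyte_workspace = AirbyteWorkspace(
        rest_api_base_url=dg.EnvVar("AIRBYTE_REST_API_BASE_URL"),
        configuration_api_base_url=dg.EnvVar("AIRBYTE_CONFIGURATION_API_BASE_URL"),
        workspace_id=dg.EnvVar("AIRBYTE_WORKSPACE_ID"),
    )

    specs = load_airbyte_asset_specs(
        workspace=airbyte_workspace,
        dagster_airbyte_translator=CustomAirbyteTranslator(),
    )
    dg.Definitions(assets=specs)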