dagster-airbyte 0.24.3__py3-none-any.whl → 0.28.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,433 @@
+ from collections import defaultdict
+ from collections.abc import Callable, Iterable, Sequence
+ from functools import cached_property
+ from pathlib import Path
+ from typing import Annotated, Optional, Union
+
+ import dagster as dg
+ import pydantic
+ from dagster._annotations import superseded
+ from dagster._symbol_annotations.public import public
+ from dagster._utils.names import clean_name
+ from dagster.components.component.state_backed_component import StateBackedComponent
+ from dagster.components.resolved.base import resolve_fields
+ from dagster.components.utils.defs_state import (
+     DefsStateConfig,
+     DefsStateConfigArgs,
+     ResolvedDefsStateConfig,
+ )
+ from dagster.components.utils.translation import (
+     ComponentTranslator,
+     TranslationFn,
+     TranslationFnResolver,
+     create_component_translator_cls,
+ )
+ from dagster_shared import check
+ from dagster_shared.serdes.serdes import deserialize_value
+
+ from dagster_airbyte.components.workspace_component.scaffolder import (
+     AirbyteWorkspaceComponentScaffolder,
+ )
+ from dagster_airbyte.resources import (
+     DEFAULT_POLL_INTERVAL_SECONDS,
+     AirbyteCloudWorkspace,
+     AirbyteWorkspace,
+     BaseAirbyteWorkspace,
+ )
+ from dagster_airbyte.translator import (
+     AirbyteConnection,
+     AirbyteConnectionTableProps,
+     AirbyteMetadataSet,
+     AirbyteWorkspaceData,
+     DagsterAirbyteTranslator,
+ )
+ from dagster_airbyte.utils import DAGSTER_AIRBYTE_TRANSLATOR_METADATA_KEY
+
+
+ class BaseAirbyteWorkspaceModel(dg.Model, dg.Resolvable):
+     request_max_retries: Annotated[
+         int,
+         pydantic.Field(
+             default=3,
+             description=(
+                 "The maximum number of times requests to the Airbyte API should be retried "
+                 "before failing."
+             ),
+         ),
+     ]
+     request_retry_delay: Annotated[
+         float,
+         pydantic.Field(
+             default=0.25,
+             description="Time (in seconds) to wait between each request retry.",
+         ),
+     ]
+     request_timeout: Annotated[
+         int,
+         pydantic.Field(
+             default=15,
+             description="Time (in seconds) after which the requests to Airbyte are declared timed out.",
+         ),
+     ]
+     max_items_per_page: Annotated[
+         int,
+         pydantic.Field(
+             default=100,
+             description=(
+                 "The maximum number of items per page. "
+                 "Used for paginated resources like connections, destinations, etc. "
+             ),
+         ),
+     ]
+     poll_interval: Annotated[
+         float,
+         pydantic.Field(
+             default=DEFAULT_POLL_INTERVAL_SECONDS,
+             description="The time (in seconds) that will be waited between successive polls.",
+         ),
+     ]
+     poll_timeout: Annotated[
+         Optional[float],
+         pydantic.Field(
+             default=None,
+             description=(
+                 "The maximum time that will wait before this operation is timed "
+                 "out. By default, this will never time out."
+             ),
+         ),
+     ]
+     cancel_on_termination: Annotated[
+         bool,
+         pydantic.Field(
+             default=True,
+             description=(
+                 "Whether to cancel a sync in Airbyte if the Dagster runner is terminated. "
+                 "This may be useful to disable if using Airbyte sources that cannot be cancelled and "
+                 "resumed easily, or if your Dagster deployment may experience runner interruptions "
+                 "that do not impact your Airbyte deployment."
+             ),
+         ),
+     ]
+     poll_previous_running_sync: Annotated[
+         bool,
+         pydantic.Field(
+             default=False,
+             description=(
+                 "If set to True, Dagster will check for previous running sync for the same connection "
+                 "and begin polling it instead of starting a new sync."
+             ),
+         ),
+     ]
+
+
+ class AirbyteWorkspaceModel(BaseAirbyteWorkspaceModel):
+     rest_api_base_url: Annotated[
+         str,
+         pydantic.Field(
+             ...,
+             description=(
+                 "The base URL for the Airbyte REST API. "
+                 "For Airbyte Cloud, leave this as the default. "
+                 "For self-managed Airbyte, this is usually <your Airbyte host>/api/public/v1."
+             ),
+             examples=[
+                 "http://localhost:8000/api/public/v1",
+                 "https://my-airbyte-server.com/api/public/v1",
+                 "http://airbyte-airbyte-server-svc.airbyte.svc.cluster.local:8001/api/public/v1",
+             ],
+         ),
+     ]
+     configuration_api_base_url: Annotated[
+         str,
+         pydantic.Field(
+             ...,
+             description=(
+                 "The base URL for the Airbyte Configuration API. "
+                 "For Airbyte Cloud, leave this as the default. "
+                 "For self-managed Airbyte, this is usually <your Airbyte host>/api/v1."
+             ),
+             examples=[
+                 "http://localhost:8000/api/v1",
+                 "https://my-airbyte-server.com/api/v1",
+                 "http://airbyte-airbyte-server-svc.airbyte.svc.cluster.local:8001/api/v1",
+             ],
+         ),
+     ]
+     workspace_id: Annotated[str, pydantic.Field(..., description="The Airbyte workspace ID.")]
+     client_id: Annotated[
+         Optional[str],
+         pydantic.Field(None, description="Client ID used to authenticate to Airbyte."),
+     ]
+     client_secret: Annotated[
+         Optional[str],
+         pydantic.Field(None, description="Client secret used to authenticate to Airbyte."),
+     ]
+     username: Annotated[
+         Optional[str],
+         pydantic.Field(
+             None,
+             description="Username used to authenticate to Airbyte. Used for self-managed Airbyte with basic auth.",
+         ),
+     ]
+     password: Annotated[
+         Optional[str],
+         pydantic.Field(
+             None,
+             description="Password used to authenticate to Airbyte. Used for self-managed Airbyte with basic auth.",
+         ),
+     ]
+
+
+ class AirbyteCloudWorkspaceModel(BaseAirbyteWorkspaceModel):
+     workspace_id: Annotated[str, pydantic.Field(..., description="The Airbyte workspace ID.")]
+     client_id: Annotated[
+         Optional[str],
+         pydantic.Field(..., description="Client ID used to authenticate to Airbyte."),
+     ]
+     client_secret: Annotated[
+         Optional[str],
+         pydantic.Field(..., description="Client secret used to authenticate to Airbyte."),
+     ]
+
+
+ class AirbyteConnectionSelectorByName(dg.Model):
+     by_name: Annotated[
+         Sequence[str],
+         pydantic.Field(..., description="A list of connection names to include in the collection."),
+     ]
+
+
+ class AirbyteConnectionSelectorById(dg.Model):
+     by_id: Annotated[
+         Sequence[str],
+         pydantic.Field(..., description="A list of connection IDs to include in the collection."),
+     ]
+
+
+ def resolve_connection_selector(
+     context: dg.ResolutionContext, model
+ ) -> Optional[Callable[[AirbyteConnection], bool]]:
+     if isinstance(model, str):
+         model = context.resolve_value(model)
+
+     if isinstance(model, AirbyteConnectionSelectorByName):
+         return lambda connection: connection.name in model.by_name
+     elif isinstance(model, AirbyteConnectionSelectorById):
+         return lambda connection: connection.id in model.by_id
+     else:
+         check.failed(f"Unknown connection target type: {type(model)}")
+
+
+ def resolve_airbyte_workspace_type(context: dg.ResolutionContext, model):
+     if isinstance(model, AirbyteWorkspaceModel):
+         return AirbyteWorkspace(**resolve_fields(model, AirbyteWorkspaceModel, context))
+     elif isinstance(model, AirbyteCloudWorkspaceModel):
+         return AirbyteCloudWorkspace(**resolve_fields(model, AirbyteCloudWorkspaceModel, context))
+     else:
+         check.failed(f"Unknown Airbyte workspace type: {type(model)}")
+
+
+ @public
+ @dg.scaffold_with(AirbyteWorkspaceComponentScaffolder)
+ class AirbyteWorkspaceComponent(StateBackedComponent, dg.Model, dg.Resolvable):
+     """Loads Airbyte connections from a given Airbyte workspace as Dagster assets.
+     Materializing these assets will trigger a sync of the Airbyte connection, enabling
+     you to schedule Airbyte syncs using Dagster.
+
+     Example:
+
+     .. code-block:: yaml
+
+         # defs.yaml
+
+         type: dagster_airbyte.AirbyteWorkspaceComponent
+         attributes:
+           workspace:
+             rest_api_base_url: http://localhost:8000/api/public/v1
+             configuration_api_base_url: http://localhost:8000/api/v1
+             workspace_id: your-workspace-id
+             client_id: "{{ env.AIRBYTE_CLIENT_ID }}"
+             client_secret: "{{ env.AIRBYTE_CLIENT_SECRET }}"
+           connection_selector:
+             by_name:
+               - my_postgres_to_snowflake_connection
+               - my_mysql_to_bigquery_connection
+     """
+
+     workspace: Annotated[
+         Union[AirbyteWorkspace, AirbyteCloudWorkspace],
+         dg.Resolver(
+             resolve_airbyte_workspace_type,
+             model_field_type=Union[AirbyteWorkspaceModel, AirbyteCloudWorkspaceModel],
+         ),
+     ]
+     connection_selector: Annotated[
+         Optional[Callable[[AirbyteConnection], bool]],
+         dg.Resolver(
+             resolve_connection_selector,
+             model_field_type=Union[
+                 str, AirbyteConnectionSelectorByName, AirbyteConnectionSelectorById
+             ],
+             description="Function used to select Airbyte connections to pull into Dagster.",
+         ),
+     ] = None
+     translation: Optional[
+         Annotated[
+             TranslationFn[AirbyteConnectionTableProps],
+             TranslationFnResolver(template_vars_for_translation_fn=lambda data: {"props": data}),
+         ]
+     ] = pydantic.Field(
+         default=None,
+         description="Function used to translate Airbyte connection table properties into Dagster asset specs.",
+     )
+     defs_state: ResolvedDefsStateConfig = DefsStateConfigArgs.legacy_code_server_snapshots()
+
+     @property
+     def defs_state_config(self) -> DefsStateConfig:
+         default_key = f"{self.__class__.__name__}[{self.workspace.workspace_id}]"
+         return DefsStateConfig.from_args(self.defs_state, default_key=default_key)
+
+     @cached_property
+     def translator(self) -> DagsterAirbyteTranslator:
+         return AirbyteComponentTranslator(self)
+
+     @cached_property
+     def _base_translator(self) -> DagsterAirbyteTranslator:
+         return DagsterAirbyteTranslator()
+
+     @public
+     def get_asset_spec(self, props: AirbyteConnectionTableProps) -> dg.AssetSpec:
+         """Generates an AssetSpec for a given Airbyte connection table.
+
+         This method can be overridden in a subclass to customize how Airbyte connection tables
+         are converted to Dagster asset specs. By default, it delegates to the configured
+         DagsterAirbyteTranslator.
+
+         Args:
+             props: The AirbyteConnectionTableProps containing information about the connection
+                 and table/stream being synced
+
+         Returns:
+             An AssetSpec that represents the Airbyte connection table as a Dagster asset
+
+         Example:
+             Override this method to add custom metadata to all Airbyte assets:
+
+             .. code-block:: python
+
+                 from dagster_airbyte import AirbyteWorkspaceComponent
+                 import dagster as dg
+
+                 class CustomAirbyteWorkspaceComponent(AirbyteWorkspaceComponent):
+                     def get_asset_spec(self, props):
+                         base_spec = super().get_asset_spec(props)
+                         return base_spec.replace_attributes(
+                             metadata={
+                                 **base_spec.metadata,
+                                 "data_source": "airbyte",
+                                 "connection_id": props.connection_id
+                             }
+                         )
+         """
+         return self._base_translator.get_asset_spec(props)
+
+     @public
+     def execute(
+         self, context: dg.AssetExecutionContext, airbyte: BaseAirbyteWorkspace
+     ) -> Iterable[Union[dg.AssetMaterialization, dg.MaterializeResult]]:
+         """Executes an Airbyte sync for the selected connection.
+
+         This method can be overridden in a subclass to customize the sync execution behavior,
+         such as adding custom logging or handling sync results differently.
+
+         Args:
+             context: The asset execution context provided by Dagster
+             airbyte: The BaseAirbyteWorkspace resource used to trigger and monitor syncs
+
+         Yields:
+             AssetMaterialization or MaterializeResult events from the Airbyte sync
+
+         Example:
+             Override this method to add custom logging during sync execution:
+
+             .. code-block:: python
+
+                 from dagster_airbyte import AirbyteWorkspaceComponent
+                 import dagster as dg
+
+                 class CustomAirbyteWorkspaceComponent(AirbyteWorkspaceComponent):
+                     def execute(self, context, airbyte):
+                         context.log.info(f"Starting Airbyte sync for connection")
+                         yield from super().execute(context, airbyte)
+                         context.log.info("Airbyte sync completed successfully")
+         """
+         yield from airbyte.sync_and_poll(context=context)
+
+     def _load_asset_specs(self, state: AirbyteWorkspaceData) -> Sequence[dg.AssetSpec]:
+         connection_selector_fn = self.connection_selector or (lambda connection: True)
+         return [
+             self.translator.get_asset_spec(props).merge_attributes(
+                 metadata={DAGSTER_AIRBYTE_TRANSLATOR_METADATA_KEY: self.translator}
+             )
+             for props in state.to_airbyte_connection_table_props_data()
+             if connection_selector_fn(state.connections_by_id[props.connection_id])
+         ]
+
+     def _get_airbyte_assets_def(
+         self, connection_name: str, specs: Sequence[dg.AssetSpec]
+     ) -> dg.AssetsDefinition:
+         @dg.multi_asset(
+             name=f"airbyte_{clean_name(connection_name)}",
+             can_subset=True,
+             specs=specs,
+         )
+         def _asset(context: dg.AssetExecutionContext):
+             yield from self.execute(context=context, airbyte=self.workspace)
+
+         return _asset
+
+     async def write_state_to_path(self, state_path: Path) -> None:
+         state = self.workspace.fetch_airbyte_workspace_data()
+         state_path.write_text(dg.serialize_value(state))
+
+     def build_defs_from_state(
+         self, context: dg.ComponentLoadContext, state_path: Optional[Path]
+     ) -> dg.Definitions:
+         if state_path is None:
+             return dg.Definitions()
+         state = deserialize_value(state_path.read_text(), AirbyteWorkspaceData)
+
+         # group specs by their connector names
+         specs_by_connection_name = defaultdict(list)
+         for spec in self._load_asset_specs(state):
+             connection_name = check.not_none(
+                 AirbyteMetadataSet.extract(spec.metadata).connection_name
+             )
+             specs_by_connection_name[connection_name].append(spec)
+
+         # create one assets definition per connection
+         assets = [
+             self._get_airbyte_assets_def(connection_name, specs)
+             for connection_name, specs in specs_by_connection_name.items()
+         ]
+         return dg.Definitions(assets=assets)
+
+
+ # Subclassing to create the alias to be able to use the superseded decorator.
+ @superseded(additional_warn_text="Superseded. Use AirbyteWorkspaceComponent instead.")
+ class AirbyteCloudWorkspaceComponent(AirbyteWorkspaceComponent): ...
+
+
+ class AirbyteComponentTranslator(
+     create_component_translator_cls(AirbyteWorkspaceComponent, DagsterAirbyteTranslator),
+     ComponentTranslator[AirbyteWorkspaceComponent],
+ ):
+     def __init__(self, component: AirbyteWorkspaceComponent):
+         self._component = component
+
+     def get_asset_spec(self, props: AirbyteConnectionTableProps) -> dg.AssetSpec:
+         base_asset_spec = super().get_asset_spec(props)
+         if self.component.translation is None:
+             return base_asset_spec
+         else:
+             return self.component.translation(base_asset_spec, props)
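Editor's note: the `translation` field added above resolves to a callable that `AirbyteComponentTranslator.get_asset_spec` invokes as `translation(base_asset_spec, props)`. The following is a minimal sketch of such a callable (not part of the package); the function name and group name are illustrative placeholders, while the two-argument shape comes directly from the call site above.

    import dagster as dg
    from dagster_airbyte.translator import AirbyteConnectionTableProps


    # Hypothetical translation callable: receives the default asset spec and the
    # connection/table props, and returns the spec to use for that table, mirroring
    # the call `self.component.translation(base_asset_spec, props)` shown above.
    def my_airbyte_translation(
        spec: dg.AssetSpec, props: AirbyteConnectionTableProps
    ) -> dg.AssetSpec:
        # Place every Airbyte-derived asset in an "airbyte" asset group (placeholder choice).
        return spec.replace_attributes(group_name="airbyte")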
@@ -0,0 +1,30 @@
+ from typing import Optional
+
+ from dagster.components.component.component_scaffolder import Scaffolder
+ from dagster.components.component_scaffolding import scaffold_component
+ from dagster.components.scaffold.scaffold import ScaffoldRequest
+ from pydantic import BaseModel
+
+
+ class AirbyteScaffolderParams(BaseModel):
+     workspace_id: Optional[str] = None
+     client_id: Optional[str] = None
+     client_secret: Optional[str] = None
+
+
+ class AirbyteWorkspaceComponentScaffolder(Scaffolder[AirbyteScaffolderParams]):
+     @classmethod
+     def get_scaffold_params(cls) -> type[AirbyteScaffolderParams]:
+         return AirbyteScaffolderParams
+
+     def scaffold(self, request: ScaffoldRequest[AirbyteScaffolderParams]) -> None:
+         scaffold_component(
+             request,
+             {
+                 "workspace": {
+                     "workspace_id": request.params.workspace_id,
+                     "client_id": request.params.client_id,
+                     "client_secret": request.params.client_secret,
+                 }
+             },
+         )
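Editor's note: the `scaffold` method above simply forwards the scaffold request's params into the component's `workspace` attributes. A minimal sketch of that mapping follows (not part of the package); the concrete values are placeholders, and the module path is the one used by the imports in the first file above.

    from dagster_airbyte.components.workspace_component.scaffolder import AirbyteScaffolderParams

    # Hypothetical illustration: the attribute dictionary mirrors the scaffold() body above.
    params = AirbyteScaffolderParams(
        workspace_id="your-workspace-id",
        client_id="your-client-id",
        client_secret="your-client-secret",
    )
    attributes = {
        "workspace": {
            "workspace_id": params.workspace_id,
            "client_id": params.client_id,
            "client_secret": params.client_secret,
        }
    }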