dagster-airbyte 0.24.3__py3-none-any.whl → 0.28.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,54 +1,53 @@
1
1
  import hashlib
2
2
  import inspect
3
3
  import os
4
- import re
5
4
  from abc import abstractmethod
5
+ from collections.abc import Callable, Iterable, Mapping, Sequence
6
6
  from functools import partial
7
7
  from itertools import chain
8
- from typing import (
9
- Any,
10
- Callable,
11
- Dict,
12
- Iterable,
13
- List,
14
- Mapping,
15
- NamedTuple,
16
- Optional,
17
- Sequence,
18
- Set,
19
- Tuple,
20
- Union,
21
- cast,
22
- )
8
+ from typing import Any, NamedTuple, Optional, Union, cast
23
9
 
24
10
  import yaml
25
11
  from dagster import (
12
+ AssetExecutionContext,
26
13
  AssetKey,
27
14
  AssetOut,
28
15
  AutoMaterializePolicy,
29
- FreshnessPolicy,
16
+ LegacyFreshnessPolicy,
30
17
  Nothing,
31
18
  Output,
32
19
  ResourceDefinition,
33
20
  SourceAsset,
34
21
  _check as check,
35
22
  )
36
- from dagster._annotations import deprecated
23
+ from dagster._annotations import beta, hidden_param, only_allow_hidden_params_in_kwargs, superseded
37
24
  from dagster._core.definitions import AssetsDefinition, multi_asset
38
- from dagster._core.definitions.cacheable_assets import (
25
+ from dagster._core.definitions.assets.definition.cacheable_assets_definition import (
39
26
  AssetsDefinitionCacheableData,
40
27
  CacheableAssetsDefinition,
41
28
  )
42
29
  from dagster._core.definitions.events import CoercibleToAssetKey, CoercibleToAssetKeyPrefix
43
- from dagster._core.definitions.metadata import MetadataValue, TableSchemaMetadataValue
30
+ from dagster._core.definitions.metadata.metadata_set import TableMetadataSet
44
31
  from dagster._core.definitions.metadata.table import TableSchema
45
32
  from dagster._core.errors import DagsterInvalidDefinitionError, DagsterInvalidInvocationError
46
33
  from dagster._core.execution.context.init import build_init_resource_context
47
34
  from dagster._utils.merger import merge_dicts
48
35
 
49
- from dagster_airbyte.resources import AirbyteCloudResource, AirbyteResource, BaseAirbyteResource
36
+ from dagster_airbyte.asset_decorator import airbyte_assets
37
+ from dagster_airbyte.legacy_resources import (
38
+ AirbyteCloudResource,
39
+ AirbyteResource,
40
+ BaseAirbyteResource,
41
+ )
42
+ from dagster_airbyte.resources import AirbyteCloudWorkspace, AirbyteWorkspace, BaseAirbyteWorkspace
43
+ from dagster_airbyte.translator import (
44
+ AirbyteConnection,
45
+ AirbyteMetadataSet,
46
+ DagsterAirbyteTranslator,
47
+ )
50
48
  from dagster_airbyte.types import AirbyteTableMetadata
51
49
  from dagster_airbyte.utils import (
50
+ clean_name,
52
51
  generate_materializations,
53
52
  generate_table_schema,
54
53
  is_basic_normalization_operation,
@@ -62,14 +61,18 @@ def _table_to_output_name_fn(table: str) -> str:
62
61
  def _build_airbyte_asset_defn_metadata(
63
62
  connection_id: str,
64
63
  destination_tables: Sequence[str],
64
+ destination_raw_table_names_by_table: Mapping[str, str],
65
+ destination_database: Optional[str],
66
+ destination_schema: Optional[str],
65
67
  table_to_asset_key_fn: Callable[[str], AssetKey],
66
68
  asset_key_prefix: Optional[Sequence[str]] = None,
67
- normalization_tables: Optional[Mapping[str, Set[str]]] = None,
69
+ normalization_tables: Optional[Mapping[str, set[str]]] = None,
70
+ normalization_raw_table_names_by_table: Optional[Mapping[str, str]] = None,
68
71
  upstream_assets: Optional[Iterable[AssetKey]] = None,
69
72
  group_name: Optional[str] = None,
70
73
  io_manager_key: Optional[str] = None,
71
74
  schema_by_table_name: Optional[Mapping[str, TableSchema]] = None,
72
- freshness_policy: Optional[FreshnessPolicy] = None,
75
+ legacy_freshness_policy: Optional[LegacyFreshnessPolicy] = None,
73
76
  auto_materialize_policy: Optional[AutoMaterializePolicy] = None,
74
77
  ) -> AssetsDefinitionCacheableData:
75
78
  asset_key_prefix = (
@@ -93,7 +96,7 @@ def _build_airbyte_asset_defn_metadata(
93
96
  for table in tables
94
97
  }
95
98
 
96
- internal_deps: Dict[str, Set[AssetKey]] = {}
99
+ internal_deps: dict[str, set[AssetKey]] = {}
97
100
 
98
101
  metadata_encodable_normalization_tables = (
99
102
  {k: list(v) for k, v in normalization_tables.items()} if normalization_tables else {}
@@ -112,6 +115,30 @@ def _build_airbyte_asset_defn_metadata(
112
115
  for table in destination_tables:
113
116
  internal_deps[table] = set(upstream_assets or [])
114
117
 
118
+ table_names: dict[str, str] = {}
119
+ for table in destination_tables:
120
+ if destination_database and destination_schema and table:
121
+ # Use the destination raw table name to create the table name
122
+ table_names[table] = ".".join(
123
+ [
124
+ destination_database,
125
+ destination_schema,
126
+ destination_raw_table_names_by_table[table],
127
+ ]
128
+ )
129
+ if normalization_tables and normalization_raw_table_names_by_table:
130
+ for normalization_table in normalization_tables.get(table, set()):
131
+ table_names[normalization_table] = ".".join(
132
+ [
133
+ destination_database,
134
+ destination_schema,
135
+ destination_raw_table_names_by_table[table],
136
+ normalization_raw_table_names_by_table[normalization_table],
137
+ ]
138
+ )
139
+
140
+ schema_by_table_name = schema_by_table_name if schema_by_table_name else {}
141
+
115
142
  return AssetsDefinitionCacheableData(
116
143
  keys_by_input_name=(
117
144
  {asset_key.path[-1]: asset_key for asset_key in upstream_assets}
@@ -125,14 +152,19 @@ def _build_airbyte_asset_defn_metadata(
125
152
  can_subset=False,
126
153
  metadata_by_output_name=(
127
154
  {
128
- table: {"table_schema": MetadataValue.table_schema(schema_by_table_name[table])}
155
+ table: {
156
+ **TableMetadataSet(
157
+ column_schema=schema_by_table_name.get(table),
158
+ table_name=table_names.get(table),
159
+ ),
160
+ }
129
161
  for table in tables
130
162
  }
131
- if schema_by_table_name
132
- else None
133
163
  ),
134
- freshness_policies_by_output_name=(
135
- {output: freshness_policy for output in outputs} if freshness_policy else None
164
+ legacy_freshness_policies_by_output_name=(
165
+ {output: legacy_freshness_policy for output in outputs}
166
+ if legacy_freshness_policy
167
+ else None
136
168
  ),
137
169
  auto_materialize_policies_by_output_name=(
138
170
  {output: auto_materialize_policy for output in outputs}
@@ -153,31 +185,28 @@ def _build_airbyte_assets_from_metadata(
153
185
  assets_defn_meta: AssetsDefinitionCacheableData,
154
186
  resource_defs: Optional[Mapping[str, ResourceDefinition]],
155
187
  ) -> AssetsDefinition:
156
- metadata = cast(Mapping[str, Any], assets_defn_meta.extra_metadata)
157
- connection_id = cast(str, metadata["connection_id"])
158
- group_name = cast(Optional[str], metadata["group_name"])
159
- destination_tables = cast(List[str], metadata["destination_tables"])
160
- normalization_tables = cast(Mapping[str, List[str]], metadata["normalization_tables"])
161
- io_manager_key = cast(Optional[str], metadata["io_manager_key"])
188
+ metadata = cast("Mapping[str, Any]", assets_defn_meta.extra_metadata)
189
+ connection_id = cast("str", metadata["connection_id"])
190
+ group_name = cast("Optional[str]", metadata["group_name"])
191
+ destination_tables = cast("list[str]", metadata["destination_tables"])
192
+ normalization_tables = cast("Mapping[str, list[str]]", metadata["normalization_tables"])
193
+ io_manager_key = cast("Optional[str]", metadata["io_manager_key"])
162
194
 
163
195
  @multi_asset(
164
- name=f"airbyte_sync_{connection_id[:5]}",
196
+ name=f"airbyte_sync_{connection_id.replace('-', '_')}",
165
197
  deps=list((assets_defn_meta.keys_by_input_name or {}).values()),
166
198
  outs={
167
199
  k: AssetOut(
168
200
  key=v,
169
201
  metadata=(
170
- {
171
- k: cast(TableSchemaMetadataValue, v)
172
- for k, v in assets_defn_meta.metadata_by_output_name.get(k, {}).items()
173
- }
202
+ assets_defn_meta.metadata_by_output_name.get(k)
174
203
  if assets_defn_meta.metadata_by_output_name
175
204
  else None
176
205
  ),
177
206
  io_manager_key=io_manager_key,
178
- freshness_policy=(
179
- assets_defn_meta.freshness_policies_by_output_name.get(k)
180
- if assets_defn_meta.freshness_policies_by_output_name
207
+ legacy_freshness_policy=(
208
+ assets_defn_meta.legacy_freshness_policies_by_output_name.get(k)
209
+ if assets_defn_meta.legacy_freshness_policies_by_output_name
181
210
  else None
182
211
  ),
183
212
  dagster_type=Nothing,
@@ -222,18 +251,27 @@ def _build_airbyte_assets_from_metadata(
222
251
  return _assets
223
252
 
224
253
 
254
+ @hidden_param(
255
+ param="legacy_freshness_policy",
256
+ breaking_version="1.13.0",
257
+ )
258
+ @hidden_param(
259
+ param="auto_materialize_policy",
260
+ breaking_version="1.10.0",
261
+ )
225
262
  def build_airbyte_assets(
226
263
  connection_id: str,
227
264
  destination_tables: Sequence[str],
265
+ destination_database: Optional[str] = None,
266
+ destination_schema: Optional[str] = None,
228
267
  asset_key_prefix: Optional[Sequence[str]] = None,
229
268
  group_name: Optional[str] = None,
230
- normalization_tables: Optional[Mapping[str, Set[str]]] = None,
269
+ normalization_tables: Optional[Mapping[str, set[str]]] = None,
231
270
  deps: Optional[Iterable[Union[CoercibleToAssetKey, AssetsDefinition, SourceAsset]]] = None,
232
- upstream_assets: Optional[Set[AssetKey]] = None,
271
+ upstream_assets: Optional[set[AssetKey]] = None,
233
272
  schema_by_table_name: Optional[Mapping[str, TableSchema]] = None,
234
- freshness_policy: Optional[FreshnessPolicy] = None,
235
273
  stream_to_asset_map: Optional[Mapping[str, str]] = None,
236
- auto_materialize_policy: Optional[AutoMaterializePolicy] = None,
274
+ **kwargs,
237
275
  ) -> Sequence[AssetsDefinition]:
238
276
  """Builds a set of assets representing the tables created by an Airbyte sync operation.
239
277
 
@@ -243,6 +281,8 @@ def build_airbyte_assets(
243
281
  destination_tables (List[str]): The names of the tables that you want to be represented
244
282
  in the Dagster asset graph for this sync. This will generally map to the name of the
245
283
  stream in Airbyte, unless a stream prefix has been specified in Airbyte.
284
+ destination_database (Optional[str]): The name of the destination database.
285
+ destination_schema (Optional[str]): The name of the destination schema.
246
286
  normalization_tables (Optional[Mapping[str, List[str]]]): If you are using Airbyte's
247
287
  normalization feature, you may specify a mapping of destination table to a list of
248
288
  derived tables that will be created by the normalization process.
@@ -251,11 +291,13 @@ def build_airbyte_assets(
251
291
  deps (Optional[Sequence[Union[AssetsDefinition, SourceAsset, str, AssetKey]]]):
252
292
  A list of assets to add as sources.
253
293
  upstream_assets (Optional[Set[AssetKey]]): Deprecated, use deps instead. A list of assets to add as sources.
254
- freshness_policy (Optional[FreshnessPolicy]): A freshness policy to apply to the assets
255
294
  stream_to_asset_map (Optional[Mapping[str, str]]): A mapping of an Airbyte stream name to a Dagster asset.
256
295
  This allows the use of the "prefix" setting in Airbyte with special characters that aren't valid asset names.
257
- auto_materialize_policy (Optional[AutoMaterializePolicy]): An auto materialization policy to apply to the assets.
258
296
  """
297
+ only_allow_hidden_params_in_kwargs(build_airbyte_assets, kwargs)
298
+ legacy_freshness_policy = kwargs.get("legacy_freshness_policy")
299
+ auto_materialize_policy = kwargs.get("auto_materialize_policy")
300
+
259
301
  if upstream_assets is not None and deps is not None:
260
302
  raise DagsterInvalidDefinitionError(
261
303
  "Cannot specify both deps and upstream_assets to build_airbyte_assets. Use only deps"
@@ -269,15 +311,36 @@ def build_airbyte_assets(
269
311
  tables = chain.from_iterable(
270
312
  chain([destination_tables], normalization_tables.values() if normalization_tables else [])
271
313
  )
314
+
315
+ table_names: dict[str, str] = {}
316
+ for table in destination_tables:
317
+ if destination_database and destination_schema and table:
318
+ table_names[table] = ".".join([destination_database, destination_schema, table])
319
+ if normalization_tables:
320
+ for normalization_table in normalization_tables.get(table, set()):
321
+ table_names[normalization_table] = ".".join(
322
+ [
323
+ destination_database,
324
+ destination_schema,
325
+ table,
326
+ normalization_table,
327
+ ]
328
+ )
329
+
330
+ schema_by_table_name = schema_by_table_name if schema_by_table_name else {}
331
+
272
332
  outputs = {
273
333
  table: AssetOut(
274
334
  key=AssetKey([*asset_key_prefix, table]),
275
335
  metadata=(
276
- {"table_schema": MetadataValue.table_schema(schema_by_table_name[table])}
277
- if schema_by_table_name
278
- else None
336
+ {
337
+ **TableMetadataSet(
338
+ column_schema=schema_by_table_name.get(table),
339
+ table_name=table_names.get(table),
340
+ ),
341
+ }
279
342
  ),
280
- freshness_policy=freshness_policy,
343
+ legacy_freshness_policy=legacy_freshness_policy,
281
344
  auto_materialize_policy=auto_materialize_policy,
282
345
  )
283
346
  for table in tables
@@ -301,7 +364,7 @@ def build_airbyte_assets(
301
364
  internal_deps[table] = set(upstream_deps) if upstream_deps else set()
302
365
 
303
366
  @multi_asset(
304
- name=f"airbyte_sync_{connection_id[:5]}",
367
+ name=f"airbyte_sync_{connection_id.replace('-', '_')}",
305
368
  deps=upstream_deps,
306
369
  outs=outputs,
307
370
  internal_asset_deps=internal_deps,
@@ -377,7 +440,7 @@ def _get_normalization_tables_for_schema(
377
440
  For more information on Airbyte's normalization process, see:
378
441
  https://docs.airbyte.com/understanding-airbyte/basic-normalization/#nesting
379
442
  """
380
- out: Dict[str, AirbyteTableMetadata] = {}
443
+ out: dict[str, AirbyteTableMetadata] = {}
381
444
  # Object types are broken into a new table, as long as they have children
382
445
 
383
446
  sub_schemas = _get_sub_schemas(schema)
@@ -389,7 +452,7 @@ def _get_normalization_tables_for_schema(
389
452
 
390
453
  if "object" in schema_types and len(sub_schema.get("properties", {})) > 0:
391
454
  out[prefix + key] = AirbyteTableMetadata(
392
- schema=generate_table_schema(sub_schema.get("properties", {}))
455
+ raw_table_name=key, schema=generate_table_schema(sub_schema.get("properties", {}))
393
456
  )
394
457
  for k, v in sub_schema["properties"].items():
395
458
  out = merge_dicts(
@@ -398,7 +461,8 @@ def _get_normalization_tables_for_schema(
398
461
  # Array types are also broken into a new table
399
462
  elif "array" in schema_types:
400
463
  out[prefix + key] = AirbyteTableMetadata(
401
- schema=generate_table_schema(sub_schema.get("items", {}).get("properties", {}))
464
+ raw_table_name=key,
465
+ schema=generate_table_schema(sub_schema.get("items", {}).get("properties", {})),
402
466
  )
403
467
  if sub_schema.get("items", {}).get("properties"):
404
468
  for k, v in sub_schema["items"]["properties"].items():
@@ -409,11 +473,6 @@ def _get_normalization_tables_for_schema(
409
473
  return out
410
474
 
411
475
 
412
- def _clean_name(name: str) -> str:
413
- """Cleans an input to be a valid Dagster asset name."""
414
- return re.sub(r"[^a-z0-9]+", "_", name.lower())
415
-
416
-
417
476
  class AirbyteConnectionMetadata(
418
477
  NamedTuple(
419
478
  "_AirbyteConnectionMetadata",
@@ -421,13 +480,14 @@ class AirbyteConnectionMetadata(
421
480
  ("name", str),
422
481
  ("stream_prefix", str),
423
482
  ("has_basic_normalization", bool),
424
- ("stream_data", List[Mapping[str, Any]]),
483
+ ("stream_data", list[Mapping[str, Any]]),
484
+ ("destination", Mapping[str, Any]),
425
485
  ],
426
486
  )
427
487
  ):
428
488
  """Contains information about an Airbyte connection.
429
489
 
430
- Attributes:
490
+ Args:
431
491
  name (str): The name of the connection.
432
492
  stream_prefix (str): A prefix to add to all stream names.
433
493
  has_basic_normalization (bool): Whether or not the connection has basic normalization enabled.
@@ -436,7 +496,10 @@ class AirbyteConnectionMetadata(
436
496
 
437
497
  @classmethod
438
498
  def from_api_json(
439
- cls, contents: Mapping[str, Any], operations: Mapping[str, Any]
499
+ cls,
500
+ contents: Mapping[str, Any],
501
+ operations: Mapping[str, Any],
502
+ destination: Mapping[str, Any],
440
503
  ) -> "AirbyteConnectionMetadata":
441
504
  return cls(
442
505
  name=contents["name"],
@@ -446,11 +509,14 @@ class AirbyteConnectionMetadata(
446
509
  for op in operations.get("operations", [])
447
510
  ),
448
511
  stream_data=contents.get("syncCatalog", {}).get("streams", []),
512
+ destination=destination,
449
513
  )
450
514
 
451
515
  @classmethod
452
- def from_config(cls, contents: Mapping[str, Any]) -> "AirbyteConnectionMetadata":
453
- config_contents = cast(Mapping[str, Any], contents.get("configuration"))
516
+ def from_config(
517
+ cls, contents: Mapping[str, Any], destination: Mapping[str, Any]
518
+ ) -> "AirbyteConnectionMetadata":
519
+ config_contents = cast("Mapping[str, Any]", contents.get("configuration"))
454
520
  check.invariant(
455
521
  config_contents is not None, "Airbyte connection config is missing 'configuration' key"
456
522
  )
@@ -463,6 +529,7 @@ class AirbyteConnectionMetadata(
463
529
  for op in config_contents.get("operations", [])
464
530
  ),
465
531
  stream_data=config_contents.get("sync_catalog", {}).get("streams", []),
532
+ destination=destination,
466
533
  )
467
534
 
468
535
  def parse_stream_tables(
@@ -472,14 +539,14 @@ class AirbyteConnectionMetadata(
472
539
  tables associated with each enabled stream and values representing any affiliated
473
540
  tables created by Airbyte's normalization process, if enabled.
474
541
  """
475
- tables: Dict[str, AirbyteTableMetadata] = {}
542
+ tables: dict[str, AirbyteTableMetadata] = {}
476
543
 
477
544
  enabled_streams = [
478
545
  stream for stream in self.stream_data if stream.get("config", {}).get("selected", False)
479
546
  ]
480
547
 
481
548
  for stream in enabled_streams:
482
- name = cast(str, stream.get("stream", {}).get("name"))
549
+ name = cast("str", stream.get("stream", {}).get("name"))
483
550
  prefixed_name = f"{self.stream_prefix}{name}"
484
551
 
485
552
  schema = (
@@ -487,7 +554,7 @@ class AirbyteConnectionMetadata(
487
554
  if "json_schema" in stream["stream"]
488
555
  else stream["stream"]["jsonSchema"]
489
556
  )
490
- normalization_tables: Dict[str, AirbyteTableMetadata] = {}
557
+ normalization_tables: dict[str, AirbyteTableMetadata] = {}
491
558
  schema_props = schema.get("properties", schema.get("items", {}).get("properties", {}))
492
559
  if self.has_basic_normalization and return_normalization_tables:
493
560
  for k, v in schema_props.items():
@@ -497,6 +564,7 @@ class AirbyteConnectionMetadata(
497
564
  prefixed_norm_table_name = f"{self.stream_prefix}{normalization_table_name}"
498
565
  normalization_tables[prefixed_norm_table_name] = meta
499
566
  tables[prefixed_name] = AirbyteTableMetadata(
567
+ raw_table_name=name,
500
568
  schema=generate_table_schema(schema_props),
501
569
  normalization_tables=normalization_tables,
502
570
  )
@@ -514,7 +582,7 @@ def _get_schema_by_table_name(
514
582
  [
515
583
  (k, v.schema)
516
584
  for k, v in cast(
517
- Dict[str, AirbyteTableMetadata], meta.normalization_tables
585
+ "dict[str, AirbyteTableMetadata]", meta.normalization_tables
518
586
  ).items()
519
587
  ]
520
588
  for meta in stream_table_metadata.values()
@@ -535,7 +603,7 @@ class AirbyteCoreCacheableAssetsDefinition(CacheableAssetsDefinition):
535
603
  connection_filter: Optional[Callable[[AirbyteConnectionMetadata], bool]],
536
604
  connection_to_asset_key_fn: Optional[Callable[[AirbyteConnectionMetadata, str], AssetKey]],
537
605
  connection_to_freshness_policy_fn: Optional[
538
- Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]
606
+ Callable[[AirbyteConnectionMetadata], Optional[LegacyFreshnessPolicy]]
539
607
  ],
540
608
  connection_to_auto_materialize_policy_fn: Optional[
541
609
  Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]
@@ -565,25 +633,46 @@ class AirbyteCoreCacheableAssetsDefinition(CacheableAssetsDefinition):
565
633
  super().__init__(unique_id=f"airbyte-{contents.hexdigest()}")
566
634
 
567
635
  @abstractmethod
568
- def _get_connections(self) -> Sequence[Tuple[str, AirbyteConnectionMetadata]]:
636
+ def _get_connections(self) -> Sequence[tuple[str, AirbyteConnectionMetadata]]:
569
637
  pass
570
638
 
571
639
  def compute_cacheable_data(self) -> Sequence[AssetsDefinitionCacheableData]:
572
- asset_defn_data: List[AssetsDefinitionCacheableData] = []
640
+ asset_defn_data: list[AssetsDefinitionCacheableData] = []
573
641
  for connection_id, connection in self._get_connections():
574
642
  stream_table_metadata = connection.parse_stream_tables(
575
643
  self._create_assets_for_normalization_tables
576
644
  )
577
645
  schema_by_table_name = _get_schema_by_table_name(stream_table_metadata)
578
646
 
647
+ destination_database = connection.destination.get("configuration", {}).get("database")
648
+ destination_schema = connection.destination.get("configuration", {}).get("schema")
649
+
579
650
  table_to_asset_key = partial(self._connection_to_asset_key_fn, connection)
651
+
652
+ destination_tables = list(stream_table_metadata.keys())
653
+ destination_raw_table_names_by_table = {
654
+ table: metadata.raw_table_name for table, metadata in stream_table_metadata.items()
655
+ }
656
+ normalization_tables = {
657
+ table: set(metadata.normalization_tables.keys())
658
+ for table, metadata in stream_table_metadata.items()
659
+ }
660
+ normalization_raw_table_names_by_table = {
661
+ normalization_table: metadata.normalization_tables[
662
+ normalization_table
663
+ ].raw_table_name
664
+ for table, metadata in stream_table_metadata.items()
665
+ for normalization_table in normalization_tables[table]
666
+ }
667
+
580
668
  asset_data_for_conn = _build_airbyte_asset_defn_metadata(
581
669
  connection_id=connection_id,
582
- destination_tables=list(stream_table_metadata.keys()),
583
- normalization_tables={
584
- table: set(metadata.normalization_tables.keys())
585
- for table, metadata in stream_table_metadata.items()
586
- },
670
+ destination_tables=destination_tables,
671
+ destination_raw_table_names_by_table=destination_raw_table_names_by_table,
672
+ destination_database=destination_database,
673
+ destination_schema=destination_schema,
674
+ normalization_tables=normalization_tables,
675
+ normalization_raw_table_names_by_table=normalization_raw_table_names_by_table,
587
676
  asset_key_prefix=self._key_prefix,
588
677
  group_name=(
589
678
  self._connection_meta_to_group_fn(connection)
@@ -597,7 +686,7 @@ class AirbyteCoreCacheableAssetsDefinition(CacheableAssetsDefinition):
597
686
  ),
598
687
  schema_by_table_name=schema_by_table_name,
599
688
  table_to_asset_key_fn=table_to_asset_key,
600
- freshness_policy=self._connection_to_freshness_policy_fn(connection),
689
+ legacy_freshness_policy=self._connection_to_freshness_policy_fn(connection),
601
690
  auto_materialize_policy=self._connection_to_auto_materialize_policy_fn(connection),
602
691
  )
603
692
 
@@ -630,7 +719,7 @@ class AirbyteInstanceCacheableAssetsDefinition(AirbyteCoreCacheableAssetsDefinit
630
719
  connection_filter: Optional[Callable[[AirbyteConnectionMetadata], bool]],
631
720
  connection_to_asset_key_fn: Optional[Callable[[AirbyteConnectionMetadata, str], AssetKey]],
632
721
  connection_to_freshness_policy_fn: Optional[
633
- Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]
722
+ Callable[[AirbyteConnectionMetadata], Optional[LegacyFreshnessPolicy]]
634
723
  ],
635
724
  connection_to_auto_materialize_policy_fn: Optional[
636
725
  Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]
@@ -662,11 +751,11 @@ class AirbyteInstanceCacheableAssetsDefinition(AirbyteCoreCacheableAssetsDefinit
662
751
  )
663
752
  self._airbyte_instance: AirbyteResource = self._partially_initialized_airbyte_instance
664
753
 
665
- def _get_connections(self) -> Sequence[Tuple[str, AirbyteConnectionMetadata]]:
754
+ def _get_connections(self) -> Sequence[tuple[str, AirbyteConnectionMetadata]]:
666
755
  workspace_id = self._workspace_id
667
756
  if not workspace_id:
668
757
  workspaces = cast(
669
- List[Dict[str, Any]],
758
+ "list[dict[str, Any]]",
670
759
  check.not_none(
671
760
  self._airbyte_instance.make_request(endpoint="/workspaces/list", data={})
672
761
  ).get("workspaces", []),
@@ -678,7 +767,7 @@ class AirbyteInstanceCacheableAssetsDefinition(AirbyteCoreCacheableAssetsDefinit
678
767
  workspace_id = workspaces[0].get("workspaceId")
679
768
 
680
769
  connections = cast(
681
- List[Dict[str, Any]],
770
+ "list[dict[str, Any]]",
682
771
  check.not_none(
683
772
  self._airbyte_instance.make_request(
684
773
  endpoint="/connections/list", data={"workspaceId": workspace_id}
@@ -686,12 +775,12 @@ class AirbyteInstanceCacheableAssetsDefinition(AirbyteCoreCacheableAssetsDefinit
686
775
  ).get("connections", []),
687
776
  )
688
777
 
689
- output_connections: List[Tuple[str, AirbyteConnectionMetadata]] = []
778
+ output_connections: list[tuple[str, AirbyteConnectionMetadata]] = []
690
779
  for connection_json in connections:
691
- connection_id = cast(str, connection_json.get("connectionId"))
780
+ connection_id = cast("str", connection_json.get("connectionId"))
692
781
 
693
782
  operations_json = cast(
694
- Dict[str, Any],
783
+ "dict[str, Any]",
695
784
  check.not_none(
696
785
  self._airbyte_instance.make_request(
697
786
  endpoint="/operations/list",
@@ -699,7 +788,21 @@ class AirbyteInstanceCacheableAssetsDefinition(AirbyteCoreCacheableAssetsDefinit
699
788
  )
700
789
  ),
701
790
  )
702
- connection = AirbyteConnectionMetadata.from_api_json(connection_json, operations_json)
791
+
792
+ destination_id = cast("str", connection_json.get("destinationId"))
793
+ destination_json = cast(
794
+ "dict[str, Any]",
795
+ check.not_none(
796
+ self._airbyte_instance.make_request(
797
+ endpoint="/destinations/get",
798
+ data={"destinationId": destination_id},
799
+ )
800
+ ),
801
+ )
802
+
803
+ connection = AirbyteConnectionMetadata.from_api_json(
804
+ connection_json, operations_json, destination_json
805
+ )
703
806
 
704
807
  # Filter out connections that don't match the filter function
705
808
  if self._connection_filter and not self._connection_filter(connection):
@@ -730,7 +833,7 @@ class AirbyteYAMLCacheableAssetsDefinition(AirbyteCoreCacheableAssetsDefinition)
730
833
  connection_directories: Optional[Sequence[str]],
731
834
  connection_to_asset_key_fn: Optional[Callable[[AirbyteConnectionMetadata, str], AssetKey]],
732
835
  connection_to_freshness_policy_fn: Optional[
733
- Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]
836
+ Callable[[AirbyteConnectionMetadata], Optional[LegacyFreshnessPolicy]]
734
837
  ],
735
838
  connection_to_auto_materialize_policy_fn: Optional[
736
839
  Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]
@@ -750,16 +853,26 @@ class AirbyteYAMLCacheableAssetsDefinition(AirbyteCoreCacheableAssetsDefinition)
750
853
  self._project_dir = project_dir
751
854
  self._connection_directories = connection_directories
752
855
 
753
- def _get_connections(self) -> Sequence[Tuple[str, AirbyteConnectionMetadata]]:
856
+ def _get_connections(self) -> Sequence[tuple[str, AirbyteConnectionMetadata]]:
754
857
  connections_dir = os.path.join(self._project_dir, "connections")
755
858
 
756
- output_connections: List[Tuple[str, AirbyteConnectionMetadata]] = []
859
+ output_connections: list[tuple[str, AirbyteConnectionMetadata]] = []
757
860
 
758
861
  connection_directories = self._connection_directories or os.listdir(connections_dir)
759
862
  for connection_name in connection_directories:
760
863
  connection_dir = os.path.join(connections_dir, connection_name)
761
864
  with open(os.path.join(connection_dir, "configuration.yaml"), encoding="utf-8") as f:
762
- connection = AirbyteConnectionMetadata.from_config(yaml.safe_load(f.read()))
865
+ connection_data = yaml.safe_load(f.read())
866
+
867
+ destination_configuration_path = cast(
868
+ "str", connection_data.get("destination_configuration_path")
869
+ )
870
+ with open(
871
+ os.path.join(self._project_dir, destination_configuration_path), encoding="utf-8"
872
+ ) as f:
873
+ destination_data = yaml.safe_load(f.read())
874
+
875
+ connection = AirbyteConnectionMetadata.from_config(connection_data, destination_data)
763
876
 
764
877
  # Filter out connections that don't match the filter function
765
878
  if self._connection_filter and not self._connection_filter(connection):
@@ -788,7 +901,7 @@ class AirbyteYAMLCacheableAssetsDefinition(AirbyteCoreCacheableAssetsDefinition)
788
901
  )
789
902
  state_file = state_files[0]
790
903
 
791
- with open(os.path.join(connection_dir, cast(str, state_file)), encoding="utf-8") as f:
904
+ with open(os.path.join(connection_dir, cast("str", state_file)), encoding="utf-8") as f:
792
905
  state = yaml.safe_load(f.read())
793
906
  connection_id = state.get("resource_id")
794
907
 
@@ -796,12 +909,17 @@ class AirbyteYAMLCacheableAssetsDefinition(AirbyteCoreCacheableAssetsDefinition)
796
909
  return output_connections
797
910
 
798
911
 
912
+ @superseded(
913
+ additional_warn_text=(
914
+ "If you are using Airbyte 1.6.0 or higher, please see the migration guide: https://docs.dagster.io/integrations/libraries/airbyte/migration-guide"
915
+ )
916
+ )
799
917
  def load_assets_from_airbyte_instance(
800
918
  airbyte: Union[AirbyteResource, ResourceDefinition],
801
919
  workspace_id: Optional[str] = None,
802
920
  key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,
803
921
  create_assets_for_normalization_tables: bool = True,
804
- connection_to_group_fn: Optional[Callable[[str], Optional[str]]] = _clean_name,
922
+ connection_to_group_fn: Optional[Callable[[str], Optional[str]]] = clean_name,
805
923
  connection_meta_to_group_fn: Optional[
806
924
  Callable[[AirbyteConnectionMetadata], Optional[str]]
807
925
  ] = None,
@@ -812,7 +930,7 @@ def load_assets_from_airbyte_instance(
812
930
  Callable[[AirbyteConnectionMetadata, str], AssetKey]
813
931
  ] = None,
814
932
  connection_to_freshness_policy_fn: Optional[
815
- Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]
933
+ Callable[[AirbyteConnectionMetadata], Optional[LegacyFreshnessPolicy]]
816
934
  ] = None,
817
935
  connection_to_auto_materialize_policy_fn: Optional[
818
936
  Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]
@@ -906,7 +1024,7 @@ def load_assets_from_airbyte_instance(
906
1024
  check.invariant(
907
1025
  not connection_meta_to_group_fn
908
1026
  or not connection_to_group_fn
909
- or connection_to_group_fn == _clean_name,
1027
+ or connection_to_group_fn == clean_name,
910
1028
  "Cannot specify both connection_meta_to_group_fn and connection_to_group_fn",
911
1029
  )
912
1030
 
@@ -927,127 +1045,141 @@ def load_assets_from_airbyte_instance(
927
1045
  )
928
1046
 
929
1047
 
930
- @deprecated(
931
- breaking_version="1.9",
932
- additional_warn_text="The Airbyte Octavia CLI has been deprecated. Consider using load_assets_from_airbyte_instance instead.",
933
- )
934
- def load_assets_from_airbyte_project(
935
- project_dir: str,
936
- workspace_id: Optional[str] = None,
937
- key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,
938
- create_assets_for_normalization_tables: bool = True,
939
- connection_to_group_fn: Optional[Callable[[str], Optional[str]]] = _clean_name,
940
- connection_meta_to_group_fn: Optional[
941
- Callable[[AirbyteConnectionMetadata], Optional[str]]
942
- ] = None,
943
- io_manager_key: Optional[str] = None,
944
- connection_to_io_manager_key_fn: Optional[Callable[[str], Optional[str]]] = None,
945
- connection_filter: Optional[Callable[[AirbyteConnectionMetadata], bool]] = None,
946
- connection_directories: Optional[Sequence[str]] = None,
947
- connection_to_asset_key_fn: Optional[
948
- Callable[[AirbyteConnectionMetadata, str], AssetKey]
949
- ] = None,
950
- connection_to_freshness_policy_fn: Optional[
951
- Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]
952
- ] = None,
953
- connection_to_auto_materialize_policy_fn: Optional[
954
- Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]
955
- ] = None,
956
- ) -> CacheableAssetsDefinition:
957
- """Loads an Airbyte project into a set of Dagster assets.
1048
+ # -----------------------
1049
+ # Reworked assets factory
1050
+ # -----------------------
958
1051
 
959
- Point to the root folder of an Airbyte project synced using the Octavia CLI. For
960
- more information, see https://airbyte.com/tutorials/version-control-airbyte-configurations.
1052
+
1053
+ @beta
1054
+ def build_airbyte_assets_definitions(
1055
+ *,
1056
+ workspace: Union[AirbyteWorkspace, AirbyteCloudWorkspace],
1057
+ dagster_airbyte_translator: Optional[DagsterAirbyteTranslator] = None,
1058
+ connection_selector_fn: Optional[Callable[[AirbyteConnection], bool]] = None,
1059
+ ) -> Sequence[AssetsDefinition]:
1060
+ """The list of AssetsDefinition for all connections in the Airbyte workspace.
961
1061
 
962
1062
  Args:
963
- project_dir (str): The path to the root of your Airbyte project, containing sources, destinations,
964
- and connections folders.
965
- workspace_id (Optional[str]): The ID of the Airbyte workspace to load connections from. Only
966
- required if multiple workspace state YAMLfiles exist in the project.
967
- key_prefix (Optional[CoercibleToAssetKeyPrefix]): A prefix for the asset keys created.
968
- create_assets_for_normalization_tables (bool): If True, assets will be created for tables
969
- created by Airbyte's normalization feature. If False, only the destination tables
970
- will be created. Defaults to True.
971
- connection_to_group_fn (Optional[Callable[[str], Optional[str]]]): Function which returns an asset
972
- group name for a given Airbyte connection name. If None, no groups will be created. Defaults
973
- to a basic sanitization function.
974
- connection_meta_to_group_fn (Optional[Callable[[AirbyteConnectionMetadata], Optional[str]]]): Function
975
- which returns an asset group name for a given Airbyte connection metadata. If None and connection_to_group_fn
976
- is None, no groups will be created. Defaults to None.
977
- io_manager_key (Optional[str]): The I/O manager key to use for all assets. Defaults to "io_manager".
978
- Use this if all assets should be loaded from the same source, otherwise use connection_to_io_manager_key_fn.
979
- connection_to_io_manager_key_fn (Optional[Callable[[str], Optional[str]]]): Function which returns an
980
- I/O manager key for a given Airbyte connection name. When other ops are downstream of the loaded assets,
981
- the IOManager specified determines how the inputs to those ops are loaded. Defaults to "io_manager".
982
- connection_filter (Optional[Callable[[AirbyteConnectionMetadata], bool]]): Optional function which
983
- takes in connection metadata and returns False if the connection should be excluded from the output assets.
984
- connection_directories (Optional[List[str]]): Optional list of connection directories to load assets from.
985
- If omitted, all connections in the Airbyte project are loaded. May be faster than connection_filter
986
- if the project has many connections or if the connection yaml files are large.
987
- connection_to_asset_key_fn (Optional[Callable[[AirbyteConnectionMetadata, str], AssetKey]]): Optional function which
988
- takes in connection metadata and table name and returns an asset key for the table. If None, the default asset
989
- key is based on the table name. Any asset key prefix will be applied to the output of this function.
990
- connection_to_freshness_policy_fn (Optional[Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]]):
991
- Optional function which takes in connection metadata and returns a freshness policy for the connection's assets.
992
- If None, no freshness policies will be applied to the assets.
993
- connection_to_auto_materialize_policy_fn (Optional[Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]]):
994
- Optional function which takes in connection metadata and returns an auto materialization policy for the connection's assets.
995
- If None, no auto materialization policies will be applied to the assets.
1063
+ workspace (Union[AirbyteWorkspace, AirbyteCloudWorkspace]): The Airbyte workspace to fetch assets from.
1064
+ dagster_airbyte_translator (Optional[DagsterAirbyteTranslator], optional): The translator to use
1065
+ to convert Airbyte content into :py:class:`dagster.AssetSpec`.
1066
+ Defaults to :py:class:`DagsterAirbyteTranslator`.
1067
+ connection_selector_fn (Optional[Callable[[AirbyteConnection], bool]]): A function that allows for filtering
1068
+ the Airbyte connections for which assets are created.
996
1069
 
997
- **Examples:**
1070
+ Returns:
1071
+ List[AssetsDefinition]: The list of AssetsDefinition for all connections in the Airbyte workspace.
998
1072
 
999
- Loading all Airbyte connections as assets:
1073
+ Examples:
1074
+ Sync the tables of an Airbyte connection:
1000
1075
 
1001
- .. code-block:: python
1076
+ .. code-block:: python
1002
1077
 
1003
- from dagster_airbyte import load_assets_from_airbyte_project
1078
+ from dagster_airbyte import AirbyteCloudWorkspace, build_airbyte_assets_definitions
1004
1079
 
1005
- airbyte_assets = load_assets_from_airbyte_project(
1006
- project_dir="path/to/airbyte/project",
1007
- )
1080
+ import dagster as dg
1008
1081
 
1009
- Filtering the set of loaded connections:
1082
+ airbyte_workspace = AirbyteCloudWorkspace(
1083
+ workspace_id=dg.EnvVar("AIRBYTE_CLOUD_WORKSPACE_ID"),
1084
+ client_id=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_ID"),
1085
+ client_secret=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_SECRET"),
1086
+ )
1010
1087
 
1011
- .. code-block:: python
1088
+ airbyte_assets = build_airbyte_assets_definitions(workspace=airbyte_workspace)
1012
1089
 
1013
- from dagster_airbyte import load_assets_from_airbyte_project
1090
+ defs = dg.Definitions(
1091
+ assets=airbyte_assets,
1092
+ resources={"airbyte": airbyte_workspace},
1093
+ )
1014
1094
 
1015
- airbyte_assets = load_assets_from_airbyte_project(
1016
- project_dir="path/to/airbyte/project",
1017
- connection_filter=lambda meta: "snowflake" in meta.name,
1018
- )
1095
+ Sync the tables of an Airbyte connection with a custom translator:
1096
+
1097
+ .. code-block:: python
1098
+
1099
+ from dagster_airbyte import (
1100
+ DagsterAirbyteTranslator,
1101
+ AirbyteConnectionTableProps,
1102
+ AirbyteCloudWorkspace,
1103
+ build_airbyte_assets_definitions
1104
+ )
1105
+
1106
+ import dagster as dg
1107
+
1108
+ class CustomDagsterAirbyteTranslator(DagsterAirbyteTranslator):
1109
+ def get_asset_spec(self, props: AirbyteConnectionTableProps) -> dg.AssetSpec:
1110
+ default_spec = super().get_asset_spec(props)
1111
+ return default_spec.merge_attributes(
1112
+ metadata={"custom": "metadata"},
1113
+ )
1114
+
1115
+ airbyte_workspace = AirbyteCloudWorkspace(
1116
+ workspace_id=dg.EnvVar("AIRBYTE_CLOUD_WORKSPACE_ID"),
1117
+ client_id=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_ID"),
1118
+ client_secret=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_SECRET"),
1119
+ )
1120
+
1121
+ airbyte_assets = build_airbyte_assets_definitions(
1122
+ workspace=airbyte_workspace,
1123
+ dagster_airbyte_translator=CustomDagsterAirbyteTranslator()
1124
+ )
1125
+
1126
+ defs = dg.Definitions(
1127
+ assets=airbyte_assets,
1128
+ resources={"airbyte": airbyte_workspace},
1129
+ )
1130
+
1131
+ Filter connections by name:
1132
+
1133
+ .. code-block:: python
1134
+
1135
+ from dagster_airbyte import AirbyteCloudWorkspace, build_airbyte_assets_definitions
1136
+
1137
+ import dagster as dg
1138
+
1139
+ airbyte_workspace = AirbyteCloudWorkspace(
1140
+ workspace_id=dg.EnvVar("AIRBYTE_CLOUD_WORKSPACE_ID"),
1141
+ client_id=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_ID"),
1142
+ client_secret=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_SECRET"),
1143
+ )
1144
+
1145
+ airbyte_assets = build_airbyte_assets_definitions(
1146
+ workspace=airbyte_workspace,
1147
+ connection_selector_fn=lambda connection: connection.name in ["connection1", "connection2"]
1148
+ )
1149
+
1150
+ defs = dg.Definitions(
1151
+ assets=airbyte_assets,
1152
+ resources={"airbyte": airbyte_workspace},
1153
+ )
1019
1154
  """
1020
- if isinstance(key_prefix, str):
1021
- key_prefix = [key_prefix]
1022
- key_prefix = check.list_param(key_prefix or [], "key_prefix", of_type=str)
1155
+ dagster_airbyte_translator = dagster_airbyte_translator or DagsterAirbyteTranslator()
1156
+ connection_selector_fn = connection_selector_fn or (lambda connection: True)
1023
1157
 
1024
- check.invariant(
1025
- not io_manager_key or not connection_to_io_manager_key_fn,
1026
- "Cannot specify both io_manager_key and connection_to_io_manager_key_fn",
1158
+ all_asset_specs = workspace.load_asset_specs(
1159
+ dagster_airbyte_translator=dagster_airbyte_translator,
1160
+ connection_selector_fn=connection_selector_fn,
1027
1161
  )
1028
- if not connection_to_io_manager_key_fn:
1029
- connection_to_io_manager_key_fn = lambda _: io_manager_key
1030
1162
 
1031
- check.invariant(
1032
- not connection_meta_to_group_fn
1033
- or not connection_to_group_fn
1034
- or connection_to_group_fn == _clean_name,
1035
- "Cannot specify both connection_meta_to_group_fn and connection_to_group_fn",
1036
- )
1163
+ connections = {
1164
+ (
1165
+ check.not_none(AirbyteMetadataSet.extract(spec.metadata).connection_id),
1166
+ check.not_none(AirbyteMetadataSet.extract(spec.metadata).connection_name),
1167
+ )
1168
+ for spec in all_asset_specs
1169
+ }
1037
1170
 
1038
- if not connection_meta_to_group_fn and connection_to_group_fn:
1039
- connection_meta_to_group_fn = lambda meta: connection_to_group_fn(meta.name)
1171
+ _asset_fns = []
1172
+ for connection_id, connection_name in connections:
1040
1173
 
1041
- return AirbyteYAMLCacheableAssetsDefinition(
1042
- project_dir=project_dir,
1043
- workspace_id=workspace_id,
1044
- key_prefix=key_prefix,
1045
- create_assets_for_normalization_tables=create_assets_for_normalization_tables,
1046
- connection_meta_to_group_fn=connection_meta_to_group_fn,
1047
- connection_to_io_manager_key_fn=connection_to_io_manager_key_fn,
1048
- connection_filter=connection_filter,
1049
- connection_directories=connection_directories,
1050
- connection_to_asset_key_fn=connection_to_asset_key_fn,
1051
- connection_to_freshness_policy_fn=connection_to_freshness_policy_fn,
1052
- connection_to_auto_materialize_policy_fn=connection_to_auto_materialize_policy_fn,
1053
- )
1174
+ @airbyte_assets(
1175
+ connection_id=connection_id,
1176
+ workspace=workspace,
1177
+ name=f"airbyte_{clean_name(connection_name)}",
1178
+ dagster_airbyte_translator=dagster_airbyte_translator,
1179
+ )
1180
+ def _asset_fn(context: AssetExecutionContext, airbyte: BaseAirbyteWorkspace):
1181
+ yield from airbyte.sync_and_poll(context=context)
1182
+
1183
+ _asset_fns.append(_asset_fn)
1184
+
1185
+ return _asset_fns