dagster-airbyte 0.23.7__py3-none-any.whl → 0.25.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dagster-airbyte might be problematic. Click here for more details.

@@ -1,28 +1,15 @@
1
1
  import hashlib
2
2
  import inspect
3
3
  import os
4
- import re
5
4
  from abc import abstractmethod
5
+ from collections.abc import Iterable, Mapping, Sequence
6
6
  from functools import partial
7
7
  from itertools import chain
8
- from typing import (
9
- Any,
10
- Callable,
11
- Dict,
12
- Iterable,
13
- List,
14
- Mapping,
15
- NamedTuple,
16
- Optional,
17
- Sequence,
18
- Set,
19
- Tuple,
20
- Union,
21
- cast,
22
- )
8
+ from typing import Any, Callable, NamedTuple, Optional, Union, cast
23
9
 
24
10
  import yaml
25
11
  from dagster import (
12
+ AssetExecutionContext,
26
13
  AssetKey,
27
14
  AssetOut,
28
15
  AutoMaterializePolicy,
@@ -33,21 +20,30 @@ from dagster import (
33
20
  SourceAsset,
34
21
  _check as check,
35
22
  )
23
+ from dagster._annotations import experimental
36
24
  from dagster._core.definitions import AssetsDefinition, multi_asset
37
25
  from dagster._core.definitions.cacheable_assets import (
38
26
  AssetsDefinitionCacheableData,
39
27
  CacheableAssetsDefinition,
40
28
  )
41
29
  from dagster._core.definitions.events import CoercibleToAssetKey, CoercibleToAssetKeyPrefix
42
- from dagster._core.definitions.metadata import MetadataValue, TableSchemaMetadataValue
30
+ from dagster._core.definitions.metadata.metadata_set import TableMetadataSet
43
31
  from dagster._core.definitions.metadata.table import TableSchema
44
32
  from dagster._core.errors import DagsterInvalidDefinitionError, DagsterInvalidInvocationError
45
33
  from dagster._core.execution.context.init import build_init_resource_context
46
34
  from dagster._utils.merger import merge_dicts
47
35
 
48
- from dagster_airbyte.resources import AirbyteCloudResource, AirbyteResource, BaseAirbyteResource
36
+ from dagster_airbyte.asset_decorator import airbyte_assets
37
+ from dagster_airbyte.resources import (
38
+ AirbyteCloudResource,
39
+ AirbyteCloudWorkspace,
40
+ AirbyteResource,
41
+ BaseAirbyteResource,
42
+ )
43
+ from dagster_airbyte.translator import AirbyteMetadataSet, DagsterAirbyteTranslator
49
44
  from dagster_airbyte.types import AirbyteTableMetadata
50
45
  from dagster_airbyte.utils import (
46
+ clean_name,
51
47
  generate_materializations,
52
48
  generate_table_schema,
53
49
  is_basic_normalization_operation,
@@ -61,9 +57,13 @@ def _table_to_output_name_fn(table: str) -> str:
61
57
  def _build_airbyte_asset_defn_metadata(
62
58
  connection_id: str,
63
59
  destination_tables: Sequence[str],
60
+ destination_raw_table_names_by_table: Mapping[str, str],
61
+ destination_database: Optional[str],
62
+ destination_schema: Optional[str],
64
63
  table_to_asset_key_fn: Callable[[str], AssetKey],
65
64
  asset_key_prefix: Optional[Sequence[str]] = None,
66
- normalization_tables: Optional[Mapping[str, Set[str]]] = None,
65
+ normalization_tables: Optional[Mapping[str, set[str]]] = None,
66
+ normalization_raw_table_names_by_table: Optional[Mapping[str, str]] = None,
67
67
  upstream_assets: Optional[Iterable[AssetKey]] = None,
68
68
  group_name: Optional[str] = None,
69
69
  io_manager_key: Optional[str] = None,
@@ -92,7 +92,7 @@ def _build_airbyte_asset_defn_metadata(
92
92
  for table in tables
93
93
  }
94
94
 
95
- internal_deps: Dict[str, Set[AssetKey]] = {}
95
+ internal_deps: dict[str, set[AssetKey]] = {}
96
96
 
97
97
  metadata_encodable_normalization_tables = (
98
98
  {k: list(v) for k, v in normalization_tables.items()} if normalization_tables else {}
@@ -111,6 +111,30 @@ def _build_airbyte_asset_defn_metadata(
111
111
  for table in destination_tables:
112
112
  internal_deps[table] = set(upstream_assets or [])
113
113
 
114
+ table_names: dict[str, str] = {}
115
+ for table in destination_tables:
116
+ if destination_database and destination_schema and table:
117
+ # Use the destination raw table name to create the table name
118
+ table_names[table] = ".".join(
119
+ [
120
+ destination_database,
121
+ destination_schema,
122
+ destination_raw_table_names_by_table[table],
123
+ ]
124
+ )
125
+ if normalization_tables and normalization_raw_table_names_by_table:
126
+ for normalization_table in normalization_tables.get(table, set()):
127
+ table_names[normalization_table] = ".".join(
128
+ [
129
+ destination_database,
130
+ destination_schema,
131
+ destination_raw_table_names_by_table[table],
132
+ normalization_raw_table_names_by_table[normalization_table],
133
+ ]
134
+ )
135
+
136
+ schema_by_table_name = schema_by_table_name if schema_by_table_name else {}
137
+
114
138
  return AssetsDefinitionCacheableData(
115
139
  keys_by_input_name=(
116
140
  {asset_key.path[-1]: asset_key for asset_key in upstream_assets}
@@ -124,11 +148,14 @@ def _build_airbyte_asset_defn_metadata(
124
148
  can_subset=False,
125
149
  metadata_by_output_name=(
126
150
  {
127
- table: {"table_schema": MetadataValue.table_schema(schema_by_table_name[table])}
151
+ table: {
152
+ **TableMetadataSet(
153
+ column_schema=schema_by_table_name.get(table),
154
+ table_name=table_names.get(table),
155
+ ),
156
+ }
128
157
  for table in tables
129
158
  }
130
- if schema_by_table_name
131
- else None
132
159
  ),
133
160
  freshness_policies_by_output_name=(
134
161
  {output: freshness_policy for output in outputs} if freshness_policy else None
@@ -155,21 +182,18 @@ def _build_airbyte_assets_from_metadata(
155
182
  metadata = cast(Mapping[str, Any], assets_defn_meta.extra_metadata)
156
183
  connection_id = cast(str, metadata["connection_id"])
157
184
  group_name = cast(Optional[str], metadata["group_name"])
158
- destination_tables = cast(List[str], metadata["destination_tables"])
159
- normalization_tables = cast(Mapping[str, List[str]], metadata["normalization_tables"])
185
+ destination_tables = cast(list[str], metadata["destination_tables"])
186
+ normalization_tables = cast(Mapping[str, list[str]], metadata["normalization_tables"])
160
187
  io_manager_key = cast(Optional[str], metadata["io_manager_key"])
161
188
 
162
189
  @multi_asset(
163
- name=f"airbyte_sync_{connection_id[:5]}",
190
+ name=f"airbyte_sync_{connection_id.replace('-', '_')}",
164
191
  deps=list((assets_defn_meta.keys_by_input_name or {}).values()),
165
192
  outs={
166
193
  k: AssetOut(
167
194
  key=v,
168
195
  metadata=(
169
- {
170
- k: cast(TableSchemaMetadataValue, v)
171
- for k, v in assets_defn_meta.metadata_by_output_name.get(k, {}).items()
172
- }
196
+ assets_defn_meta.metadata_by_output_name.get(k)
173
197
  if assets_defn_meta.metadata_by_output_name
174
198
  else None
175
199
  ),
@@ -224,11 +248,13 @@ def _build_airbyte_assets_from_metadata(
224
248
  def build_airbyte_assets(
225
249
  connection_id: str,
226
250
  destination_tables: Sequence[str],
251
+ destination_database: Optional[str] = None,
252
+ destination_schema: Optional[str] = None,
227
253
  asset_key_prefix: Optional[Sequence[str]] = None,
228
254
  group_name: Optional[str] = None,
229
- normalization_tables: Optional[Mapping[str, Set[str]]] = None,
255
+ normalization_tables: Optional[Mapping[str, set[str]]] = None,
230
256
  deps: Optional[Iterable[Union[CoercibleToAssetKey, AssetsDefinition, SourceAsset]]] = None,
231
- upstream_assets: Optional[Set[AssetKey]] = None,
257
+ upstream_assets: Optional[set[AssetKey]] = None,
232
258
  schema_by_table_name: Optional[Mapping[str, TableSchema]] = None,
233
259
  freshness_policy: Optional[FreshnessPolicy] = None,
234
260
  stream_to_asset_map: Optional[Mapping[str, str]] = None,
@@ -242,6 +268,8 @@ def build_airbyte_assets(
242
268
  destination_tables (List[str]): The names of the tables that you want to be represented
243
269
  in the Dagster asset graph for this sync. This will generally map to the name of the
244
270
  stream in Airbyte, unless a stream prefix has been specified in Airbyte.
271
+ destination_database (Optional[str]): The name of the destination database.
272
+ destination_schema (Optional[str]): The name of the destination schema.
245
273
  normalization_tables (Optional[Mapping[str, List[str]]]): If you are using Airbyte's
246
274
  normalization feature, you may specify a mapping of destination table to a list of
247
275
  derived tables that will be created by the normalization process.
@@ -268,13 +296,34 @@ def build_airbyte_assets(
268
296
  tables = chain.from_iterable(
269
297
  chain([destination_tables], normalization_tables.values() if normalization_tables else [])
270
298
  )
299
+
300
+ table_names: dict[str, str] = {}
301
+ for table in destination_tables:
302
+ if destination_database and destination_schema and table:
303
+ table_names[table] = ".".join([destination_database, destination_schema, table])
304
+ if normalization_tables:
305
+ for normalization_table in normalization_tables.get(table, set()):
306
+ table_names[normalization_table] = ".".join(
307
+ [
308
+ destination_database,
309
+ destination_schema,
310
+ table,
311
+ normalization_table,
312
+ ]
313
+ )
314
+
315
+ schema_by_table_name = schema_by_table_name if schema_by_table_name else {}
316
+
271
317
  outputs = {
272
318
  table: AssetOut(
273
319
  key=AssetKey([*asset_key_prefix, table]),
274
320
  metadata=(
275
- {"table_schema": MetadataValue.table_schema(schema_by_table_name[table])}
276
- if schema_by_table_name
277
- else None
321
+ {
322
+ **TableMetadataSet(
323
+ column_schema=schema_by_table_name.get(table),
324
+ table_name=table_names.get(table),
325
+ ),
326
+ }
278
327
  ),
279
328
  freshness_policy=freshness_policy,
280
329
  auto_materialize_policy=auto_materialize_policy,
@@ -300,7 +349,7 @@ def build_airbyte_assets(
300
349
  internal_deps[table] = set(upstream_deps) if upstream_deps else set()
301
350
 
302
351
  @multi_asset(
303
- name=f"airbyte_sync_{connection_id[:5]}",
352
+ name=f"airbyte_sync_{connection_id.replace('-', '_')}",
304
353
  deps=upstream_deps,
305
354
  outs=outputs,
306
355
  internal_asset_deps=internal_deps,
@@ -376,7 +425,7 @@ def _get_normalization_tables_for_schema(
376
425
  For more information on Airbyte's normalization process, see:
377
426
  https://docs.airbyte.com/understanding-airbyte/basic-normalization/#nesting
378
427
  """
379
- out: Dict[str, AirbyteTableMetadata] = {}
428
+ out: dict[str, AirbyteTableMetadata] = {}
380
429
  # Object types are broken into a new table, as long as they have children
381
430
 
382
431
  sub_schemas = _get_sub_schemas(schema)
@@ -388,7 +437,7 @@ def _get_normalization_tables_for_schema(
388
437
 
389
438
  if "object" in schema_types and len(sub_schema.get("properties", {})) > 0:
390
439
  out[prefix + key] = AirbyteTableMetadata(
391
- schema=generate_table_schema(sub_schema.get("properties", {}))
440
+ raw_table_name=key, schema=generate_table_schema(sub_schema.get("properties", {}))
392
441
  )
393
442
  for k, v in sub_schema["properties"].items():
394
443
  out = merge_dicts(
@@ -397,7 +446,8 @@ def _get_normalization_tables_for_schema(
397
446
  # Array types are also broken into a new table
398
447
  elif "array" in schema_types:
399
448
  out[prefix + key] = AirbyteTableMetadata(
400
- schema=generate_table_schema(sub_schema.get("items", {}).get("properties", {}))
449
+ raw_table_name=key,
450
+ schema=generate_table_schema(sub_schema.get("items", {}).get("properties", {})),
401
451
  )
402
452
  if sub_schema.get("items", {}).get("properties"):
403
453
  for k, v in sub_schema["items"]["properties"].items():
@@ -408,11 +458,6 @@ def _get_normalization_tables_for_schema(
408
458
  return out
409
459
 
410
460
 
411
- def _clean_name(name: str) -> str:
412
- """Cleans an input to be a valid Dagster asset name."""
413
- return re.sub(r"[^a-z0-9]+", "_", name.lower())
414
-
415
-
416
461
  class AirbyteConnectionMetadata(
417
462
  NamedTuple(
418
463
  "_AirbyteConnectionMetadata",
@@ -420,7 +465,8 @@ class AirbyteConnectionMetadata(
420
465
  ("name", str),
421
466
  ("stream_prefix", str),
422
467
  ("has_basic_normalization", bool),
423
- ("stream_data", List[Mapping[str, Any]]),
468
+ ("stream_data", list[Mapping[str, Any]]),
469
+ ("destination", Mapping[str, Any]),
424
470
  ],
425
471
  )
426
472
  ):
@@ -435,7 +481,10 @@ class AirbyteConnectionMetadata(
435
481
 
436
482
  @classmethod
437
483
  def from_api_json(
438
- cls, contents: Mapping[str, Any], operations: Mapping[str, Any]
484
+ cls,
485
+ contents: Mapping[str, Any],
486
+ operations: Mapping[str, Any],
487
+ destination: Mapping[str, Any],
439
488
  ) -> "AirbyteConnectionMetadata":
440
489
  return cls(
441
490
  name=contents["name"],
@@ -445,10 +494,13 @@ class AirbyteConnectionMetadata(
445
494
  for op in operations.get("operations", [])
446
495
  ),
447
496
  stream_data=contents.get("syncCatalog", {}).get("streams", []),
497
+ destination=destination,
448
498
  )
449
499
 
450
500
  @classmethod
451
- def from_config(cls, contents: Mapping[str, Any]) -> "AirbyteConnectionMetadata":
501
+ def from_config(
502
+ cls, contents: Mapping[str, Any], destination: Mapping[str, Any]
503
+ ) -> "AirbyteConnectionMetadata":
452
504
  config_contents = cast(Mapping[str, Any], contents.get("configuration"))
453
505
  check.invariant(
454
506
  config_contents is not None, "Airbyte connection config is missing 'configuration' key"
@@ -462,6 +514,7 @@ class AirbyteConnectionMetadata(
462
514
  for op in config_contents.get("operations", [])
463
515
  ),
464
516
  stream_data=config_contents.get("sync_catalog", {}).get("streams", []),
517
+ destination=destination,
465
518
  )
466
519
 
467
520
  def parse_stream_tables(
@@ -471,7 +524,7 @@ class AirbyteConnectionMetadata(
471
524
  tables associated with each enabled stream and values representing any affiliated
472
525
  tables created by Airbyte's normalization process, if enabled.
473
526
  """
474
- tables: Dict[str, AirbyteTableMetadata] = {}
527
+ tables: dict[str, AirbyteTableMetadata] = {}
475
528
 
476
529
  enabled_streams = [
477
530
  stream for stream in self.stream_data if stream.get("config", {}).get("selected", False)
@@ -486,7 +539,7 @@ class AirbyteConnectionMetadata(
486
539
  if "json_schema" in stream["stream"]
487
540
  else stream["stream"]["jsonSchema"]
488
541
  )
489
- normalization_tables: Dict[str, AirbyteTableMetadata] = {}
542
+ normalization_tables: dict[str, AirbyteTableMetadata] = {}
490
543
  schema_props = schema.get("properties", schema.get("items", {}).get("properties", {}))
491
544
  if self.has_basic_normalization and return_normalization_tables:
492
545
  for k, v in schema_props.items():
@@ -496,6 +549,7 @@ class AirbyteConnectionMetadata(
496
549
  prefixed_norm_table_name = f"{self.stream_prefix}{normalization_table_name}"
497
550
  normalization_tables[prefixed_norm_table_name] = meta
498
551
  tables[prefixed_name] = AirbyteTableMetadata(
552
+ raw_table_name=name,
499
553
  schema=generate_table_schema(schema_props),
500
554
  normalization_tables=normalization_tables,
501
555
  )
@@ -513,7 +567,7 @@ def _get_schema_by_table_name(
513
567
  [
514
568
  (k, v.schema)
515
569
  for k, v in cast(
516
- Dict[str, AirbyteTableMetadata], meta.normalization_tables
570
+ dict[str, AirbyteTableMetadata], meta.normalization_tables
517
571
  ).items()
518
572
  ]
519
573
  for meta in stream_table_metadata.values()
@@ -564,25 +618,46 @@ class AirbyteCoreCacheableAssetsDefinition(CacheableAssetsDefinition):
564
618
  super().__init__(unique_id=f"airbyte-{contents.hexdigest()}")
565
619
 
566
620
  @abstractmethod
567
- def _get_connections(self) -> Sequence[Tuple[str, AirbyteConnectionMetadata]]:
621
+ def _get_connections(self) -> Sequence[tuple[str, AirbyteConnectionMetadata]]:
568
622
  pass
569
623
 
570
624
  def compute_cacheable_data(self) -> Sequence[AssetsDefinitionCacheableData]:
571
- asset_defn_data: List[AssetsDefinitionCacheableData] = []
625
+ asset_defn_data: list[AssetsDefinitionCacheableData] = []
572
626
  for connection_id, connection in self._get_connections():
573
627
  stream_table_metadata = connection.parse_stream_tables(
574
628
  self._create_assets_for_normalization_tables
575
629
  )
576
630
  schema_by_table_name = _get_schema_by_table_name(stream_table_metadata)
577
631
 
632
+ destination_database = connection.destination.get("configuration", {}).get("database")
633
+ destination_schema = connection.destination.get("configuration", {}).get("schema")
634
+
578
635
  table_to_asset_key = partial(self._connection_to_asset_key_fn, connection)
636
+
637
+ destination_tables = list(stream_table_metadata.keys())
638
+ destination_raw_table_names_by_table = {
639
+ table: metadata.raw_table_name for table, metadata in stream_table_metadata.items()
640
+ }
641
+ normalization_tables = {
642
+ table: set(metadata.normalization_tables.keys())
643
+ for table, metadata in stream_table_metadata.items()
644
+ }
645
+ normalization_raw_table_names_by_table = {
646
+ normalization_table: metadata.normalization_tables[
647
+ normalization_table
648
+ ].raw_table_name
649
+ for table, metadata in stream_table_metadata.items()
650
+ for normalization_table in normalization_tables[table]
651
+ }
652
+
579
653
  asset_data_for_conn = _build_airbyte_asset_defn_metadata(
580
654
  connection_id=connection_id,
581
- destination_tables=list(stream_table_metadata.keys()),
582
- normalization_tables={
583
- table: set(metadata.normalization_tables.keys())
584
- for table, metadata in stream_table_metadata.items()
585
- },
655
+ destination_tables=destination_tables,
656
+ destination_raw_table_names_by_table=destination_raw_table_names_by_table,
657
+ destination_database=destination_database,
658
+ destination_schema=destination_schema,
659
+ normalization_tables=normalization_tables,
660
+ normalization_raw_table_names_by_table=normalization_raw_table_names_by_table,
586
661
  asset_key_prefix=self._key_prefix,
587
662
  group_name=(
588
663
  self._connection_meta_to_group_fn(connection)
@@ -661,11 +736,11 @@ class AirbyteInstanceCacheableAssetsDefinition(AirbyteCoreCacheableAssetsDefinit
661
736
  )
662
737
  self._airbyte_instance: AirbyteResource = self._partially_initialized_airbyte_instance
663
738
 
664
- def _get_connections(self) -> Sequence[Tuple[str, AirbyteConnectionMetadata]]:
739
+ def _get_connections(self) -> Sequence[tuple[str, AirbyteConnectionMetadata]]:
665
740
  workspace_id = self._workspace_id
666
741
  if not workspace_id:
667
742
  workspaces = cast(
668
- List[Dict[str, Any]],
743
+ list[dict[str, Any]],
669
744
  check.not_none(
670
745
  self._airbyte_instance.make_request(endpoint="/workspaces/list", data={})
671
746
  ).get("workspaces", []),
@@ -677,7 +752,7 @@ class AirbyteInstanceCacheableAssetsDefinition(AirbyteCoreCacheableAssetsDefinit
677
752
  workspace_id = workspaces[0].get("workspaceId")
678
753
 
679
754
  connections = cast(
680
- List[Dict[str, Any]],
755
+ list[dict[str, Any]],
681
756
  check.not_none(
682
757
  self._airbyte_instance.make_request(
683
758
  endpoint="/connections/list", data={"workspaceId": workspace_id}
@@ -685,12 +760,12 @@ class AirbyteInstanceCacheableAssetsDefinition(AirbyteCoreCacheableAssetsDefinit
685
760
  ).get("connections", []),
686
761
  )
687
762
 
688
- output_connections: List[Tuple[str, AirbyteConnectionMetadata]] = []
763
+ output_connections: list[tuple[str, AirbyteConnectionMetadata]] = []
689
764
  for connection_json in connections:
690
765
  connection_id = cast(str, connection_json.get("connectionId"))
691
766
 
692
767
  operations_json = cast(
693
- Dict[str, Any],
768
+ dict[str, Any],
694
769
  check.not_none(
695
770
  self._airbyte_instance.make_request(
696
771
  endpoint="/operations/list",
@@ -698,7 +773,21 @@ class AirbyteInstanceCacheableAssetsDefinition(AirbyteCoreCacheableAssetsDefinit
698
773
  )
699
774
  ),
700
775
  )
701
- connection = AirbyteConnectionMetadata.from_api_json(connection_json, operations_json)
776
+
777
+ destination_id = cast(str, connection_json.get("destinationId"))
778
+ destination_json = cast(
779
+ dict[str, Any],
780
+ check.not_none(
781
+ self._airbyte_instance.make_request(
782
+ endpoint="/destinations/get",
783
+ data={"destinationId": destination_id},
784
+ )
785
+ ),
786
+ )
787
+
788
+ connection = AirbyteConnectionMetadata.from_api_json(
789
+ connection_json, operations_json, destination_json
790
+ )
702
791
 
703
792
  # Filter out connections that don't match the filter function
704
793
  if self._connection_filter and not self._connection_filter(connection):
@@ -749,16 +838,26 @@ class AirbyteYAMLCacheableAssetsDefinition(AirbyteCoreCacheableAssetsDefinition)
749
838
  self._project_dir = project_dir
750
839
  self._connection_directories = connection_directories
751
840
 
752
- def _get_connections(self) -> Sequence[Tuple[str, AirbyteConnectionMetadata]]:
841
+ def _get_connections(self) -> Sequence[tuple[str, AirbyteConnectionMetadata]]:
753
842
  connections_dir = os.path.join(self._project_dir, "connections")
754
843
 
755
- output_connections: List[Tuple[str, AirbyteConnectionMetadata]] = []
844
+ output_connections: list[tuple[str, AirbyteConnectionMetadata]] = []
756
845
 
757
846
  connection_directories = self._connection_directories or os.listdir(connections_dir)
758
847
  for connection_name in connection_directories:
759
848
  connection_dir = os.path.join(connections_dir, connection_name)
760
849
  with open(os.path.join(connection_dir, "configuration.yaml"), encoding="utf-8") as f:
761
- connection = AirbyteConnectionMetadata.from_config(yaml.safe_load(f.read()))
850
+ connection_data = yaml.safe_load(f.read())
851
+
852
+ destination_configuration_path = cast(
853
+ str, connection_data.get("destination_configuration_path")
854
+ )
855
+ with open(
856
+ os.path.join(self._project_dir, destination_configuration_path), encoding="utf-8"
857
+ ) as f:
858
+ destination_data = yaml.safe_load(f.read())
859
+
860
+ connection = AirbyteConnectionMetadata.from_config(connection_data, destination_data)
762
861
 
763
862
  # Filter out connections that don't match the filter function
764
863
  if self._connection_filter and not self._connection_filter(connection):
@@ -800,7 +899,7 @@ def load_assets_from_airbyte_instance(
800
899
  workspace_id: Optional[str] = None,
801
900
  key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,
802
901
  create_assets_for_normalization_tables: bool = True,
803
- connection_to_group_fn: Optional[Callable[[str], Optional[str]]] = _clean_name,
902
+ connection_to_group_fn: Optional[Callable[[str], Optional[str]]] = clean_name,
804
903
  connection_meta_to_group_fn: Optional[
805
904
  Callable[[AirbyteConnectionMetadata], Optional[str]]
806
905
  ] = None,
@@ -905,7 +1004,7 @@ def load_assets_from_airbyte_instance(
905
1004
  check.invariant(
906
1005
  not connection_meta_to_group_fn
907
1006
  or not connection_to_group_fn
908
- or connection_to_group_fn == _clean_name,
1007
+ or connection_to_group_fn == clean_name,
909
1008
  "Cannot specify both connection_meta_to_group_fn and connection_to_group_fn",
910
1009
  )
911
1010
 
@@ -926,123 +1025,115 @@ def load_assets_from_airbyte_instance(
926
1025
  )
927
1026
 
928
1027
 
929
- def load_assets_from_airbyte_project(
930
- project_dir: str,
931
- workspace_id: Optional[str] = None,
932
- key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,
933
- create_assets_for_normalization_tables: bool = True,
934
- connection_to_group_fn: Optional[Callable[[str], Optional[str]]] = _clean_name,
935
- connection_meta_to_group_fn: Optional[
936
- Callable[[AirbyteConnectionMetadata], Optional[str]]
937
- ] = None,
938
- io_manager_key: Optional[str] = None,
939
- connection_to_io_manager_key_fn: Optional[Callable[[str], Optional[str]]] = None,
940
- connection_filter: Optional[Callable[[AirbyteConnectionMetadata], bool]] = None,
941
- connection_directories: Optional[Sequence[str]] = None,
942
- connection_to_asset_key_fn: Optional[
943
- Callable[[AirbyteConnectionMetadata, str], AssetKey]
944
- ] = None,
945
- connection_to_freshness_policy_fn: Optional[
946
- Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]
947
- ] = None,
948
- connection_to_auto_materialize_policy_fn: Optional[
949
- Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]
950
- ] = None,
951
- ) -> CacheableAssetsDefinition:
952
- """Loads an Airbyte project into a set of Dagster assets.
1028
+ # -----------------------
1029
+ # Reworked assets factory
1030
+ # -----------------------
953
1031
 
954
- Point to the root folder of an Airbyte project synced using the Octavia CLI. For
955
- more information, see https://github.com/airbytehq/airbyte/tree/master/octavia-cli#octavia-import-all.
1032
+
1033
+ @experimental
1034
+ def build_airbyte_assets_definitions(
1035
+ *,
1036
+ workspace: AirbyteCloudWorkspace,
1037
+ dagster_airbyte_translator: Optional[DagsterAirbyteTranslator] = None,
1038
+ ) -> Sequence[AssetsDefinition]:
1039
+ """The list of AssetsDefinition for all connections in the Airbyte workspace.
956
1040
 
957
1041
  Args:
958
- project_dir (str): The path to the root of your Airbyte project, containing sources, destinations,
959
- and connections folders.
960
- workspace_id (Optional[str]): The ID of the Airbyte workspace to load connections from. Only
961
- required if multiple workspace state YAML files exist in the project.
962
- key_prefix (Optional[CoercibleToAssetKeyPrefix]): A prefix for the asset keys created.
963
- create_assets_for_normalization_tables (bool): If True, assets will be created for tables
964
- created by Airbyte's normalization feature. If False, only the destination tables
965
- will be created. Defaults to True.
966
- connection_to_group_fn (Optional[Callable[[str], Optional[str]]]): Function which returns an asset
967
- group name for a given Airbyte connection name. If None, no groups will be created. Defaults
968
- to a basic sanitization function.
969
- connection_meta_to_group_fn (Optional[Callable[[AirbyteConnectionMetadata], Optional[str]]]): Function
970
- which returns an asset group name for a given Airbyte connection metadata. If None and connection_to_group_fn
971
- is None, no groups will be created. Defaults to None.
972
- io_manager_key (Optional[str]): The I/O manager key to use for all assets. Defaults to "io_manager".
973
- Use this if all assets should be loaded from the same source, otherwise use connection_to_io_manager_key_fn.
974
- connection_to_io_manager_key_fn (Optional[Callable[[str], Optional[str]]]): Function which returns an
975
- I/O manager key for a given Airbyte connection name. When other ops are downstream of the loaded assets,
976
- the IOManager specified determines how the inputs to those ops are loaded. Defaults to "io_manager".
977
- connection_filter (Optional[Callable[[AirbyteConnectionMetadata], bool]]): Optional function which
978
- takes in connection metadata and returns False if the connection should be excluded from the output assets.
979
- connection_directories (Optional[List[str]]): Optional list of connection directories to load assets from.
980
- If omitted, all connections in the Airbyte project are loaded. May be faster than connection_filter
981
- if the project has many connections or if the connection yaml files are large.
982
- connection_to_asset_key_fn (Optional[Callable[[AirbyteConnectionMetadata, str], AssetKey]]): Optional function which
983
- takes in connection metadata and table name and returns an asset key for the table. If None, the default asset
984
- key is based on the table name. Any asset key prefix will be applied to the output of this function.
985
- connection_to_freshness_policy_fn (Optional[Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]]):
986
- Optional function which takes in connection metadata and returns a freshness policy for the connection's assets.
987
- If None, no freshness policies will be applied to the assets.
988
- connection_to_auto_materialize_policy_fn (Optional[Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]]):
989
- Optional function which takes in connection metadata and returns an auto materialization policy for the connection's assets.
990
- If None, no auto materialization policies will be applied to the assets.
1042
+ workspace (AirbyteCloudWorkspace): The Airbyte workspace to fetch assets from.
1043
+ dagster_airbyte_translator (Optional[DagsterAirbyteTranslator], optional): The translator to use
1044
+ to convert Airbyte content into :py:class:`dagster.AssetSpec`.
1045
+ Defaults to :py:class:`DagsterAirbyteTranslator`.
991
1046
 
992
- **Examples:**
1047
+ Returns:
1048
+ List[AssetsDefinition]: The list of AssetsDefinition for all connections in the Airbyte workspace.
993
1049
 
994
- Loading all Airbyte connections as assets:
1050
+ Examples:
1051
+ Sync the tables of an Airbyte connection:
995
1052
 
996
- .. code-block:: python
1053
+ .. code-block:: python
997
1054
 
998
- from dagster_airbyte import load_assets_from_airbyte_project
1055
+ from dagster_airbyte import AirbyteCloudWorkspace, build_airbyte_assets_definitions
999
1056
 
1000
- airbyte_assets = load_assets_from_airbyte_project(
1001
- project_dir="path/to/airbyte/project",
1002
- )
1057
+ import dagster as dg
1003
1058
 
1004
- Filtering the set of loaded connections:
1059
+ airbyte_workspace = AirbyteCloudWorkspace(
1060
+ workspace_id=dg.EnvVar("AIRBYTE_CLOUD_WORKSPACE_ID"),
1061
+ client_id=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_ID"),
1062
+ client_secret=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_SECRET"),
1063
+ )
1005
1064
 
1006
- .. code-block:: python
1007
1065
 
1008
- from dagster_airbyte import load_assets_from_airbyte_project
1066
+ airbyte_assets = build_airbyte_assets_definitions(workspace=airbyte_workspace)
1009
1067
 
1010
- airbyte_assets = load_assets_from_airbyte_project(
1011
- project_dir="path/to/airbyte/project",
1012
- connection_filter=lambda meta: "snowflake" in meta.name,
1013
- )
1068
+ defs = dg.Definitions(
1069
+ assets=airbyte_assets,
1070
+ resources={"airbyte": airbyte_workspace},
1071
+ )
1072
+
1073
+ Sync the tables of an Airbyte connection with a custom translator:
1074
+
1075
+ .. code-block:: python
1076
+
1077
+ from dagster_airbyte import (
1078
+ DagsterAirbyteTranslator,
1079
+ AirbyteConnectionTableProps,
1080
+ AirbyteCloudWorkspace,
1081
+ build_airbyte_assets_definitions
1082
+ )
1083
+
1084
+ import dagster as dg
1085
+
1086
+ class CustomDagsterAirbyteTranslator(DagsterAirbyteTranslator):
1087
+ def get_asset_spec(self, props: AirbyteConnectionTableProps) -> dg.AssetSpec:
1088
+ default_spec = super().get_asset_spec(props)
1089
+ return default_spec.merge_attributes(
1090
+ metadata={"custom": "metadata"},
1091
+ )
1092
+
1093
+ airbyte_workspace = AirbyteCloudWorkspace(
1094
+ workspace_id=dg.EnvVar("AIRBYTE_CLOUD_WORKSPACE_ID"),
1095
+ client_id=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_ID"),
1096
+ client_secret=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_SECRET"),
1097
+ )
1098
+
1099
+
1100
+ airbyte_assets = build_airbyte_assets_definitions(
1101
+ workspace=airbyte_workspace,
1102
+ dagster_airbyte_translator=CustomDagsterAirbyteTranslator()
1103
+ )
1104
+
1105
+ defs = dg.Definitions(
1106
+ assets=airbyte_assets,
1107
+ resources={"airbyte": airbyte_workspace},
1108
+ )
1014
1109
  """
1015
- if isinstance(key_prefix, str):
1016
- key_prefix = [key_prefix]
1017
- key_prefix = check.list_param(key_prefix or [], "key_prefix", of_type=str)
1110
+ dagster_airbyte_translator = dagster_airbyte_translator or DagsterAirbyteTranslator()
1018
1111
 
1019
- check.invariant(
1020
- not io_manager_key or not connection_to_io_manager_key_fn,
1021
- "Cannot specify both io_manager_key and connection_to_io_manager_key_fn",
1112
+ all_asset_specs = workspace.load_asset_specs(
1113
+ dagster_airbyte_translator=dagster_airbyte_translator
1022
1114
  )
1023
- if not connection_to_io_manager_key_fn:
1024
- connection_to_io_manager_key_fn = lambda _: io_manager_key
1025
1115
 
1026
- check.invariant(
1027
- not connection_meta_to_group_fn
1028
- or not connection_to_group_fn
1029
- or connection_to_group_fn == _clean_name,
1030
- "Cannot specify both connection_meta_to_group_fn and connection_to_group_fn",
1031
- )
1116
+ connections = {
1117
+ (
1118
+ check.not_none(AirbyteMetadataSet.extract(spec.metadata).connection_id),
1119
+ check.not_none(AirbyteMetadataSet.extract(spec.metadata).connection_name),
1120
+ )
1121
+ for spec in all_asset_specs
1122
+ }
1032
1123
 
1033
- if not connection_meta_to_group_fn and connection_to_group_fn:
1034
- connection_meta_to_group_fn = lambda meta: connection_to_group_fn(meta.name)
1124
+ _asset_fns = []
1125
+ for connection_id, connection_name in connections:
1035
1126
 
1036
- return AirbyteYAMLCacheableAssetsDefinition(
1037
- project_dir=project_dir,
1038
- workspace_id=workspace_id,
1039
- key_prefix=key_prefix,
1040
- create_assets_for_normalization_tables=create_assets_for_normalization_tables,
1041
- connection_meta_to_group_fn=connection_meta_to_group_fn,
1042
- connection_to_io_manager_key_fn=connection_to_io_manager_key_fn,
1043
- connection_filter=connection_filter,
1044
- connection_directories=connection_directories,
1045
- connection_to_asset_key_fn=connection_to_asset_key_fn,
1046
- connection_to_freshness_policy_fn=connection_to_freshness_policy_fn,
1047
- connection_to_auto_materialize_policy_fn=connection_to_auto_materialize_policy_fn,
1048
- )
1127
+ @airbyte_assets(
1128
+ connection_id=connection_id,
1129
+ workspace=workspace,
1130
+ name=clean_name(connection_name),
1131
+ group_name=clean_name(connection_name),
1132
+ dagster_airbyte_translator=dagster_airbyte_translator,
1133
+ )
1134
+ def _asset_fn(context: AssetExecutionContext, airbyte: AirbyteCloudWorkspace):
1135
+ yield from airbyte.sync_and_poll(context=context)
1136
+
1137
+ _asset_fns.append(_asset_fn)
1138
+
1139
+ return _asset_fns