dagster-airbyte 0.23.7__py3-none-any.whl → 0.25.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dagster-airbyte might be problematic. Click here for more details.
- dagster_airbyte/__init__.py +16 -8
- dagster_airbyte/asset_decorator.py +113 -0
- dagster_airbyte/asset_defs.py +261 -170
- dagster_airbyte/managed/__init__.py +2 -2
- dagster_airbyte/managed/generated/__init__.py +1 -1
- dagster_airbyte/managed/generated/destinations.py +3 -3
- dagster_airbyte/managed/generated/sources.py +46 -46
- dagster_airbyte/managed/reconciliation.py +22 -34
- dagster_airbyte/managed/types.py +11 -10
- dagster_airbyte/ops.py +6 -5
- dagster_airbyte/py.typed +1 -0
- dagster_airbyte/resources.py +705 -45
- dagster_airbyte/translator.py +236 -0
- dagster_airbyte/types.py +7 -2
- dagster_airbyte/utils.py +38 -2
- dagster_airbyte/version.py +1 -1
- {dagster_airbyte-0.23.7.dist-info → dagster_airbyte-0.25.10.dist-info}/METADATA +5 -5
- dagster_airbyte-0.25.10.dist-info/RECORD +23 -0
- dagster_airbyte-0.23.7.dist-info/RECORD +0 -20
- {dagster_airbyte-0.23.7.dist-info → dagster_airbyte-0.25.10.dist-info}/LICENSE +0 -0
- {dagster_airbyte-0.23.7.dist-info → dagster_airbyte-0.25.10.dist-info}/WHEEL +0 -0
- {dagster_airbyte-0.23.7.dist-info → dagster_airbyte-0.25.10.dist-info}/entry_points.txt +0 -0
- {dagster_airbyte-0.23.7.dist-info → dagster_airbyte-0.25.10.dist-info}/top_level.txt +0 -0
dagster_airbyte/asset_defs.py
CHANGED
|
@@ -1,28 +1,15 @@
|
|
|
1
1
|
import hashlib
|
|
2
2
|
import inspect
|
|
3
3
|
import os
|
|
4
|
-
import re
|
|
5
4
|
from abc import abstractmethod
|
|
5
|
+
from collections.abc import Iterable, Mapping, Sequence
|
|
6
6
|
from functools import partial
|
|
7
7
|
from itertools import chain
|
|
8
|
-
from typing import
|
|
9
|
-
Any,
|
|
10
|
-
Callable,
|
|
11
|
-
Dict,
|
|
12
|
-
Iterable,
|
|
13
|
-
List,
|
|
14
|
-
Mapping,
|
|
15
|
-
NamedTuple,
|
|
16
|
-
Optional,
|
|
17
|
-
Sequence,
|
|
18
|
-
Set,
|
|
19
|
-
Tuple,
|
|
20
|
-
Union,
|
|
21
|
-
cast,
|
|
22
|
-
)
|
|
8
|
+
from typing import Any, Callable, NamedTuple, Optional, Union, cast
|
|
23
9
|
|
|
24
10
|
import yaml
|
|
25
11
|
from dagster import (
|
|
12
|
+
AssetExecutionContext,
|
|
26
13
|
AssetKey,
|
|
27
14
|
AssetOut,
|
|
28
15
|
AutoMaterializePolicy,
|
|
@@ -33,21 +20,30 @@ from dagster import (
|
|
|
33
20
|
SourceAsset,
|
|
34
21
|
_check as check,
|
|
35
22
|
)
|
|
23
|
+
from dagster._annotations import experimental
|
|
36
24
|
from dagster._core.definitions import AssetsDefinition, multi_asset
|
|
37
25
|
from dagster._core.definitions.cacheable_assets import (
|
|
38
26
|
AssetsDefinitionCacheableData,
|
|
39
27
|
CacheableAssetsDefinition,
|
|
40
28
|
)
|
|
41
29
|
from dagster._core.definitions.events import CoercibleToAssetKey, CoercibleToAssetKeyPrefix
|
|
42
|
-
from dagster._core.definitions.metadata import
|
|
30
|
+
from dagster._core.definitions.metadata.metadata_set import TableMetadataSet
|
|
43
31
|
from dagster._core.definitions.metadata.table import TableSchema
|
|
44
32
|
from dagster._core.errors import DagsterInvalidDefinitionError, DagsterInvalidInvocationError
|
|
45
33
|
from dagster._core.execution.context.init import build_init_resource_context
|
|
46
34
|
from dagster._utils.merger import merge_dicts
|
|
47
35
|
|
|
48
|
-
from dagster_airbyte.
|
|
36
|
+
from dagster_airbyte.asset_decorator import airbyte_assets
|
|
37
|
+
from dagster_airbyte.resources import (
|
|
38
|
+
AirbyteCloudResource,
|
|
39
|
+
AirbyteCloudWorkspace,
|
|
40
|
+
AirbyteResource,
|
|
41
|
+
BaseAirbyteResource,
|
|
42
|
+
)
|
|
43
|
+
from dagster_airbyte.translator import AirbyteMetadataSet, DagsterAirbyteTranslator
|
|
49
44
|
from dagster_airbyte.types import AirbyteTableMetadata
|
|
50
45
|
from dagster_airbyte.utils import (
|
|
46
|
+
clean_name,
|
|
51
47
|
generate_materializations,
|
|
52
48
|
generate_table_schema,
|
|
53
49
|
is_basic_normalization_operation,
|
|
@@ -61,9 +57,13 @@ def _table_to_output_name_fn(table: str) -> str:
|
|
|
61
57
|
def _build_airbyte_asset_defn_metadata(
|
|
62
58
|
connection_id: str,
|
|
63
59
|
destination_tables: Sequence[str],
|
|
60
|
+
destination_raw_table_names_by_table: Mapping[str, str],
|
|
61
|
+
destination_database: Optional[str],
|
|
62
|
+
destination_schema: Optional[str],
|
|
64
63
|
table_to_asset_key_fn: Callable[[str], AssetKey],
|
|
65
64
|
asset_key_prefix: Optional[Sequence[str]] = None,
|
|
66
|
-
normalization_tables: Optional[Mapping[str,
|
|
65
|
+
normalization_tables: Optional[Mapping[str, set[str]]] = None,
|
|
66
|
+
normalization_raw_table_names_by_table: Optional[Mapping[str, str]] = None,
|
|
67
67
|
upstream_assets: Optional[Iterable[AssetKey]] = None,
|
|
68
68
|
group_name: Optional[str] = None,
|
|
69
69
|
io_manager_key: Optional[str] = None,
|
|
@@ -92,7 +92,7 @@ def _build_airbyte_asset_defn_metadata(
|
|
|
92
92
|
for table in tables
|
|
93
93
|
}
|
|
94
94
|
|
|
95
|
-
internal_deps:
|
|
95
|
+
internal_deps: dict[str, set[AssetKey]] = {}
|
|
96
96
|
|
|
97
97
|
metadata_encodable_normalization_tables = (
|
|
98
98
|
{k: list(v) for k, v in normalization_tables.items()} if normalization_tables else {}
|
|
@@ -111,6 +111,30 @@ def _build_airbyte_asset_defn_metadata(
|
|
|
111
111
|
for table in destination_tables:
|
|
112
112
|
internal_deps[table] = set(upstream_assets or [])
|
|
113
113
|
|
|
114
|
+
table_names: dict[str, str] = {}
|
|
115
|
+
for table in destination_tables:
|
|
116
|
+
if destination_database and destination_schema and table:
|
|
117
|
+
# Use the destination raw table name to create the table name
|
|
118
|
+
table_names[table] = ".".join(
|
|
119
|
+
[
|
|
120
|
+
destination_database,
|
|
121
|
+
destination_schema,
|
|
122
|
+
destination_raw_table_names_by_table[table],
|
|
123
|
+
]
|
|
124
|
+
)
|
|
125
|
+
if normalization_tables and normalization_raw_table_names_by_table:
|
|
126
|
+
for normalization_table in normalization_tables.get(table, set()):
|
|
127
|
+
table_names[normalization_table] = ".".join(
|
|
128
|
+
[
|
|
129
|
+
destination_database,
|
|
130
|
+
destination_schema,
|
|
131
|
+
destination_raw_table_names_by_table[table],
|
|
132
|
+
normalization_raw_table_names_by_table[normalization_table],
|
|
133
|
+
]
|
|
134
|
+
)
|
|
135
|
+
|
|
136
|
+
schema_by_table_name = schema_by_table_name if schema_by_table_name else {}
|
|
137
|
+
|
|
114
138
|
return AssetsDefinitionCacheableData(
|
|
115
139
|
keys_by_input_name=(
|
|
116
140
|
{asset_key.path[-1]: asset_key for asset_key in upstream_assets}
|
|
@@ -124,11 +148,14 @@ def _build_airbyte_asset_defn_metadata(
|
|
|
124
148
|
can_subset=False,
|
|
125
149
|
metadata_by_output_name=(
|
|
126
150
|
{
|
|
127
|
-
table: {
|
|
151
|
+
table: {
|
|
152
|
+
**TableMetadataSet(
|
|
153
|
+
column_schema=schema_by_table_name.get(table),
|
|
154
|
+
table_name=table_names.get(table),
|
|
155
|
+
),
|
|
156
|
+
}
|
|
128
157
|
for table in tables
|
|
129
158
|
}
|
|
130
|
-
if schema_by_table_name
|
|
131
|
-
else None
|
|
132
159
|
),
|
|
133
160
|
freshness_policies_by_output_name=(
|
|
134
161
|
{output: freshness_policy for output in outputs} if freshness_policy else None
|
|
@@ -155,21 +182,18 @@ def _build_airbyte_assets_from_metadata(
|
|
|
155
182
|
metadata = cast(Mapping[str, Any], assets_defn_meta.extra_metadata)
|
|
156
183
|
connection_id = cast(str, metadata["connection_id"])
|
|
157
184
|
group_name = cast(Optional[str], metadata["group_name"])
|
|
158
|
-
destination_tables = cast(
|
|
159
|
-
normalization_tables = cast(Mapping[str,
|
|
185
|
+
destination_tables = cast(list[str], metadata["destination_tables"])
|
|
186
|
+
normalization_tables = cast(Mapping[str, list[str]], metadata["normalization_tables"])
|
|
160
187
|
io_manager_key = cast(Optional[str], metadata["io_manager_key"])
|
|
161
188
|
|
|
162
189
|
@multi_asset(
|
|
163
|
-
name=f"airbyte_sync_{connection_id
|
|
190
|
+
name=f"airbyte_sync_{connection_id.replace('-', '_')}",
|
|
164
191
|
deps=list((assets_defn_meta.keys_by_input_name or {}).values()),
|
|
165
192
|
outs={
|
|
166
193
|
k: AssetOut(
|
|
167
194
|
key=v,
|
|
168
195
|
metadata=(
|
|
169
|
-
|
|
170
|
-
k: cast(TableSchemaMetadataValue, v)
|
|
171
|
-
for k, v in assets_defn_meta.metadata_by_output_name.get(k, {}).items()
|
|
172
|
-
}
|
|
196
|
+
assets_defn_meta.metadata_by_output_name.get(k)
|
|
173
197
|
if assets_defn_meta.metadata_by_output_name
|
|
174
198
|
else None
|
|
175
199
|
),
|
|
@@ -224,11 +248,13 @@ def _build_airbyte_assets_from_metadata(
|
|
|
224
248
|
def build_airbyte_assets(
|
|
225
249
|
connection_id: str,
|
|
226
250
|
destination_tables: Sequence[str],
|
|
251
|
+
destination_database: Optional[str] = None,
|
|
252
|
+
destination_schema: Optional[str] = None,
|
|
227
253
|
asset_key_prefix: Optional[Sequence[str]] = None,
|
|
228
254
|
group_name: Optional[str] = None,
|
|
229
|
-
normalization_tables: Optional[Mapping[str,
|
|
255
|
+
normalization_tables: Optional[Mapping[str, set[str]]] = None,
|
|
230
256
|
deps: Optional[Iterable[Union[CoercibleToAssetKey, AssetsDefinition, SourceAsset]]] = None,
|
|
231
|
-
upstream_assets: Optional[
|
|
257
|
+
upstream_assets: Optional[set[AssetKey]] = None,
|
|
232
258
|
schema_by_table_name: Optional[Mapping[str, TableSchema]] = None,
|
|
233
259
|
freshness_policy: Optional[FreshnessPolicy] = None,
|
|
234
260
|
stream_to_asset_map: Optional[Mapping[str, str]] = None,
|
|
@@ -242,6 +268,8 @@ def build_airbyte_assets(
|
|
|
242
268
|
destination_tables (List[str]): The names of the tables that you want to be represented
|
|
243
269
|
in the Dagster asset graph for this sync. This will generally map to the name of the
|
|
244
270
|
stream in Airbyte, unless a stream prefix has been specified in Airbyte.
|
|
271
|
+
destination_database (Optional[str]): The name of the destination database.
|
|
272
|
+
destination_schema (Optional[str]): The name of the destination schema.
|
|
245
273
|
normalization_tables (Optional[Mapping[str, List[str]]]): If you are using Airbyte's
|
|
246
274
|
normalization feature, you may specify a mapping of destination table to a list of
|
|
247
275
|
derived tables that will be created by the normalization process.
|
|
@@ -268,13 +296,34 @@ def build_airbyte_assets(
|
|
|
268
296
|
tables = chain.from_iterable(
|
|
269
297
|
chain([destination_tables], normalization_tables.values() if normalization_tables else [])
|
|
270
298
|
)
|
|
299
|
+
|
|
300
|
+
table_names: dict[str, str] = {}
|
|
301
|
+
for table in destination_tables:
|
|
302
|
+
if destination_database and destination_schema and table:
|
|
303
|
+
table_names[table] = ".".join([destination_database, destination_schema, table])
|
|
304
|
+
if normalization_tables:
|
|
305
|
+
for normalization_table in normalization_tables.get(table, set()):
|
|
306
|
+
table_names[normalization_table] = ".".join(
|
|
307
|
+
[
|
|
308
|
+
destination_database,
|
|
309
|
+
destination_schema,
|
|
310
|
+
table,
|
|
311
|
+
normalization_table,
|
|
312
|
+
]
|
|
313
|
+
)
|
|
314
|
+
|
|
315
|
+
schema_by_table_name = schema_by_table_name if schema_by_table_name else {}
|
|
316
|
+
|
|
271
317
|
outputs = {
|
|
272
318
|
table: AssetOut(
|
|
273
319
|
key=AssetKey([*asset_key_prefix, table]),
|
|
274
320
|
metadata=(
|
|
275
|
-
{
|
|
276
|
-
|
|
277
|
-
|
|
321
|
+
{
|
|
322
|
+
**TableMetadataSet(
|
|
323
|
+
column_schema=schema_by_table_name.get(table),
|
|
324
|
+
table_name=table_names.get(table),
|
|
325
|
+
),
|
|
326
|
+
}
|
|
278
327
|
),
|
|
279
328
|
freshness_policy=freshness_policy,
|
|
280
329
|
auto_materialize_policy=auto_materialize_policy,
|
|
@@ -300,7 +349,7 @@ def build_airbyte_assets(
|
|
|
300
349
|
internal_deps[table] = set(upstream_deps) if upstream_deps else set()
|
|
301
350
|
|
|
302
351
|
@multi_asset(
|
|
303
|
-
name=f"airbyte_sync_{connection_id
|
|
352
|
+
name=f"airbyte_sync_{connection_id.replace('-', '_')}",
|
|
304
353
|
deps=upstream_deps,
|
|
305
354
|
outs=outputs,
|
|
306
355
|
internal_asset_deps=internal_deps,
|
|
@@ -376,7 +425,7 @@ def _get_normalization_tables_for_schema(
|
|
|
376
425
|
For more information on Airbyte's normalization process, see:
|
|
377
426
|
https://docs.airbyte.com/understanding-airbyte/basic-normalization/#nesting
|
|
378
427
|
"""
|
|
379
|
-
out:
|
|
428
|
+
out: dict[str, AirbyteTableMetadata] = {}
|
|
380
429
|
# Object types are broken into a new table, as long as they have children
|
|
381
430
|
|
|
382
431
|
sub_schemas = _get_sub_schemas(schema)
|
|
@@ -388,7 +437,7 @@ def _get_normalization_tables_for_schema(
|
|
|
388
437
|
|
|
389
438
|
if "object" in schema_types and len(sub_schema.get("properties", {})) > 0:
|
|
390
439
|
out[prefix + key] = AirbyteTableMetadata(
|
|
391
|
-
schema=generate_table_schema(sub_schema.get("properties", {}))
|
|
440
|
+
raw_table_name=key, schema=generate_table_schema(sub_schema.get("properties", {}))
|
|
392
441
|
)
|
|
393
442
|
for k, v in sub_schema["properties"].items():
|
|
394
443
|
out = merge_dicts(
|
|
@@ -397,7 +446,8 @@ def _get_normalization_tables_for_schema(
|
|
|
397
446
|
# Array types are also broken into a new table
|
|
398
447
|
elif "array" in schema_types:
|
|
399
448
|
out[prefix + key] = AirbyteTableMetadata(
|
|
400
|
-
|
|
449
|
+
raw_table_name=key,
|
|
450
|
+
schema=generate_table_schema(sub_schema.get("items", {}).get("properties", {})),
|
|
401
451
|
)
|
|
402
452
|
if sub_schema.get("items", {}).get("properties"):
|
|
403
453
|
for k, v in sub_schema["items"]["properties"].items():
|
|
@@ -408,11 +458,6 @@ def _get_normalization_tables_for_schema(
|
|
|
408
458
|
return out
|
|
409
459
|
|
|
410
460
|
|
|
411
|
-
def _clean_name(name: str) -> str:
|
|
412
|
-
"""Cleans an input to be a valid Dagster asset name."""
|
|
413
|
-
return re.sub(r"[^a-z0-9]+", "_", name.lower())
|
|
414
|
-
|
|
415
|
-
|
|
416
461
|
class AirbyteConnectionMetadata(
|
|
417
462
|
NamedTuple(
|
|
418
463
|
"_AirbyteConnectionMetadata",
|
|
@@ -420,7 +465,8 @@ class AirbyteConnectionMetadata(
|
|
|
420
465
|
("name", str),
|
|
421
466
|
("stream_prefix", str),
|
|
422
467
|
("has_basic_normalization", bool),
|
|
423
|
-
("stream_data",
|
|
468
|
+
("stream_data", list[Mapping[str, Any]]),
|
|
469
|
+
("destination", Mapping[str, Any]),
|
|
424
470
|
],
|
|
425
471
|
)
|
|
426
472
|
):
|
|
@@ -435,7 +481,10 @@ class AirbyteConnectionMetadata(
|
|
|
435
481
|
|
|
436
482
|
@classmethod
|
|
437
483
|
def from_api_json(
|
|
438
|
-
cls,
|
|
484
|
+
cls,
|
|
485
|
+
contents: Mapping[str, Any],
|
|
486
|
+
operations: Mapping[str, Any],
|
|
487
|
+
destination: Mapping[str, Any],
|
|
439
488
|
) -> "AirbyteConnectionMetadata":
|
|
440
489
|
return cls(
|
|
441
490
|
name=contents["name"],
|
|
@@ -445,10 +494,13 @@ class AirbyteConnectionMetadata(
|
|
|
445
494
|
for op in operations.get("operations", [])
|
|
446
495
|
),
|
|
447
496
|
stream_data=contents.get("syncCatalog", {}).get("streams", []),
|
|
497
|
+
destination=destination,
|
|
448
498
|
)
|
|
449
499
|
|
|
450
500
|
@classmethod
|
|
451
|
-
def from_config(
|
|
501
|
+
def from_config(
|
|
502
|
+
cls, contents: Mapping[str, Any], destination: Mapping[str, Any]
|
|
503
|
+
) -> "AirbyteConnectionMetadata":
|
|
452
504
|
config_contents = cast(Mapping[str, Any], contents.get("configuration"))
|
|
453
505
|
check.invariant(
|
|
454
506
|
config_contents is not None, "Airbyte connection config is missing 'configuration' key"
|
|
@@ -462,6 +514,7 @@ class AirbyteConnectionMetadata(
|
|
|
462
514
|
for op in config_contents.get("operations", [])
|
|
463
515
|
),
|
|
464
516
|
stream_data=config_contents.get("sync_catalog", {}).get("streams", []),
|
|
517
|
+
destination=destination,
|
|
465
518
|
)
|
|
466
519
|
|
|
467
520
|
def parse_stream_tables(
|
|
@@ -471,7 +524,7 @@ class AirbyteConnectionMetadata(
|
|
|
471
524
|
tables associated with each enabled stream and values representing any affiliated
|
|
472
525
|
tables created by Airbyte's normalization process, if enabled.
|
|
473
526
|
"""
|
|
474
|
-
tables:
|
|
527
|
+
tables: dict[str, AirbyteTableMetadata] = {}
|
|
475
528
|
|
|
476
529
|
enabled_streams = [
|
|
477
530
|
stream for stream in self.stream_data if stream.get("config", {}).get("selected", False)
|
|
@@ -486,7 +539,7 @@ class AirbyteConnectionMetadata(
|
|
|
486
539
|
if "json_schema" in stream["stream"]
|
|
487
540
|
else stream["stream"]["jsonSchema"]
|
|
488
541
|
)
|
|
489
|
-
normalization_tables:
|
|
542
|
+
normalization_tables: dict[str, AirbyteTableMetadata] = {}
|
|
490
543
|
schema_props = schema.get("properties", schema.get("items", {}).get("properties", {}))
|
|
491
544
|
if self.has_basic_normalization and return_normalization_tables:
|
|
492
545
|
for k, v in schema_props.items():
|
|
@@ -496,6 +549,7 @@ class AirbyteConnectionMetadata(
|
|
|
496
549
|
prefixed_norm_table_name = f"{self.stream_prefix}{normalization_table_name}"
|
|
497
550
|
normalization_tables[prefixed_norm_table_name] = meta
|
|
498
551
|
tables[prefixed_name] = AirbyteTableMetadata(
|
|
552
|
+
raw_table_name=name,
|
|
499
553
|
schema=generate_table_schema(schema_props),
|
|
500
554
|
normalization_tables=normalization_tables,
|
|
501
555
|
)
|
|
@@ -513,7 +567,7 @@ def _get_schema_by_table_name(
|
|
|
513
567
|
[
|
|
514
568
|
(k, v.schema)
|
|
515
569
|
for k, v in cast(
|
|
516
|
-
|
|
570
|
+
dict[str, AirbyteTableMetadata], meta.normalization_tables
|
|
517
571
|
).items()
|
|
518
572
|
]
|
|
519
573
|
for meta in stream_table_metadata.values()
|
|
@@ -564,25 +618,46 @@ class AirbyteCoreCacheableAssetsDefinition(CacheableAssetsDefinition):
|
|
|
564
618
|
super().__init__(unique_id=f"airbyte-{contents.hexdigest()}")
|
|
565
619
|
|
|
566
620
|
@abstractmethod
|
|
567
|
-
def _get_connections(self) -> Sequence[
|
|
621
|
+
def _get_connections(self) -> Sequence[tuple[str, AirbyteConnectionMetadata]]:
|
|
568
622
|
pass
|
|
569
623
|
|
|
570
624
|
def compute_cacheable_data(self) -> Sequence[AssetsDefinitionCacheableData]:
|
|
571
|
-
asset_defn_data:
|
|
625
|
+
asset_defn_data: list[AssetsDefinitionCacheableData] = []
|
|
572
626
|
for connection_id, connection in self._get_connections():
|
|
573
627
|
stream_table_metadata = connection.parse_stream_tables(
|
|
574
628
|
self._create_assets_for_normalization_tables
|
|
575
629
|
)
|
|
576
630
|
schema_by_table_name = _get_schema_by_table_name(stream_table_metadata)
|
|
577
631
|
|
|
632
|
+
destination_database = connection.destination.get("configuration", {}).get("database")
|
|
633
|
+
destination_schema = connection.destination.get("configuration", {}).get("schema")
|
|
634
|
+
|
|
578
635
|
table_to_asset_key = partial(self._connection_to_asset_key_fn, connection)
|
|
636
|
+
|
|
637
|
+
destination_tables = list(stream_table_metadata.keys())
|
|
638
|
+
destination_raw_table_names_by_table = {
|
|
639
|
+
table: metadata.raw_table_name for table, metadata in stream_table_metadata.items()
|
|
640
|
+
}
|
|
641
|
+
normalization_tables = {
|
|
642
|
+
table: set(metadata.normalization_tables.keys())
|
|
643
|
+
for table, metadata in stream_table_metadata.items()
|
|
644
|
+
}
|
|
645
|
+
normalization_raw_table_names_by_table = {
|
|
646
|
+
normalization_table: metadata.normalization_tables[
|
|
647
|
+
normalization_table
|
|
648
|
+
].raw_table_name
|
|
649
|
+
for table, metadata in stream_table_metadata.items()
|
|
650
|
+
for normalization_table in normalization_tables[table]
|
|
651
|
+
}
|
|
652
|
+
|
|
579
653
|
asset_data_for_conn = _build_airbyte_asset_defn_metadata(
|
|
580
654
|
connection_id=connection_id,
|
|
581
|
-
destination_tables=
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
655
|
+
destination_tables=destination_tables,
|
|
656
|
+
destination_raw_table_names_by_table=destination_raw_table_names_by_table,
|
|
657
|
+
destination_database=destination_database,
|
|
658
|
+
destination_schema=destination_schema,
|
|
659
|
+
normalization_tables=normalization_tables,
|
|
660
|
+
normalization_raw_table_names_by_table=normalization_raw_table_names_by_table,
|
|
586
661
|
asset_key_prefix=self._key_prefix,
|
|
587
662
|
group_name=(
|
|
588
663
|
self._connection_meta_to_group_fn(connection)
|
|
@@ -661,11 +736,11 @@ class AirbyteInstanceCacheableAssetsDefinition(AirbyteCoreCacheableAssetsDefinit
|
|
|
661
736
|
)
|
|
662
737
|
self._airbyte_instance: AirbyteResource = self._partially_initialized_airbyte_instance
|
|
663
738
|
|
|
664
|
-
def _get_connections(self) -> Sequence[
|
|
739
|
+
def _get_connections(self) -> Sequence[tuple[str, AirbyteConnectionMetadata]]:
|
|
665
740
|
workspace_id = self._workspace_id
|
|
666
741
|
if not workspace_id:
|
|
667
742
|
workspaces = cast(
|
|
668
|
-
|
|
743
|
+
list[dict[str, Any]],
|
|
669
744
|
check.not_none(
|
|
670
745
|
self._airbyte_instance.make_request(endpoint="/workspaces/list", data={})
|
|
671
746
|
).get("workspaces", []),
|
|
@@ -677,7 +752,7 @@ class AirbyteInstanceCacheableAssetsDefinition(AirbyteCoreCacheableAssetsDefinit
|
|
|
677
752
|
workspace_id = workspaces[0].get("workspaceId")
|
|
678
753
|
|
|
679
754
|
connections = cast(
|
|
680
|
-
|
|
755
|
+
list[dict[str, Any]],
|
|
681
756
|
check.not_none(
|
|
682
757
|
self._airbyte_instance.make_request(
|
|
683
758
|
endpoint="/connections/list", data={"workspaceId": workspace_id}
|
|
@@ -685,12 +760,12 @@ class AirbyteInstanceCacheableAssetsDefinition(AirbyteCoreCacheableAssetsDefinit
|
|
|
685
760
|
).get("connections", []),
|
|
686
761
|
)
|
|
687
762
|
|
|
688
|
-
output_connections:
|
|
763
|
+
output_connections: list[tuple[str, AirbyteConnectionMetadata]] = []
|
|
689
764
|
for connection_json in connections:
|
|
690
765
|
connection_id = cast(str, connection_json.get("connectionId"))
|
|
691
766
|
|
|
692
767
|
operations_json = cast(
|
|
693
|
-
|
|
768
|
+
dict[str, Any],
|
|
694
769
|
check.not_none(
|
|
695
770
|
self._airbyte_instance.make_request(
|
|
696
771
|
endpoint="/operations/list",
|
|
@@ -698,7 +773,21 @@ class AirbyteInstanceCacheableAssetsDefinition(AirbyteCoreCacheableAssetsDefinit
|
|
|
698
773
|
)
|
|
699
774
|
),
|
|
700
775
|
)
|
|
701
|
-
|
|
776
|
+
|
|
777
|
+
destination_id = cast(str, connection_json.get("destinationId"))
|
|
778
|
+
destination_json = cast(
|
|
779
|
+
dict[str, Any],
|
|
780
|
+
check.not_none(
|
|
781
|
+
self._airbyte_instance.make_request(
|
|
782
|
+
endpoint="/destinations/get",
|
|
783
|
+
data={"destinationId": destination_id},
|
|
784
|
+
)
|
|
785
|
+
),
|
|
786
|
+
)
|
|
787
|
+
|
|
788
|
+
connection = AirbyteConnectionMetadata.from_api_json(
|
|
789
|
+
connection_json, operations_json, destination_json
|
|
790
|
+
)
|
|
702
791
|
|
|
703
792
|
# Filter out connections that don't match the filter function
|
|
704
793
|
if self._connection_filter and not self._connection_filter(connection):
|
|
@@ -749,16 +838,26 @@ class AirbyteYAMLCacheableAssetsDefinition(AirbyteCoreCacheableAssetsDefinition)
|
|
|
749
838
|
self._project_dir = project_dir
|
|
750
839
|
self._connection_directories = connection_directories
|
|
751
840
|
|
|
752
|
-
def _get_connections(self) -> Sequence[
|
|
841
|
+
def _get_connections(self) -> Sequence[tuple[str, AirbyteConnectionMetadata]]:
|
|
753
842
|
connections_dir = os.path.join(self._project_dir, "connections")
|
|
754
843
|
|
|
755
|
-
output_connections:
|
|
844
|
+
output_connections: list[tuple[str, AirbyteConnectionMetadata]] = []
|
|
756
845
|
|
|
757
846
|
connection_directories = self._connection_directories or os.listdir(connections_dir)
|
|
758
847
|
for connection_name in connection_directories:
|
|
759
848
|
connection_dir = os.path.join(connections_dir, connection_name)
|
|
760
849
|
with open(os.path.join(connection_dir, "configuration.yaml"), encoding="utf-8") as f:
|
|
761
|
-
|
|
850
|
+
connection_data = yaml.safe_load(f.read())
|
|
851
|
+
|
|
852
|
+
destination_configuration_path = cast(
|
|
853
|
+
str, connection_data.get("destination_configuration_path")
|
|
854
|
+
)
|
|
855
|
+
with open(
|
|
856
|
+
os.path.join(self._project_dir, destination_configuration_path), encoding="utf-8"
|
|
857
|
+
) as f:
|
|
858
|
+
destination_data = yaml.safe_load(f.read())
|
|
859
|
+
|
|
860
|
+
connection = AirbyteConnectionMetadata.from_config(connection_data, destination_data)
|
|
762
861
|
|
|
763
862
|
# Filter out connections that don't match the filter function
|
|
764
863
|
if self._connection_filter and not self._connection_filter(connection):
|
|
@@ -800,7 +899,7 @@ def load_assets_from_airbyte_instance(
|
|
|
800
899
|
workspace_id: Optional[str] = None,
|
|
801
900
|
key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,
|
|
802
901
|
create_assets_for_normalization_tables: bool = True,
|
|
803
|
-
connection_to_group_fn: Optional[Callable[[str], Optional[str]]] =
|
|
902
|
+
connection_to_group_fn: Optional[Callable[[str], Optional[str]]] = clean_name,
|
|
804
903
|
connection_meta_to_group_fn: Optional[
|
|
805
904
|
Callable[[AirbyteConnectionMetadata], Optional[str]]
|
|
806
905
|
] = None,
|
|
@@ -905,7 +1004,7 @@ def load_assets_from_airbyte_instance(
|
|
|
905
1004
|
check.invariant(
|
|
906
1005
|
not connection_meta_to_group_fn
|
|
907
1006
|
or not connection_to_group_fn
|
|
908
|
-
or connection_to_group_fn ==
|
|
1007
|
+
or connection_to_group_fn == clean_name,
|
|
909
1008
|
"Cannot specify both connection_meta_to_group_fn and connection_to_group_fn",
|
|
910
1009
|
)
|
|
911
1010
|
|
|
@@ -926,123 +1025,115 @@ def load_assets_from_airbyte_instance(
|
|
|
926
1025
|
)
|
|
927
1026
|
|
|
928
1027
|
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
key_prefix: Optional[CoercibleToAssetKeyPrefix] = None,
|
|
933
|
-
create_assets_for_normalization_tables: bool = True,
|
|
934
|
-
connection_to_group_fn: Optional[Callable[[str], Optional[str]]] = _clean_name,
|
|
935
|
-
connection_meta_to_group_fn: Optional[
|
|
936
|
-
Callable[[AirbyteConnectionMetadata], Optional[str]]
|
|
937
|
-
] = None,
|
|
938
|
-
io_manager_key: Optional[str] = None,
|
|
939
|
-
connection_to_io_manager_key_fn: Optional[Callable[[str], Optional[str]]] = None,
|
|
940
|
-
connection_filter: Optional[Callable[[AirbyteConnectionMetadata], bool]] = None,
|
|
941
|
-
connection_directories: Optional[Sequence[str]] = None,
|
|
942
|
-
connection_to_asset_key_fn: Optional[
|
|
943
|
-
Callable[[AirbyteConnectionMetadata, str], AssetKey]
|
|
944
|
-
] = None,
|
|
945
|
-
connection_to_freshness_policy_fn: Optional[
|
|
946
|
-
Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]
|
|
947
|
-
] = None,
|
|
948
|
-
connection_to_auto_materialize_policy_fn: Optional[
|
|
949
|
-
Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]
|
|
950
|
-
] = None,
|
|
951
|
-
) -> CacheableAssetsDefinition:
|
|
952
|
-
"""Loads an Airbyte project into a set of Dagster assets.
|
|
1028
|
+
# -----------------------
|
|
1029
|
+
# Reworked assets factory
|
|
1030
|
+
# -----------------------
|
|
953
1031
|
|
|
954
|
-
|
|
955
|
-
|
|
1032
|
+
|
|
1033
|
+
@experimental
|
|
1034
|
+
def build_airbyte_assets_definitions(
|
|
1035
|
+
*,
|
|
1036
|
+
workspace: AirbyteCloudWorkspace,
|
|
1037
|
+
dagster_airbyte_translator: Optional[DagsterAirbyteTranslator] = None,
|
|
1038
|
+
) -> Sequence[AssetsDefinition]:
|
|
1039
|
+
"""The list of AssetsDefinition for all connections in the Airbyte workspace.
|
|
956
1040
|
|
|
957
1041
|
Args:
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
key_prefix (Optional[CoercibleToAssetKeyPrefix]): A prefix for the asset keys created.
|
|
963
|
-
create_assets_for_normalization_tables (bool): If True, assets will be created for tables
|
|
964
|
-
created by Airbyte's normalization feature. If False, only the destination tables
|
|
965
|
-
will be created. Defaults to True.
|
|
966
|
-
connection_to_group_fn (Optional[Callable[[str], Optional[str]]]): Function which returns an asset
|
|
967
|
-
group name for a given Airbyte connection name. If None, no groups will be created. Defaults
|
|
968
|
-
to a basic sanitization function.
|
|
969
|
-
connection_meta_to_group_fn (Optional[Callable[[AirbyteConnectionMetadata], Optional[str]]]): Function
|
|
970
|
-
which returns an asset group name for a given Airbyte connection metadata. If None and connection_to_group_fn
|
|
971
|
-
is None, no groups will be created. Defaults to None.
|
|
972
|
-
io_manager_key (Optional[str]): The I/O manager key to use for all assets. Defaults to "io_manager".
|
|
973
|
-
Use this if all assets should be loaded from the same source, otherwise use connection_to_io_manager_key_fn.
|
|
974
|
-
connection_to_io_manager_key_fn (Optional[Callable[[str], Optional[str]]]): Function which returns an
|
|
975
|
-
I/O manager key for a given Airbyte connection name. When other ops are downstream of the loaded assets,
|
|
976
|
-
the IOManager specified determines how the inputs to those ops are loaded. Defaults to "io_manager".
|
|
977
|
-
connection_filter (Optional[Callable[[AirbyteConnectionMetadata], bool]]): Optional function which
|
|
978
|
-
takes in connection metadata and returns False if the connection should be excluded from the output assets.
|
|
979
|
-
connection_directories (Optional[List[str]]): Optional list of connection directories to load assets from.
|
|
980
|
-
If omitted, all connections in the Airbyte project are loaded. May be faster than connection_filter
|
|
981
|
-
if the project has many connections or if the connection yaml files are large.
|
|
982
|
-
connection_to_asset_key_fn (Optional[Callable[[AirbyteConnectionMetadata, str], AssetKey]]): Optional function which
|
|
983
|
-
takes in connection metadata and table name and returns an asset key for the table. If None, the default asset
|
|
984
|
-
key is based on the table name. Any asset key prefix will be applied to the output of this function.
|
|
985
|
-
connection_to_freshness_policy_fn (Optional[Callable[[AirbyteConnectionMetadata], Optional[FreshnessPolicy]]]):
|
|
986
|
-
Optional function which takes in connection metadata and returns a freshness policy for the connection's assets.
|
|
987
|
-
If None, no freshness policies will be applied to the assets.
|
|
988
|
-
connection_to_auto_materialize_policy_fn (Optional[Callable[[AirbyteConnectionMetadata], Optional[AutoMaterializePolicy]]]):
|
|
989
|
-
Optional function which takes in connection metadata and returns an auto materialization policy for the connection's assets.
|
|
990
|
-
If None, no auto materialization policies will be applied to the assets.
|
|
1042
|
+
workspace (AirbyteCloudWorkspace): The Airbyte workspace to fetch assets from.
|
|
1043
|
+
dagster_airbyte_translator (Optional[DagsterAirbyteTranslator], optional): The translator to use
|
|
1044
|
+
to convert Airbyte content into :py:class:`dagster.AssetSpec`.
|
|
1045
|
+
Defaults to :py:class:`DagsterAirbyteTranslator`.
|
|
991
1046
|
|
|
992
|
-
|
|
1047
|
+
Returns:
|
|
1048
|
+
List[AssetsDefinition]: The list of AssetsDefinition for all connections in the Airbyte workspace.
|
|
993
1049
|
|
|
994
|
-
|
|
1050
|
+
Examples:
|
|
1051
|
+
Sync the tables of an Airbyte connection:
|
|
995
1052
|
|
|
996
|
-
|
|
1053
|
+
.. code-block:: python
|
|
997
1054
|
|
|
998
|
-
|
|
1055
|
+
from dagster_airbyte import AirbyteCloudWorkspace, build_airbyte_assets_definitions
|
|
999
1056
|
|
|
1000
|
-
|
|
1001
|
-
project_dir="path/to/airbyte/project",
|
|
1002
|
-
)
|
|
1057
|
+
import dagster as dg
|
|
1003
1058
|
|
|
1004
|
-
|
|
1059
|
+
airbyte_workspace = AirbyteCloudWorkspace(
|
|
1060
|
+
workspace_id=dg.EnvVar("AIRBYTE_CLOUD_WORKSPACE_ID"),
|
|
1061
|
+
client_id=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_ID"),
|
|
1062
|
+
client_secret=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_SECRET"),
|
|
1063
|
+
)
|
|
1005
1064
|
|
|
1006
|
-
.. code-block:: python
|
|
1007
1065
|
|
|
1008
|
-
|
|
1066
|
+
airbyte_assets = build_airbyte_assets_definitions(workspace=airbyte_workspace)
|
|
1009
1067
|
|
|
1010
|
-
|
|
1011
|
-
|
|
1012
|
-
|
|
1013
|
-
|
|
1068
|
+
defs = dg.Definitions(
|
|
1069
|
+
assets=airbyte_assets,
|
|
1070
|
+
resources={"airbyte": airbyte_workspace},
|
|
1071
|
+
)
|
|
1072
|
+
|
|
1073
|
+
Sync the tables of an Airbyte connection with a custom translator:
|
|
1074
|
+
|
|
1075
|
+
.. code-block:: python
|
|
1076
|
+
|
|
1077
|
+
from dagster_airbyte import (
|
|
1078
|
+
DagsterAirbyteTranslator,
|
|
1079
|
+
AirbyteConnectionTableProps,
|
|
1080
|
+
AirbyteCloudWorkspace,
|
|
1081
|
+
build_airbyte_assets_definitions
|
|
1082
|
+
)
|
|
1083
|
+
|
|
1084
|
+
import dagster as dg
|
|
1085
|
+
|
|
1086
|
+
class CustomDagsterAirbyteTranslator(DagsterAirbyteTranslator):
|
|
1087
|
+
def get_asset_spec(self, props: AirbyteConnectionTableProps) -> dg.AssetSpec:
|
|
1088
|
+
default_spec = super().get_asset_spec(props)
|
|
1089
|
+
return default_spec.merge_attributes(
|
|
1090
|
+
metadata={"custom": "metadata"},
|
|
1091
|
+
)
|
|
1092
|
+
|
|
1093
|
+
airbyte_workspace = AirbyteCloudWorkspace(
|
|
1094
|
+
workspace_id=dg.EnvVar("AIRBYTE_CLOUD_WORKSPACE_ID"),
|
|
1095
|
+
client_id=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_ID"),
|
|
1096
|
+
client_secret=dg.EnvVar("AIRBYTE_CLOUD_CLIENT_SECRET"),
|
|
1097
|
+
)
|
|
1098
|
+
|
|
1099
|
+
|
|
1100
|
+
airbyte_assets = build_airbyte_assets_definitions(
|
|
1101
|
+
workspace=airbyte_workspace,
|
|
1102
|
+
dagster_airbyte_translator=CustomDagsterAirbyteTranslator()
|
|
1103
|
+
)
|
|
1104
|
+
|
|
1105
|
+
defs = dg.Definitions(
|
|
1106
|
+
assets=airbyte_assets,
|
|
1107
|
+
resources={"airbyte": airbyte_workspace},
|
|
1108
|
+
)
|
|
1014
1109
|
"""
|
|
1015
|
-
|
|
1016
|
-
key_prefix = [key_prefix]
|
|
1017
|
-
key_prefix = check.list_param(key_prefix or [], "key_prefix", of_type=str)
|
|
1110
|
+
dagster_airbyte_translator = dagster_airbyte_translator or DagsterAirbyteTranslator()
|
|
1018
1111
|
|
|
1019
|
-
|
|
1020
|
-
|
|
1021
|
-
"Cannot specify both io_manager_key and connection_to_io_manager_key_fn",
|
|
1112
|
+
all_asset_specs = workspace.load_asset_specs(
|
|
1113
|
+
dagster_airbyte_translator=dagster_airbyte_translator
|
|
1022
1114
|
)
|
|
1023
|
-
if not connection_to_io_manager_key_fn:
|
|
1024
|
-
connection_to_io_manager_key_fn = lambda _: io_manager_key
|
|
1025
1115
|
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
|
|
1029
|
-
|
|
1030
|
-
|
|
1031
|
-
|
|
1116
|
+
connections = {
|
|
1117
|
+
(
|
|
1118
|
+
check.not_none(AirbyteMetadataSet.extract(spec.metadata).connection_id),
|
|
1119
|
+
check.not_none(AirbyteMetadataSet.extract(spec.metadata).connection_name),
|
|
1120
|
+
)
|
|
1121
|
+
for spec in all_asset_specs
|
|
1122
|
+
}
|
|
1032
1123
|
|
|
1033
|
-
|
|
1034
|
-
|
|
1124
|
+
_asset_fns = []
|
|
1125
|
+
for connection_id, connection_name in connections:
|
|
1035
1126
|
|
|
1036
|
-
|
|
1037
|
-
|
|
1038
|
-
|
|
1039
|
-
|
|
1040
|
-
|
|
1041
|
-
|
|
1042
|
-
|
|
1043
|
-
|
|
1044
|
-
|
|
1045
|
-
|
|
1046
|
-
|
|
1047
|
-
|
|
1048
|
-
|
|
1127
|
+
@airbyte_assets(
|
|
1128
|
+
connection_id=connection_id,
|
|
1129
|
+
workspace=workspace,
|
|
1130
|
+
name=clean_name(connection_name),
|
|
1131
|
+
group_name=clean_name(connection_name),
|
|
1132
|
+
dagster_airbyte_translator=dagster_airbyte_translator,
|
|
1133
|
+
)
|
|
1134
|
+
def _asset_fn(context: AssetExecutionContext, airbyte: AirbyteCloudWorkspace):
|
|
1135
|
+
yield from airbyte.sync_and_poll(context=context)
|
|
1136
|
+
|
|
1137
|
+
_asset_fns.append(_asset_fn)
|
|
1138
|
+
|
|
1139
|
+
return _asset_fns
|