acryl-datahub 1.0.0rc13__py3-none-any.whl → 1.0.0rc14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-1.0.0rc13.dist-info → acryl_datahub-1.0.0rc14.dist-info}/METADATA +2524 -2524
- {acryl_datahub-1.0.0rc13.dist-info → acryl_datahub-1.0.0rc14.dist-info}/RECORD +43 -43
- datahub/_version.py +1 -1
- datahub/configuration/common.py +1 -1
- datahub/emitter/rest_emitter.py +165 -10
- datahub/ingestion/glossary/classification_mixin.py +1 -5
- datahub/ingestion/graph/client.py +6 -3
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -1
- datahub/ingestion/run/pipeline.py +2 -4
- datahub/ingestion/sink/datahub_rest.py +4 -0
- datahub/ingestion/source/common/subtypes.py +5 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
- datahub/ingestion/source/dbt/dbt_common.py +2 -4
- datahub/ingestion/source/dbt/dbt_tests.py +4 -8
- datahub/ingestion/source/dremio/dremio_api.py +1 -5
- datahub/ingestion/source/dremio/dremio_aspects.py +1 -4
- datahub/ingestion/source/dynamodb/dynamodb.py +1 -0
- datahub/ingestion/source/kafka_connect/common.py +1 -6
- datahub/ingestion/source/mlflow.py +338 -31
- datahub/ingestion/source/redshift/lineage.py +2 -2
- datahub/ingestion/source/redshift/lineage_v2.py +19 -7
- datahub/ingestion/source/redshift/profile.py +1 -1
- datahub/ingestion/source/redshift/query.py +14 -6
- datahub/ingestion/source/redshift/redshift.py +9 -5
- datahub/ingestion/source/redshift/redshift_schema.py +27 -7
- datahub/ingestion/source/sql/athena.py +6 -12
- datahub/ingestion/source/sql/hive.py +2 -6
- datahub/ingestion/source/sql/hive_metastore.py +2 -1
- datahub/ingestion/source/sql/sql_common.py +3 -9
- datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
- datahub/ingestion/source/superset.py +1 -3
- datahub/ingestion/source/tableau/tableau_common.py +1 -1
- datahub/lite/duckdb_lite.py +1 -3
- datahub/metadata/_schema_classes.py +31 -1
- datahub/metadata/schema.avsc +56 -4
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
- datahub/sdk/dataset.py +2 -2
- datahub/sql_parsing/sqlglot_utils.py +1 -4
- {acryl_datahub-1.0.0rc13.dist-info → acryl_datahub-1.0.0rc14.dist-info}/LICENSE +0 -0
- {acryl_datahub-1.0.0rc13.dist-info → acryl_datahub-1.0.0rc14.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.0.0rc13.dist-info → acryl_datahub-1.0.0rc14.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0rc13.dist-info → acryl_datahub-1.0.0rc14.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/redshift/profile.py
CHANGED
@@ -48,7 +48,7 @@ class RedshiftProfiler(GenericProfiler):
                 if not self.config.schema_pattern.allowed(schema):
                     continue
                 for table in tables[db].get(schema, {}):
-                    if table.is_external_table:
+                    if table.is_external_table() or self.report.is_shared_database:
                         if not self.config.profiling.profile_external_tables:
                             # Case 1: If user did not tell us to profile external tables, simply log this.
                             self.report.profiling_skipped_other[schema] += 1
datahub/ingestion/source/redshift/query.py
CHANGED
@@ -83,7 +83,9 @@ class RedshiftCommonQuery:
     # NOTE: Tables from shared database are not available in pg_catalog.pg_class
     @staticmethod
     def list_tables(
-        skip_external_tables: bool = False, is_shared_database: bool = False
+        database: str,
+        skip_external_tables: bool = False,
+        is_shared_database: bool = False,
     ) -> str:
         # NOTE: it looks like description is available only in pg_description
         # So this remains preferrred way
@@ -123,7 +125,7 @@ class RedshiftCommonQuery:
           AND n.nspname != 'information_schema'
        """

-        external_tables_query = """
+        external_tables_query = f"""
        SELECT 'EXTERNAL_TABLE' as tabletype,
            NULL AS "schema_oid",
            schemaname AS "schema",
@@ -142,10 +144,11 @@ class RedshiftCommonQuery:
            serde_parameters,
            NULL as table_description
        FROM pg_catalog.svv_external_tables
+       WHERE redshift_database_name='{database}'
        ORDER BY "schema",
            "relname"
        """
-        shared_database_tables_query = """
+        shared_database_tables_query = f"""
        SELECT table_type as tabletype,
            NULL AS "schema_oid",
            schema_name AS "schema",
@@ -164,6 +167,7 @@ class RedshiftCommonQuery:
            NULL as serde_parameters,
            NULL as table_description
        FROM svv_redshift_tables
+       WHERE database_name='{database}'
        ORDER BY "schema",
            "relname"
        """
@@ -175,9 +179,11 @@ class RedshiftCommonQuery:
         return f"{tables_query} UNION {external_tables_query}"

     @staticmethod
-    def list_columns(schema_name: str, is_shared_database: bool = False) -> str:
+    def list_columns(
+        database_name: str, schema_name: str, is_shared_database: bool = False
+    ) -> str:
         if is_shared_database:
-            return """
+            return f"""
                 SELECT
                   schema_name as "schema",
                   table_name as "table_name",
@@ -198,9 +204,10 @@ class RedshiftCommonQuery:
                   null as "table_oid"
                 FROM SVV_REDSHIFT_COLUMNS
                 WHERE 1 and schema = '{schema_name}'
+                AND database_name = '{database_name}'
                 ORDER BY "schema", "table_name", "attnum"
             """
-        return """
+        return f"""
             SELECT
               n.nspname as "schema",
               c.relname as "table_name",
@@ -275,6 +282,7 @@ class RedshiftCommonQuery:
               null as "table_oid"
             FROM SVV_EXTERNAL_COLUMNS
             WHERE 1 and schema = '{schema_name}'
+            AND redshift_database_name = '{database_name}'
             ORDER BY "schema", "table_name", "attnum"
             """
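Because the listing queries are now f-strings, the new `database` argument is interpolated directly into the `WHERE` clauses, scoping results to a single database (relevant for datashares, where `svv_redshift_tables` spans databases). A minimal usage sketch with illustrative values:

    from datahub.ingestion.source.redshift.query import RedshiftCommonQuery

    # Build the listing SQL scoped to one database; "dev" is an illustrative name.
    sql = RedshiftCommonQuery.list_tables(
        database="dev",
        skip_external_tables=False,
        is_shared_database=False,
    )
    # The external-tables branch of the UNION now carries the new filter:
    #   FROM pg_catalog.svv_external_tables
    #   WHERE redshift_database_name='dev'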
datahub/ingestion/source/redshift/redshift.py
CHANGED
@@ -366,7 +366,7 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):

         self.db = self.data_dictionary.get_database_details(connection, database)
         self.report.is_shared_database = (
-            self.db is not None and self.db.is_shared_database
+            self.db is not None and self.db.is_shared_database()
         )
         with self.report.new_stage(METADATA_EXTRACTION):
             self.db_tables[database] = defaultdict()
@@ -508,6 +508,7 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
         schema_columns: Dict[str, Dict[str, List[RedshiftColumn]]] = {}
         schema_columns[schema.name] = self.data_dictionary.get_columns_for_schema(
             conn=connection,
+            database=database,
             schema=schema,
             is_shared_database=self.report.is_shared_database,
         )
@@ -829,9 +830,12 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
             domain_config=self.config.domain,
         )

-    def cache_tables_and_views(self, connection: redshift_connector.Connection) -> None:
+    def cache_tables_and_views(
+        self, connection: redshift_connector.Connection, database: str
+    ) -> None:
         tables, views = self.data_dictionary.get_tables_and_views(
             conn=connection,
+            database=database,
             skip_external_tables=self.config.skip_external_tables,
             is_shared_database=self.report.is_shared_database,
         )
@@ -982,7 +986,7 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
             self.datashares_helper.to_platform_resource(list(outbound_shares))
         )

-        if self.db and self.db.is_shared_database:
+        if self.db and self.db.is_shared_database():
             inbound_share = self.db.get_inbound_share()
             if inbound_share is None:
                 self.report.warning(
@@ -996,8 +1000,8 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
         ):
             lineage_extractor.aggregator.add(known_lineage)

-        # TODO: distinguish between definition level lineage and audit log based lineage
-        #
+        # TODO: distinguish between definition level lineage and audit log based lineage.
+        # Definition level lineage should never be skipped
         if not self._should_ingest_lineage():
             return
datahub/ingestion/source/redshift/redshift_schema.py
CHANGED
@@ -42,7 +42,6 @@ class RedshiftTable(BaseTable):
     serde_parameters: Optional[str] = None
     last_altered: Optional[datetime] = None

-    @property
     def is_external_table(self) -> bool:
         return self.type == "EXTERNAL_TABLE"

@@ -56,7 +55,6 @@ class RedshiftView(BaseTable):
     size_in_bytes: Optional[int] = None
     rows_count: Optional[int] = None

-    @property
     def is_external_table(self) -> bool:
         return self.type == "EXTERNAL_TABLE"

@@ -71,10 +69,28 @@ class RedshiftSchema:
     external_platform: Optional[str] = None
     external_database: Optional[str] = None

-    @property
     def is_external_schema(self) -> bool:
         return self.type == "external"

+    def get_upstream_schema_name(self) -> Optional[str]:
+        """Gets the schema name from the external schema option.
+
+        Returns:
+            Optional[str]: The schema name from the external schema option
+            if this is an external schema and has a valid option format, None otherwise.
+        """
+
+        if not self.is_external_schema() or not self.option:
+            return None
+
+        # For external schema on redshift, option is in form
+        # {"SCHEMA":"tickit"}
+        schema_match = re.search(r'"SCHEMA"\s*:\s*"([^"]*)"', self.option)
+        if not schema_match:
+            return None
+        else:
+            return schema_match.group(1)
+

 @dataclass
 class PartialInboundDatashare:
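The regex in `get_upstream_schema_name` tolerates whitespace around the colon and captures the quoted value. A quick standalone illustration of the pattern against the documented option format:

    import re

    # Example option payload for a Redshift external schema, per the comment above.
    option = '{"SCHEMA":"tickit"}'

    schema_match = re.search(r'"SCHEMA"\s*:\s*"([^"]*)"', option)
    assert schema_match is not None
    assert schema_match.group(1) == "tickit"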
@@ -117,7 +133,6 @@ class RedshiftDatabase:
     type: str
     options: Optional[str] = None

-    @property
     def is_shared_database(self) -> bool:
         return self.type == "shared"

@@ -128,7 +143,7 @@ class RedshiftDatabase:
     def get_inbound_share(
         self,
     ) -> Optional[Union[InboundDatashare, PartialInboundDatashare]]:
-        if not self.is_shared_database or not self.options:
+        if not self.is_shared_database() or not self.options:
             return None

         # Convert into single regex ??
@@ -323,6 +338,7 @@ class RedshiftDataDictionary:
     def get_tables_and_views(
         self,
         conn: redshift_connector.Connection,
+        database: str,
         skip_external_tables: bool = False,
         is_shared_database: bool = False,
     ) -> Tuple[Dict[str, List[RedshiftTable]], Dict[str, List[RedshiftView]]]:
@@ -336,6 +352,7 @@ class RedshiftDataDictionary:
         cur = RedshiftDataDictionary.get_query_result(
             conn,
             RedshiftCommonQuery.list_tables(
+                database=database,
                 skip_external_tables=skip_external_tables,
                 is_shared_database=is_shared_database,
             ),
@@ -484,14 +501,17 @@ class RedshiftDataDictionary:
     @staticmethod
     def get_columns_for_schema(
         conn: redshift_connector.Connection,
+        database: str,
         schema: RedshiftSchema,
         is_shared_database: bool = False,
     ) -> Dict[str, List[RedshiftColumn]]:
         cursor = RedshiftDataDictionary.get_query_result(
             conn,
             RedshiftCommonQuery.list_columns(
-                schema_name=schema.name, is_shared_database=is_shared_database
-            ),
+                database_name=database,
+                schema_name=schema.name,
+                is_shared_database=is_shared_database,
+            ),
         )

         table_columns: Dict[str, List[RedshiftColumn]] = {}
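Dropping `@property` means every call site must move to the call form, as the hunks above do for `is_shared_database()` and earlier for `is_external_table()`: a bare reference to a plain method is an always-truthy bound-method object. A minimal sketch of the pitfall, using a hypothetical class rather than the real dataclasses:

    class Table:
        type: str = "TABLE"

        def is_external_table(self) -> bool:  # plain method, no @property
            return self.type == "EXTERNAL_TABLE"

    t = Table()
    assert t.is_external_table() is False  # correct: call the method
    assert bool(t.is_external_table) is True  # stale property-style access: always truthy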
datahub/ingestion/source/sql/athena.py
CHANGED
@@ -540,19 +540,13 @@ class AthenaSource(SQLAlchemySource):
                     inspector=inspector,
                     description=column.get("comment"),
                     nullable=column.get("nullable", True),
-                    is_part_of_key=(
-                        True
-                        if (
-                            pk_constraints is not None
-                            and isinstance(pk_constraints, dict)
-                            and column["name"] in pk_constraints.get("constrained_columns", [])
-                        )
-                        else False
+                    is_part_of_key=bool(
+                        pk_constraints is not None
+                        and isinstance(pk_constraints, dict)
+                        and column["name"] in pk_constraints.get("constrained_columns", [])
                     ),
-                    is_partitioning_key=(
-                        True
-                        if (partition_keys is not None and column["name"] in partition_keys)
-                        else False
+                    is_partitioning_key=bool(
+                        partition_keys is not None and column["name"] in partition_keys
                     ),
                 )
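The `True if ... else False` ternaries collapse to `bool(...)`, which is equivalent and is the form linters such as ruff suggest (SIM210). A tiny demonstration of the equivalence:

    partition_keys = ["ds"]
    name = "ds"

    old_style = True if (partition_keys is not None and name in partition_keys) else False
    new_style = bool(partition_keys is not None and name in partition_keys)
    assert old_style is True
    assert new_style is True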
datahub/ingestion/source/sql/hive.py
CHANGED
@@ -821,12 +821,8 @@ class HiveSource(TwoTierSQLAlchemySource):

         try:
             view_definition = inspector.get_view_definition(view, schema)
-            if view_definition is None:
-                view_definition = ""
-            else:
-                # Some dialects return a TextClause instead of a raw string,
-                # so we need to convert them to a string.
-                view_definition = str(view_definition)
+            # Some dialects return a TextClause instead of a raw string, so we need to convert them to a string.
+            view_definition = str(view_definition) if view_definition else ""
         except NotImplementedError:
             view_definition = ""
datahub/ingestion/source/sql/hive_metastore.py
CHANGED
@@ -893,8 +893,9 @@ class HiveMetastoreSource(SQLAlchemySource):
         return get_schema_fields_for_hive_column(
             column["col_name"],
             column["col_type"],
+            # column is actually an sqlalchemy.engine.row.LegacyRow, not a Dict and we cannot make column.get("col_description", "")
             description=(
-                column["col_description"] if "col_description" in column else ""
+                column["col_description"] if "col_description" in column else ""  # noqa: SIM401
             ),
             default_nullable=True,
         )
datahub/ingestion/source/sql/sql_common.py
CHANGED
@@ -1031,16 +1031,10 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
     def _get_view_definition(self, inspector: Inspector, schema: str, view: str) -> str:
         try:
             view_definition = inspector.get_view_definition(view, schema)
-            if view_definition is None:
-                view_definition = ""
-            else:
-                # Some dialects return a TextClause instead of a raw string,
-                # so we need to convert them to a string.
-                view_definition = str(view_definition)
+            # Some dialects return a TextClause instead of a raw string, so we need to convert them to a string.
+            return str(view_definition) if view_definition else ""
         except NotImplementedError:
-            view_definition = ""
-
-        return view_definition
+            return ""

     def _process_view(
         self,
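Both `HiveSource` and the base `SQLAlchemySource` now collapse the `None` and `TextClause` handling into a single guarded `str(...)` conversion. The same normalization in isolation, with a hypothetical helper name:

    from typing import Any

    def normalize_view_definition(view_definition: Any) -> str:
        # inspector.get_view_definition may return None, a plain string, or a
        # dialect-specific object such as a SQLAlchemy TextClause; coerce to str.
        return str(view_definition) if view_definition else ""

    assert normalize_view_definition(None) == ""
    assert normalize_view_definition("SELECT 1") == "SELECT 1"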
datahub/ingestion/source/state/stale_entity_removal_handler.py
CHANGED
@@ -114,14 +114,10 @@ class StaleEntityRemovalHandler(
         self.stateful_ingestion_config: Optional[StatefulStaleMetadataRemovalConfig] = (
             config.stateful_ingestion
         )
-        self.checkpointing_enabled: bool = (
-            True
-            if (
-                self.state_provider.is_stateful_ingestion_configured()
-                and self.stateful_ingestion_config
-                and self.stateful_ingestion_config.remove_stale_metadata
-            )
-            else False
+        self.checkpointing_enabled: bool = bool(
+            self.state_provider.is_stateful_ingestion_configured()
+            and self.stateful_ingestion_config
+            and self.stateful_ingestion_config.remove_stale_metadata
         )
         self._job_id = self._init_job_id()
         self._urns_to_skip: Set[str] = set()
datahub/ingestion/source/superset.py
CHANGED
@@ -431,9 +431,7 @@ class SupersetSource(StatefulIngestionSourceBase):
                     dashboard_data.get("owners", []),
                 )
             ),
-            "IsCertified": str(
-                True if dashboard_data.get("certified_by") else False
-            ).lower(),
+            "IsCertified": str(bool(dashboard_data.get("certified_by"))).lower(),
         }

         if dashboard_data.get("certified_by"):
datahub/ingestion/source/tableau/tableau_common.py
CHANGED
@@ -902,7 +902,7 @@ def get_unique_custom_sql(custom_sql_list: List[dict]) -> List[dict]:
             "name": custom_sql.get("name"),
             # We assume that this is unsupported custom sql if "actual tables that this query references"
             # are missing from api result.
-            "isUnsupportedCustomSql": True if not custom_sql.get("tables") else False,
+            "isUnsupportedCustomSql": not custom_sql.get("tables"),
             "query": custom_sql.get("query"),
             "connectionType": custom_sql.get("connectionType"),
             "columns": custom_sql.get("columns"),
datahub/lite/duckdb_lite.py
CHANGED
@@ -760,9 +760,7 @@ class DuckDBLite(DataHubLiteLocal[DuckDBLiteConfig]):
                 entity_id=[str(data_platform_urn), data_platform_instance],
             )
             self._create_edges_from_data_platform_instance(data_platform_instance_urn)
-        elif isinstance(aspect, ChartInfoClass) or isinstance(
-            aspect, DashboardInfoClass
-        ):
+        elif isinstance(aspect, (ChartInfoClass, DashboardInfoClass)):
             urn = Urn.from_string(entity_urn)
             self.add_edge(
                 entity_urn,
datahub/metadata/_schema_classes.py
CHANGED
@@ -9326,13 +9326,16 @@ class DataProcessInstanceInputClass(_Aspect):

     def __init__(self,
         inputs: List[str],
+        inputEdges: Union[None, List["EdgeClass"]]=None,
     ):
         super().__init__()

         self.inputs = inputs
+        self.inputEdges = inputEdges

     def _restore_defaults(self) -> None:
         self.inputs = list()
+        self.inputEdges = self.RECORD_SCHEMA.fields_dict["inputEdges"].default


     @property
@@ -9345,6 +9348,18 @@ class DataProcessInstanceInputClass(_Aspect):
         self._inner_dict['inputs'] = value


+    @property
+    def inputEdges(self) -> Union[None, List["EdgeClass"]]:
+        """Input assets consumed by the data process instance, with additional metadata.
+        Counts as lineage.
+        Will eventually deprecate the inputs field."""
+        return self._inner_dict.get('inputEdges')  # type: ignore
+
+    @inputEdges.setter
+    def inputEdges(self, value: Union[None, List["EdgeClass"]]) -> None:
+        self._inner_dict['inputEdges'] = value
+
+
 class DataProcessInstanceOutputClass(_Aspect):
     """Information about the outputs of a Data process"""

@@ -9355,18 +9370,21 @@ class DataProcessInstanceOutputClass(_Aspect):

     def __init__(self,
         outputs: List[str],
+        outputEdges: Union[None, List["EdgeClass"]]=None,
     ):
         super().__init__()

         self.outputs = outputs
+        self.outputEdges = outputEdges

     def _restore_defaults(self) -> None:
         self.outputs = list()
+        self.outputEdges = self.RECORD_SCHEMA.fields_dict["outputEdges"].default


     @property
     def outputs(self) -> List[str]:
-        """Output
+        """Output assets produced"""
         return self._inner_dict.get('outputs')  # type: ignore

     @outputs.setter
@@ -9374,6 +9392,18 @@ class DataProcessInstanceOutputClass(_Aspect):
         self._inner_dict['outputs'] = value


+    @property
+    def outputEdges(self) -> Union[None, List["EdgeClass"]]:
+        """Output assets produced by the data process instance during processing, with additional metadata.
+        Counts as lineage.
+        Will eventually deprecate the outputs field."""
+        return self._inner_dict.get('outputEdges')  # type: ignore
+
+    @outputEdges.setter
+    def outputEdges(self, value: Union[None, List["EdgeClass"]]) -> None:
+        self._inner_dict['outputEdges'] = value
+
+
 class DataProcessInstancePropertiesClass(_Aspect):
     """The inputs and outputs of this data process"""
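With the regenerated classes, `inputEdges` and `outputEdges` become optional constructor arguments alongside the legacy urn lists. A minimal usage sketch, assuming the generated `EdgeClass` that already exists in the same module:

    from datahub.metadata.schema_classes import (
        DataProcessInstanceInputClass,
        EdgeClass,
    )

    dataset_urn = "urn:li:dataset:(urn:li:dataPlatform:hive,db.table,PROD)"

    aspect = DataProcessInstanceInputClass(
        inputs=[dataset_urn],  # legacy field, still required
        inputEdges=[EdgeClass(destinationUrn=dataset_urn)],  # new lineage-bearing edges
    )
    assert aspect.inputEdges is not None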
datahub/metadata/schema.avsc
CHANGED
@@ -16749,8 +16749,6 @@
             "dataset",
             "mlModel"
           ],
-          "isLineage": true,
-          "isUpstream": false,
           "name": "Produces"
         }
       },
@@ -16770,7 +16768,35 @@
         "items": "string"
       },
       "name": "outputs",
-      "doc": "Output
+      "doc": "Output assets produced"
+    },
+    {
+      "Relationship": {
+        "/*/destinationUrn": {
+          "createdActor": "outputEdges/*/created/actor",
+          "createdOn": "outputEdges/*/created/time",
+          "entityTypes": [
+            "dataset",
+            "mlModel"
+          ],
+          "isLineage": true,
+          "isUpstream": false,
+          "name": "DataProcessInstanceProduces",
+          "properties": "outputEdges/*/properties",
+          "updatedActor": "outputEdges/*/lastModified/actor",
+          "updatedOn": "outputEdges/*/lastModified/time"
+        }
+      },
+      "type": [
+        "null",
+        {
+          "type": "array",
+          "items": "com.linkedin.pegasus2avro.common.Edge"
+        }
+      ],
+      "name": "outputEdges",
+      "default": null,
+      "doc": "Output assets produced by the data process instance during processing, with additional metadata.\nCounts as lineage.\nWill eventually deprecate the outputs field."
     }
   ],
   "doc": "Information about the outputs of a Data process"
@@ -16977,7 +17003,6 @@
             "dataset",
             "mlModel"
           ],
-          "isLineage": true,
           "name": "Consumes"
         }
       },
@@ -16998,6 +17023,33 @@
       },
       "name": "inputs",
       "doc": "Input assets consumed"
+    },
+    {
+      "Relationship": {
+        "/*/destinationUrn": {
+          "createdActor": "inputEdges/*/created/actor",
+          "createdOn": "inputEdges/*/created/time",
+          "entityTypes": [
+            "dataset",
+            "mlModel"
+          ],
+          "isLineage": true,
+          "name": "DataProcessInstanceConsumes",
+          "properties": "inputEdges/*/properties",
+          "updatedActor": "inputEdges/*/lastModified/actor",
+          "updatedOn": "inputEdges/*/lastModified/time"
+        }
+      },
+      "type": [
+        "null",
+        {
+          "type": "array",
+          "items": "com.linkedin.pegasus2avro.common.Edge"
+        }
+      ],
+      "name": "inputEdges",
+      "default": null,
+      "doc": "Input assets consumed by the data process instance, with additional metadata.\nCounts as lineage.\nWill eventually deprecate the inputs field."
     }
   ],
   "doc": "Information about the inputs datasets of a Data process"
datahub/metadata/schemas/DataProcessInstanceInput.avsc
CHANGED
@@ -13,7 +13,6 @@
             "dataset",
             "mlModel"
           ],
-          "isLineage": true,
           "name": "Consumes"
         }
       },
@@ -34,6 +33,135 @@
       "doc": "Input assets consumed",
       "Urn": "Urn",
       "urn_is_array": true
+    },
+    {
+      "Relationship": {
+        "/*/destinationUrn": {
+          "createdActor": "inputEdges/*/created/actor",
+          "createdOn": "inputEdges/*/created/time",
+          "entityTypes": [
+            "dataset",
+            "mlModel"
+          ],
+          "isLineage": true,
+          "name": "DataProcessInstanceConsumes",
+          "properties": "inputEdges/*/properties",
+          "updatedActor": "inputEdges/*/lastModified/actor",
+          "updatedOn": "inputEdges/*/lastModified/time"
+        }
+      },
+      "type": [
+        "null",
+        {
+          "type": "array",
+          "items": {
+            "type": "record",
+            "name": "Edge",
+            "namespace": "com.linkedin.pegasus2avro.common",
+            "fields": [
+              {
+                "java": {
+                  "class": "com.linkedin.pegasus2avro.common.urn.Urn"
+                },
+                "type": [
+                  "null",
+                  "string"
+                ],
+                "name": "sourceUrn",
+                "default": null,
+                "doc": "Urn of the source of this relationship edge.\nIf not specified, assumed to be the entity that this aspect belongs to.",
+                "Urn": "Urn"
+              },
+              {
+                "java": {
+                  "class": "com.linkedin.pegasus2avro.common.urn.Urn"
+                },
+                "type": "string",
+                "name": "destinationUrn",
+                "doc": "Urn of the destination of this relationship edge.",
+                "Urn": "Urn"
+              },
+              {
+                "type": [
+                  "null",
+                  {
+                    "type": "record",
+                    "name": "AuditStamp",
+                    "namespace": "com.linkedin.pegasus2avro.common",
+                    "fields": [
+                      {
+                        "type": "long",
+                        "name": "time",
+                        "doc": "When did the resource/association/sub-resource move into the specific lifecycle stage represented by this AuditEvent."
+                      },
+                      {
+                        "java": {
+                          "class": "com.linkedin.pegasus2avro.common.urn.Urn"
+                        },
+                        "type": "string",
+                        "name": "actor",
+                        "doc": "The entity (e.g. a member URN) which will be credited for moving the resource/association/sub-resource into the specific lifecycle stage. It is also the one used to authorize the change.",
+                        "Urn": "Urn"
+                      },
+                      {
+                        "java": {
+                          "class": "com.linkedin.pegasus2avro.common.urn.Urn"
+                        },
+                        "type": [
+                          "null",
+                          "string"
+                        ],
+                        "name": "impersonator",
+                        "default": null,
+                        "doc": "The entity (e.g. a service URN) which performs the change on behalf of the Actor and must be authorized to act as the Actor.",
+                        "Urn": "Urn"
+                      },
+                      {
+                        "type": [
+                          "null",
+                          "string"
+                        ],
+                        "name": "message",
+                        "default": null,
+                        "doc": "Additional context around how DataHub was informed of the particular change. For example: was the change created by an automated process, or manually."
+                      }
+                    ],
+                    "doc": "Data captured on a resource/association/sub-resource level giving insight into when that resource/association/sub-resource moved into a particular lifecycle stage, and who acted to move it into that specific lifecycle stage."
+                  }
+                ],
+                "name": "created",
+                "default": null,
+                "doc": "Audit stamp containing who created this relationship edge and when"
+              },
+              {
+                "type": [
+                  "null",
+                  "com.linkedin.pegasus2avro.common.AuditStamp"
+                ],
+                "name": "lastModified",
+                "default": null,
+                "doc": "Audit stamp containing who last modified this relationship edge and when"
+              },
+              {
+                "type": [
+                  "null",
+                  {
+                    "type": "map",
+                    "values": "string"
+                  }
+                ],
+                "name": "properties",
+                "default": null,
+                "doc": "A generic properties bag that allows us to store specific information on this graph edge."
+              }
+            ],
+            "doc": "A common structure to represent all edges to entities when used inside aspects as collections\nThis ensures that all edges have common structure around audit-stamps and will support PATCH, time-travel automatically."
+          }
+        }
+      ],
+      "name": "inputEdges",
+      "default": null,
+      "doc": "Input assets consumed by the data process instance, with additional metadata.\nCounts as lineage.\nWill eventually deprecate the inputs field."
     }
   ],
   "doc": "Information about the inputs datasets of a Data process"