acryl-datahub 1.2.0.6__py3-none-any.whl → 1.2.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/METADATA +2629 -2543
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/RECORD +83 -75
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/entry_points.txt +1 -0
- datahub/_version.py +1 -1
- datahub/api/graphql/operation.py +1 -1
- datahub/ingestion/autogenerated/capability_summary.json +46 -6
- datahub/ingestion/autogenerated/lineage.json +3 -2
- datahub/ingestion/run/pipeline.py +1 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +97 -5
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/common/subtypes.py +3 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +1 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +19 -8
- datahub/ingestion/source/dbt/dbt_common.py +74 -0
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_source.py +4 -0
- datahub/ingestion/source/dynamodb/dynamodb.py +10 -7
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/fivetran/fivetran_query.py +8 -1
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +33 -0
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +5 -0
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -1
- datahub/ingestion/source/redshift/config.py +9 -6
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/redshift.py +19 -106
- datahub/ingestion/source/s3/source.py +65 -59
- datahub/ingestion/source/snowflake/constants.py +2 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +10 -0
- datahub/ingestion/source/snowflake/snowflake_connection.py +16 -5
- datahub/ingestion/source/snowflake/snowflake_query.py +27 -0
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +179 -7
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +25 -7
- datahub/ingestion/source/snowflake/snowflake_summary.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_utils.py +18 -5
- datahub/ingestion/source/snowflake/snowflake_v2.py +6 -1
- datahub/ingestion/source/sql/hive_metastore.py +1 -0
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +62 -3
- datahub/ingestion/source/sql_queries.py +24 -2
- datahub/ingestion/source/state/checkpoint.py +3 -28
- datahub/ingestion/source/unity/config.py +74 -9
- datahub/ingestion/source/unity/proxy.py +167 -5
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +24 -0
- datahub/ingestion/source/unity/report.py +5 -0
- datahub/ingestion/source/unity/source.py +111 -1
- datahub/ingestion/source/usage/usage_common.py +1 -0
- datahub/metadata/_internal_schema_classes.py +573 -517
- datahub/metadata/_urns/urn_defs.py +1748 -1748
- datahub/metadata/schema.avsc +18564 -18484
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +9 -0
- datahub/metadata/schemas/InstitutionalMemory.avsc +9 -0
- datahub/metadata/schemas/LogicalParent.avsc +104 -100
- datahub/metadata/schemas/MetadataChangeEvent.avsc +81 -45
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +3 -1
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +3 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/chart.py +36 -22
- datahub/sdk/dashboard.py +38 -62
- datahub/sdk/lineage_client.py +6 -26
- datahub/sdk/main_client.py +7 -3
- datahub/sdk/search_filters.py +16 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/dataset.py +2 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +3 -0
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/upgrade/upgrade.py +14 -2
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/top_level.txt +0 -0
|
@@ -337,3 +337,27 @@ class Notebook:
|
|
|
337
337
|
"upstreams": frozenset([*notebook.upstreams, upstream]),
|
|
338
338
|
}
|
|
339
339
|
)
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
@dataclass
class Model:
    """Plain record describing one ML model.

    All fields are required positional dataclass fields (none has a default);
    the three ``Optional`` ones must still be passed explicitly, as ``None``
    when the value is unknown.
    """

    # Identifier of the model.
    # NOTE(review): presumably the fully qualified catalog.schema.model name — confirm against the proxy that builds it.
    id: str
    # Display name of the model.
    name: str
    # Name of the schema the model belongs to.
    schema_name: str
    # Name of the catalog the model belongs to.
    catalog_name: str
    # Free-text description, or None when absent.
    description: Optional[str]
    # Creation time, or None when unknown.
    created_at: Optional[datetime]
    # Last-update time, or None when unknown.
    updated_at: Optional[datetime]
|
|
351
|
+
|
|
352
|
+
|
|
353
|
+
@dataclass
class ModelVersion:
    """Plain record describing one version of an ML model.

    All fields are required positional dataclass fields (none has a default);
    the ``Optional`` ones must still be passed explicitly, as ``None`` when
    the value is unknown.
    """

    # Identifier of this model version.
    id: str
    # Display name of this model version.
    name: str
    # The parent model this version belongs to.
    model: Model
    # Version label of this entry.
    version: str
    # Alias names attached to this version, or None when not available.
    aliases: Optional[List[str]]
    # Free-text description, or None when absent.
    description: Optional[str]
    # Creation time, or None when unknown.
    created_at: Optional[datetime]
    # Last-update time, or None when unknown.
    updated_at: Optional[datetime]
    # Identity of the creator, or None when unknown.
    created_by: Optional[str]
|
|
@@ -31,6 +31,10 @@ class UnityCatalogReport(IngestionStageReport, SQLSourceReport):
|
|
|
31
31
|
tables: EntityFilterReport = EntityFilterReport.field(type="table/view")
|
|
32
32
|
table_profiles: EntityFilterReport = EntityFilterReport.field(type="table profile")
|
|
33
33
|
notebooks: EntityFilterReport = EntityFilterReport.field(type="notebook")
|
|
34
|
+
ml_models: EntityFilterReport = EntityFilterReport.field(type="ml_model")
|
|
35
|
+
ml_model_versions: EntityFilterReport = EntityFilterReport.field(
|
|
36
|
+
type="ml_model_version"
|
|
37
|
+
)
|
|
34
38
|
|
|
35
39
|
hive_metastore_catalog_found: Optional[bool] = None
|
|
36
40
|
|
|
@@ -64,6 +68,7 @@ class UnityCatalogReport(IngestionStageReport, SQLSourceReport):
|
|
|
64
68
|
num_catalogs_missing_name: int = 0
|
|
65
69
|
num_schemas_missing_name: int = 0
|
|
66
70
|
num_tables_missing_name: int = 0
|
|
71
|
+
num_ml_models_missing_name: int = 0
|
|
67
72
|
num_columns_missing_name: int = 0
|
|
68
73
|
num_queries_missing_info: int = 0
|
|
69
74
|
|
|
@@ -12,6 +12,7 @@ from datahub.emitter.mce_builder import (
|
|
|
12
12
|
make_dataset_urn_with_platform_instance,
|
|
13
13
|
make_domain_urn,
|
|
14
14
|
make_group_urn,
|
|
15
|
+
make_ml_model_group_urn,
|
|
15
16
|
make_schema_field_urn,
|
|
16
17
|
make_ts_millis,
|
|
17
18
|
make_user_urn,
|
|
@@ -26,6 +27,7 @@ from datahub.emitter.mcp_builder import (
|
|
|
26
27
|
UnitySchemaKey,
|
|
27
28
|
UnitySchemaKeyWithMetastore,
|
|
28
29
|
add_dataset_to_container,
|
|
30
|
+
add_entity_to_container,
|
|
29
31
|
gen_containers,
|
|
30
32
|
)
|
|
31
33
|
from datahub.emitter.sql_parsing_builder import SqlParsingBuilder
|
|
@@ -87,6 +89,8 @@ from datahub.ingestion.source.unity.proxy_types import (
|
|
|
87
89
|
CustomCatalogType,
|
|
88
90
|
HiveTableType,
|
|
89
91
|
Metastore,
|
|
92
|
+
Model,
|
|
93
|
+
ModelVersion,
|
|
90
94
|
Notebook,
|
|
91
95
|
NotebookId,
|
|
92
96
|
Schema,
|
|
@@ -121,6 +125,7 @@ from datahub.metadata.schema_classes import (
|
|
|
121
125
|
DatasetLineageTypeClass,
|
|
122
126
|
DatasetPropertiesClass,
|
|
123
127
|
DomainsClass,
|
|
128
|
+
MLModelPropertiesClass,
|
|
124
129
|
MySqlDDLClass,
|
|
125
130
|
NullTypeClass,
|
|
126
131
|
OwnerClass,
|
|
@@ -134,7 +139,8 @@ from datahub.metadata.schema_classes import (
|
|
|
134
139
|
UpstreamClass,
|
|
135
140
|
UpstreamLineageClass,
|
|
136
141
|
)
|
|
137
|
-
from datahub.metadata.urns import TagUrn
|
|
142
|
+
from datahub.metadata.urns import MlModelGroupUrn, MlModelUrn, TagUrn
|
|
143
|
+
from datahub.sdk import MLModel, MLModelGroup
|
|
138
144
|
from datahub.sql_parsing.schema_resolver import SchemaResolver
|
|
139
145
|
from datahub.sql_parsing.sqlglot_lineage import (
|
|
140
146
|
SqlParsingResult,
|
|
@@ -182,6 +188,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
182
188
|
- metastores
|
|
183
189
|
- schemas
|
|
184
190
|
- tables and column lineage
|
|
191
|
+
- model and model versions
|
|
185
192
|
"""
|
|
186
193
|
|
|
187
194
|
config: UnityCatalogSourceConfig
|
|
@@ -250,6 +257,18 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
250
257
|
else:
|
|
251
258
|
self.platform_resource_repository = None
|
|
252
259
|
|
|
260
|
+
if self.config._forced_disable_tag_extraction:
|
|
261
|
+
self.report.report_warning(
|
|
262
|
+
"Some features disabled because of configuration conflicts",
|
|
263
|
+
"Tag Extraction is disabled due to missing warehouse_id in config",
|
|
264
|
+
)
|
|
265
|
+
|
|
266
|
+
if self.config._forced_disable_hive_metastore_extraction:
|
|
267
|
+
self.report.report_warning(
|
|
268
|
+
"Some features disabled because of configuration conflicts",
|
|
269
|
+
"Hive Metastore Extraction is disabled due to missing warehouse_id in config",
|
|
270
|
+
)
|
|
271
|
+
|
|
253
272
|
# Include platform resource repository in report for automatic cache statistics
|
|
254
273
|
if self.config.include_tags and self.platform_resource_repository:
|
|
255
274
|
self.report.tag_urn_resolver_cache = self.platform_resource_repository
|
|
@@ -512,6 +531,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
512
531
|
yield from self.gen_schema_containers(schema)
|
|
513
532
|
try:
|
|
514
533
|
yield from self.process_tables(schema)
|
|
534
|
+
yield from self.process_ml_models(schema)
|
|
515
535
|
except Exception as e:
|
|
516
536
|
logger.exception(f"Error parsing schema {schema}")
|
|
517
537
|
self.report.report_warning(
|
|
@@ -665,6 +685,69 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
665
685
|
)
|
|
666
686
|
]
|
|
667
687
|
|
|
688
|
+
def process_ml_models(self, schema: Schema) -> Iterable[MetadataWorkUnit]:
    """Yield work units for every ML model in ``schema`` and for each of its versions.

    Models are listed through ``self.unity_catalog_api_proxy.ml_models``,
    capped by ``self.config.ml_model_max_results``. For each model the
    model-level work units are yielded first, followed by the work units of
    each version returned by ``ml_model_versions``.

    Args:
        schema: The schema whose models should be processed.

    Yields:
        Work units produced by ``process_ml_model`` and
        ``process_ml_model_version``.
    """
    for ml_model in self.unity_catalog_api_proxy.ml_models(
        schema=schema, max_results=self.config.ml_model_max_results
    ):
        yield from self.process_ml_model(ml_model, schema)
        # Every version of this model is linked to the same group URN.
        ml_model_urn = self.gen_ml_model_urn(ml_model.id)
        for ml_model_version in self.unity_catalog_api_proxy.ml_model_versions(
            ml_model, include_aliases=self.config.include_ml_model_aliases
        ):
            yield from self.process_ml_model_version(
                ml_model_urn, ml_model_version, schema
            )
|
|
700
|
+
|
|
701
|
+
def process_ml_model(
    self, ml_model: Model, schema: Schema
) -> Iterable[MetadataWorkUnit]:
    """Yield the work units that represent one model as an ``MLModelGroup``.

    The group's own work units are yielded first, then the work units that
    attach the group to the schema's container. The model is recorded in
    ``self.report.ml_models`` only after both have been fully consumed.

    Args:
        ml_model: The model to emit.
        schema: The schema that contains the model.

    Yields:
        Work units for the model group and its container membership.
    """
    # NOTE(review): platform_instance is set to the schema name here, while
    # gen_ml_model_urn builds the group URN without any platform instance.
    # If MLModelGroup folds platform_instance into its URN, model versions
    # would reference a different group URN than the one emitted here —
    # confirm against the SDK.
    ml_model_group = MLModelGroup(
        id=ml_model.id,
        name=ml_model.name,
        platform=self.platform,
        platform_instance=schema.name,
        env=self.config.env,
        description=ml_model.description,
        created=ml_model.created_at,
        last_modified=ml_model.updated_at,
    )
    yield from ml_model_group.as_workunits()
    yield from self.add_model_to_schema_container(str(ml_model_group.urn), schema)
    self.report.ml_models.processed(ml_model.id)
|
|
717
|
+
|
|
718
|
+
def process_ml_model_version(
    self, ml_model_urn: str, ml_model_version: ModelVersion, schema: Schema
) -> Iterable[MetadataWorkUnit]:
    """Yield the work units that represent one model version as an ``MLModel``.

    When the version has a creation time, an ``MLModelPropertiesClass`` aspect
    carrying that timestamp (epoch milliseconds) is passed along as an extra
    aspect. The version is recorded in ``self.report.ml_model_versions`` only
    after all work units have been consumed.

    Args:
        ml_model_urn: URN of the model group this version belongs to.
        ml_model_version: The model version to emit.
        schema: The schema that contains the model version.

    Yields:
        Work units for the model version and its container membership.
    """
    extra_aspects = []
    if ml_model_version.created_at is not None:
        # Convert seconds since the epoch to integer milliseconds.
        created_time = int(ml_model_version.created_at.timestamp() * 1000)
        # NOTE(review): the creator is wrapped in a platformResource URN
        # rather than a user URN — confirm this is the intended actor type.
        created_actor = (
            f"urn:li:platformResource:{ml_model_version.created_by}"
            if ml_model_version.created_by
            else None
        )
        extra_aspects.append(
            MLModelPropertiesClass(
                created=TimeStampClass(time=created_time, actor=created_actor),
            )
        )

    ml_model = MLModel(
        id=ml_model_version.id,
        name=ml_model_version.name,
        version=str(ml_model_version.version),
        aliases=ml_model_version.aliases,
        description=ml_model_version.description,
        model_group=ml_model_urn,
        platform=self.platform,
        last_modified=ml_model_version.updated_at,
        extra_aspects=extra_aspects,
    )

    yield from ml_model.as_workunits()
    yield from self.add_model_version_to_schema_container(str(ml_model.urn), schema)
    self.report.ml_model_versions.processed(ml_model_version.id)
|
|
750
|
+
|
|
668
751
|
def ingest_lineage(self, table: Table) -> Optional[UpstreamLineageClass]:
|
|
669
752
|
# Calculate datetime filters for lineage
|
|
670
753
|
lineage_start_time = None
|
|
@@ -802,6 +885,13 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
802
885
|
env=self.config.env,
|
|
803
886
|
)
|
|
804
887
|
|
|
888
|
+
def gen_ml_model_urn(self, name: str) -> str:
    """Build the model *group* URN for ``name``.

    Despite the method name, this delegates to ``make_ml_model_group_urn``,
    so the result identifies a model group rather than a single model.

    Args:
        name: Group name to embed in the URN.

    Returns:
        The URN built from ``self.platform``, ``name`` and ``self.config.env``.
    """
    return make_ml_model_group_urn(
        platform=self.platform,
        group_name=name,
        env=self.config.env,
    )
|
|
894
|
+
|
|
805
895
|
def gen_notebook_urn(self, notebook: Union[Notebook, NotebookId]) -> str:
|
|
806
896
|
notebook_id = notebook.id if isinstance(notebook, Notebook) else notebook
|
|
807
897
|
return NotebookKey(
|
|
@@ -973,6 +1063,26 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
|
|
|
973
1063
|
dataset_urn=dataset_urn,
|
|
974
1064
|
)
|
|
975
1065
|
|
|
1066
|
+
def add_model_to_schema_container(
    self, model_urn: str, schema: Schema
) -> Iterable[MetadataWorkUnit]:
    """Yield work units that place a model group inside its schema container.

    Args:
        model_urn: URN of the model group to attach.
        schema: The schema whose container key is used as the parent.

    Yields:
        Work units produced by ``add_entity_to_container``.
    """
    schema_container_key = self.gen_schema_key(schema)
    yield from add_entity_to_container(
        container_key=schema_container_key,
        entity_type=MlModelGroupUrn.ENTITY_TYPE,
        entity_urn=model_urn,
    )
|
|
1075
|
+
|
|
1076
|
+
def add_model_version_to_schema_container(
    self, model_version_urn: str, schema: Schema
) -> Iterable[MetadataWorkUnit]:
    """Yield work units that place a model version inside its schema container.

    Args:
        model_version_urn: URN of the model version to attach.
        schema: The schema whose container key is used as the parent.

    Yields:
        Work units produced by ``add_entity_to_container``.
    """
    schema_container_key = self.gen_schema_key(schema)
    yield from add_entity_to_container(
        container_key=schema_container_key,
        entity_type=MlModelUrn.ENTITY_TYPE,
        entity_urn=model_version_urn,
    )
|
|
1085
|
+
|
|
976
1086
|
def _get_catalog_tags(
|
|
977
1087
|
self, catalog: str, schema: str, table: str
|
|
978
1088
|
) -> List[UnityCatalogTag]:
|