acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/METADATA +2558 -2531
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/RECORD +221 -187
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/entry_points.txt +2 -0
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +1 -1
- datahub/api/entities/external/__init__.py +0 -0
- datahub/api/entities/external/external_entities.py +239 -0
- datahub/api/entities/external/external_tag.py +145 -0
- datahub/api/entities/external/lake_formation_external_entites.py +161 -0
- datahub/api/entities/external/restricted_text.py +247 -0
- datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
- datahub/cli/check_cli.py +88 -7
- datahub/cli/cli_utils.py +63 -0
- datahub/cli/container_cli.py +5 -0
- datahub/cli/delete_cli.py +124 -27
- datahub/cli/docker_check.py +107 -12
- datahub/cli/docker_cli.py +149 -227
- datahub/cli/exists_cli.py +0 -2
- datahub/cli/get_cli.py +0 -2
- datahub/cli/iceberg_cli.py +5 -0
- datahub/cli/ingest_cli.py +3 -15
- datahub/cli/migrate.py +2 -0
- datahub/cli/put_cli.py +1 -4
- datahub/cli/quickstart_versioning.py +50 -7
- datahub/cli/specific/assertions_cli.py +0 -4
- datahub/cli/specific/datacontract_cli.py +0 -3
- datahub/cli/specific/dataproduct_cli.py +0 -11
- datahub/cli/specific/dataset_cli.py +1 -8
- datahub/cli/specific/forms_cli.py +0 -4
- datahub/cli/specific/group_cli.py +0 -2
- datahub/cli/specific/structuredproperties_cli.py +1 -4
- datahub/cli/specific/user_cli.py +0 -2
- datahub/cli/state_cli.py +0 -2
- datahub/cli/timeline_cli.py +0 -2
- datahub/emitter/rest_emitter.py +70 -12
- datahub/entrypoints.py +4 -3
- datahub/ingestion/api/decorators.py +15 -3
- datahub/ingestion/api/report.py +332 -3
- datahub/ingestion/api/sink.py +3 -0
- datahub/ingestion/api/source.py +48 -44
- datahub/ingestion/autogenerated/__init__.py +0 -0
- datahub/ingestion/autogenerated/capability_summary.json +3449 -0
- datahub/ingestion/autogenerated/lineage.json +401 -0
- datahub/ingestion/autogenerated/lineage_helper.py +177 -0
- datahub/ingestion/extractor/schema_util.py +13 -4
- datahub/ingestion/glossary/classification_mixin.py +5 -0
- datahub/ingestion/graph/client.py +100 -15
- datahub/ingestion/graph/config.py +1 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
- datahub/ingestion/run/pipeline.py +54 -2
- datahub/ingestion/sink/datahub_rest.py +13 -0
- datahub/ingestion/source/abs/source.py +1 -1
- datahub/ingestion/source/aws/aws_common.py +4 -0
- datahub/ingestion/source/aws/glue.py +489 -244
- datahub/ingestion/source/aws/tag_entities.py +292 -0
- datahub/ingestion/source/azure/azure_common.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
- datahub/ingestion/source/bigquery_v2/common.py +1 -1
- datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
- datahub/ingestion/source/bigquery_v2/queries.py +3 -3
- datahub/ingestion/source/cassandra/cassandra.py +1 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
- datahub/ingestion/source/common/subtypes.py +45 -0
- datahub/ingestion/source/data_lake_common/object_store.py +115 -27
- datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
- datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
- datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
- datahub/ingestion/source/dbt/dbt_common.py +6 -2
- datahub/ingestion/source/dbt/dbt_core.py +3 -0
- datahub/ingestion/source/debug/__init__.py +0 -0
- datahub/ingestion/source/debug/datahub_debug.py +300 -0
- datahub/ingestion/source/dremio/dremio_api.py +114 -73
- datahub/ingestion/source/dremio/dremio_config.py +2 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
- datahub/ingestion/source/dremio/dremio_source.py +94 -81
- datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
- datahub/ingestion/source/file.py +3 -0
- datahub/ingestion/source/fivetran/fivetran.py +34 -26
- datahub/ingestion/source/gcs/gcs_source.py +13 -2
- datahub/ingestion/source/ge_data_profiler.py +76 -28
- datahub/ingestion/source/ge_profiling_config.py +11 -0
- datahub/ingestion/source/hex/api.py +26 -1
- datahub/ingestion/source/iceberg/iceberg.py +3 -1
- datahub/ingestion/source/identity/azure_ad.py +1 -1
- datahub/ingestion/source/identity/okta.py +1 -14
- datahub/ingestion/source/kafka/kafka.py +16 -0
- datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
- datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
- datahub/ingestion/source/looker/looker_source.py +1 -0
- datahub/ingestion/source/mlflow.py +11 -1
- datahub/ingestion/source/mock_data/__init__.py +0 -0
- datahub/ingestion/source/mock_data/datahub_mock_data.py +472 -0
- datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
- datahub/ingestion/source/mock_data/table_naming_helper.py +91 -0
- datahub/ingestion/source/nifi.py +1 -1
- datahub/ingestion/source/powerbi/powerbi.py +1 -5
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
- datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
- datahub/ingestion/source/preset.py +2 -2
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +21 -1
- datahub/ingestion/source/redshift/usage.py +4 -3
- datahub/ingestion/source/s3/report.py +4 -2
- datahub/ingestion/source/s3/source.py +367 -115
- datahub/ingestion/source/sac/sac.py +3 -1
- datahub/ingestion/source/salesforce.py +6 -3
- datahub/ingestion/source/sigma/sigma.py +7 -1
- datahub/ingestion/source/slack/slack.py +2 -1
- datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
- datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
- datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
- datahub/ingestion/source/snowflake/snowflake_v2.py +16 -2
- datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
- datahub/ingestion/source/sql/athena.py +119 -11
- datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
- datahub/ingestion/source/sql/clickhouse.py +3 -1
- datahub/ingestion/source/sql/cockroachdb.py +0 -1
- datahub/ingestion/source/sql/hana.py +3 -1
- datahub/ingestion/source/sql/hive_metastore.py +3 -11
- datahub/ingestion/source/sql/mariadb.py +0 -1
- datahub/ingestion/source/sql/mssql/source.py +239 -34
- datahub/ingestion/source/sql/mysql.py +0 -1
- datahub/ingestion/source/sql/oracle.py +1 -1
- datahub/ingestion/source/sql/postgres.py +0 -1
- datahub/ingestion/source/sql/sql_common.py +121 -34
- datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
- datahub/ingestion/source/sql/teradata.py +997 -235
- datahub/ingestion/source/sql/vertica.py +10 -6
- datahub/ingestion/source/sql_queries.py +2 -2
- datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
- datahub/ingestion/source/superset.py +58 -3
- datahub/ingestion/source/tableau/tableau.py +58 -37
- datahub/ingestion/source/tableau/tableau_common.py +4 -2
- datahub/ingestion/source/tableau/tableau_constant.py +0 -4
- datahub/ingestion/source/unity/config.py +5 -0
- datahub/ingestion/source/unity/proxy.py +118 -0
- datahub/ingestion/source/unity/source.py +195 -17
- datahub/ingestion/source/unity/tag_entities.py +295 -0
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
- datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
- datahub/integrations/assertion/snowflake/compiler.py +4 -3
- datahub/metadata/_internal_schema_classes.py +1433 -546
- datahub/metadata/_urns/urn_defs.py +1826 -1658
- datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
- datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +27 -0
- datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
- datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
- datahub/metadata/schema.avsc +17736 -17112
- datahub/metadata/schemas/ApplicationKey.avsc +31 -0
- datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
- datahub/metadata/schemas/Applications.avsc +38 -0
- datahub/metadata/schemas/ChartKey.avsc +1 -0
- datahub/metadata/schemas/ContainerKey.avsc +1 -0
- datahub/metadata/schemas/ContainerProperties.avsc +8 -0
- datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
- datahub/metadata/schemas/DashboardKey.avsc +1 -0
- datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +200 -0
- datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
- datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
- datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
- datahub/metadata/schemas/DataJobInfo.avsc +8 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessKey.avsc +8 -0
- datahub/metadata/schemas/DataProductKey.avsc +1 -0
- datahub/metadata/schemas/DataProductProperties.avsc +1 -1
- datahub/metadata/schemas/DatasetKey.avsc +11 -1
- datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
- datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
- datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
- datahub/metadata/schemas/LogicalParent.avsc +140 -0
- datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
- datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
- datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
- datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
- datahub/metadata/schemas/MLModelKey.avsc +9 -0
- datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
- datahub/metadata/schemas/NotebookKey.avsc +1 -0
- datahub/metadata/schemas/QuerySubjects.avsc +1 -12
- datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +2 -0
- datahub/sdk/_all_entities.py +7 -0
- datahub/sdk/_shared.py +116 -0
- datahub/sdk/chart.py +315 -0
- datahub/sdk/container.py +7 -0
- datahub/sdk/dashboard.py +432 -0
- datahub/sdk/dataflow.py +7 -0
- datahub/sdk/datajob.py +45 -13
- datahub/sdk/dataset.py +8 -2
- datahub/sdk/entity_client.py +82 -2
- datahub/sdk/lineage_client.py +683 -82
- datahub/sdk/main_client.py +46 -16
- datahub/sdk/mlmodel.py +101 -38
- datahub/sdk/mlmodelgroup.py +7 -0
- datahub/sdk/search_client.py +4 -3
- datahub/specific/chart.py +1 -1
- datahub/specific/dataproduct.py +4 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
- datahub/sql_parsing/sqlglot_lineage.py +62 -13
- datahub/telemetry/telemetry.py +17 -11
- datahub/testing/sdk_v2_helpers.py +7 -1
- datahub/upgrade/upgrade.py +46 -13
- datahub/utilities/server_config_util.py +8 -0
- datahub/utilities/sqlalchemy_query_combiner.py +5 -2
- datahub/utilities/stats_collections.py +4 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.dist-info}/top_level.txt +0 -0
datahub/sdk/main_client.py
CHANGED
|
@@ -7,16 +7,8 @@ from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
|
|
|
7
7
|
from datahub.ingestion.graph.config import ClientMode, DatahubClientConfig
|
|
8
8
|
from datahub.sdk.entity_client import EntityClient
|
|
9
9
|
from datahub.sdk.lineage_client import LineageClient
|
|
10
|
-
from datahub.sdk.resolver_client import ResolverClient
|
|
11
10
|
from datahub.sdk.search_client import SearchClient
|
|
12
11
|
|
|
13
|
-
try:
|
|
14
|
-
from acryl_datahub_cloud._sdk_extras import ( # type: ignore[import-not-found]
|
|
15
|
-
AssertionsClient,
|
|
16
|
-
)
|
|
17
|
-
except ImportError:
|
|
18
|
-
AssertionsClient = None
|
|
19
|
-
|
|
20
12
|
|
|
21
13
|
class DataHubClient:
|
|
22
14
|
"""Main client for interacting with DataHub.
|
|
@@ -74,7 +66,12 @@ class DataHubClient:
|
|
|
74
66
|
self._graph.test_connection()
|
|
75
67
|
|
|
76
68
|
@classmethod
|
|
77
|
-
def from_env(
|
|
69
|
+
def from_env(
|
|
70
|
+
cls,
|
|
71
|
+
*,
|
|
72
|
+
client_mode: ClientMode = ClientMode.SDK,
|
|
73
|
+
datahub_component: Optional[str] = None,
|
|
74
|
+
) -> "DataHubClient":
|
|
78
75
|
"""Initialize a DataHubClient from the environment variables or ~/.datahubenv file.
|
|
79
76
|
|
|
80
77
|
This will first check DATAHUB_GMS_URL and DATAHUB_GMS_TOKEN. If not present,
|
|
@@ -84,6 +81,10 @@ class DataHubClient:
|
|
|
84
81
|
If you're looking to specify the server/token in code, use the
|
|
85
82
|
DataHubClient(server=..., token=...) constructor instead.
|
|
86
83
|
|
|
84
|
+
Args:
|
|
85
|
+
client_mode: [internal] The client mode to use. Defaults to "SDK".
|
|
86
|
+
datahub_component: [internal] The DataHub component name to include in the user agent.
|
|
87
|
+
|
|
87
88
|
Returns:
|
|
88
89
|
A DataHubClient instance.
|
|
89
90
|
"""
|
|
@@ -91,7 +92,10 @@ class DataHubClient:
|
|
|
91
92
|
# Inspired by the DockerClient.from_env() method.
|
|
92
93
|
# TODO: This one also reads from ~/.datahubenv, so the "from_env" name might be a bit confusing.
|
|
93
94
|
# That file is part of the "environment", but is not a traditional "env variable".
|
|
94
|
-
graph = get_default_graph(
|
|
95
|
+
graph = get_default_graph(
|
|
96
|
+
client_mode=client_mode,
|
|
97
|
+
datahub_component=datahub_component,
|
|
98
|
+
)
|
|
95
99
|
|
|
96
100
|
return cls(graph=graph)
|
|
97
101
|
|
|
@@ -100,7 +104,15 @@ class DataHubClient:
|
|
|
100
104
|
return EntityClient(self)
|
|
101
105
|
|
|
102
106
|
@property
|
|
103
|
-
def resolve(self)
|
|
107
|
+
def resolve(self): # type: ignore[report-untyped-call] # Not available due to circular import issues
|
|
108
|
+
try:
|
|
109
|
+
from acryl_datahub_cloud.sdk import ( # type: ignore[import-not-found]
|
|
110
|
+
ResolverClient,
|
|
111
|
+
)
|
|
112
|
+
except ImportError:
|
|
113
|
+
from datahub.sdk.resolver_client import ( # type: ignore[assignment] # If the client is not installed, use the one from the SDK
|
|
114
|
+
ResolverClient,
|
|
115
|
+
)
|
|
104
116
|
return ResolverClient(self)
|
|
105
117
|
|
|
106
118
|
@property
|
|
@@ -112,9 +124,27 @@ class DataHubClient:
|
|
|
112
124
|
return LineageClient(self)
|
|
113
125
|
|
|
114
126
|
@property
|
|
115
|
-
def assertions(self)
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
)
|
|
127
|
+
def assertions(self): # type: ignore[report-untyped-call] # Not available due to circular import issues
|
|
128
|
+
try:
|
|
129
|
+
from acryl_datahub_cloud.sdk import AssertionsClient
|
|
130
|
+
except ImportError as e:
|
|
131
|
+
if "acryl_datahub_cloud" in str(e):
|
|
132
|
+
raise SdkUsageError(
|
|
133
|
+
"AssertionsClient is not installed, please install it with `pip install acryl-datahub-cloud`"
|
|
134
|
+
) from e
|
|
135
|
+
else:
|
|
136
|
+
raise e
|
|
120
137
|
return AssertionsClient(self)
|
|
138
|
+
|
|
139
|
+
@property
|
|
140
|
+
def subscriptions(self): # type: ignore[report-untyped-call] # Not available due to circular import issues
|
|
141
|
+
try:
|
|
142
|
+
from acryl_datahub_cloud.sdk import SubscriptionClient
|
|
143
|
+
except ImportError as e:
|
|
144
|
+
if "acryl_datahub_cloud" in str(e):
|
|
145
|
+
raise SdkUsageError(
|
|
146
|
+
"SubscriptionClient is not installed, please install it with `pip install acryl-datahub-cloud`"
|
|
147
|
+
) from e
|
|
148
|
+
else:
|
|
149
|
+
raise e
|
|
150
|
+
return SubscriptionClient(self)
|
datahub/sdk/mlmodel.py
CHANGED
|
@@ -24,6 +24,7 @@ from datahub.sdk._shared import (
|
|
|
24
24
|
HasInstitutionalMemory,
|
|
25
25
|
HasOwnership,
|
|
26
26
|
HasPlatformInstance,
|
|
27
|
+
HasStructuredProperties,
|
|
27
28
|
HasTags,
|
|
28
29
|
HasTerms,
|
|
29
30
|
HasVersion,
|
|
@@ -31,6 +32,7 @@ from datahub.sdk._shared import (
|
|
|
31
32
|
LinksInputType,
|
|
32
33
|
MLTrainingJobInputType,
|
|
33
34
|
OwnersInputType,
|
|
35
|
+
StructuredPropertyInputType,
|
|
34
36
|
TagsInputType,
|
|
35
37
|
TermsInputType,
|
|
36
38
|
TrainingMetricsInputType,
|
|
@@ -50,6 +52,7 @@ class MLModel(
|
|
|
50
52
|
HasTerms,
|
|
51
53
|
HasDomain,
|
|
52
54
|
HasVersion,
|
|
55
|
+
HasStructuredProperties,
|
|
53
56
|
Entity,
|
|
54
57
|
):
|
|
55
58
|
__slots__ = ()
|
|
@@ -82,53 +85,43 @@ class MLModel(
|
|
|
82
85
|
model_group: Optional[Union[str, MlModelGroupUrn]] = None,
|
|
83
86
|
training_jobs: Optional[MLTrainingJobInputType] = None,
|
|
84
87
|
downstream_jobs: Optional[MLTrainingJobInputType] = None,
|
|
88
|
+
structured_properties: Optional[StructuredPropertyInputType] = None,
|
|
85
89
|
extra_aspects: ExtraAspectsType = None,
|
|
86
90
|
):
|
|
87
91
|
urn = MlModelUrn(platform=platform, name=id, env=env)
|
|
88
92
|
super().__init__(urn)
|
|
89
93
|
self._set_extra_aspects(extra_aspects)
|
|
90
|
-
|
|
91
94
|
self._set_platform_instance(urn.platform, platform_instance)
|
|
92
|
-
|
|
93
95
|
self._ensure_model_props()
|
|
94
96
|
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
self.set_hyper_params(hyper_params)
|
|
107
|
-
if external_url is not None:
|
|
108
|
-
self.set_external_url(external_url)
|
|
109
|
-
if custom_properties is not None:
|
|
110
|
-
self.set_custom_properties(custom_properties)
|
|
111
|
-
if created is not None:
|
|
112
|
-
self.set_created(created)
|
|
113
|
-
if last_modified is not None:
|
|
114
|
-
self.set_last_modified(last_modified)
|
|
97
|
+
# Initialize properties in logical groups
|
|
98
|
+
self._init_basic_properties(
|
|
99
|
+
version=version,
|
|
100
|
+
name=name,
|
|
101
|
+
aliases=aliases,
|
|
102
|
+
description=description,
|
|
103
|
+
external_url=external_url,
|
|
104
|
+
custom_properties=custom_properties,
|
|
105
|
+
created=created,
|
|
106
|
+
last_modified=last_modified,
|
|
107
|
+
)
|
|
115
108
|
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
109
|
+
self._init_ml_specific_properties(
|
|
110
|
+
training_metrics=training_metrics,
|
|
111
|
+
hyper_params=hyper_params,
|
|
112
|
+
model_group=model_group,
|
|
113
|
+
training_jobs=training_jobs,
|
|
114
|
+
downstream_jobs=downstream_jobs,
|
|
115
|
+
)
|
|
116
|
+
|
|
117
|
+
self._init_metadata_properties(
|
|
118
|
+
owners=owners,
|
|
119
|
+
links=links,
|
|
120
|
+
tags=tags,
|
|
121
|
+
terms=terms,
|
|
122
|
+
domain=domain,
|
|
123
|
+
structured_properties=structured_properties,
|
|
124
|
+
)
|
|
132
125
|
|
|
133
126
|
@classmethod
|
|
134
127
|
def _new_from_graph(cls, urn: Urn, current_aspects: AspectBag) -> Self:
|
|
@@ -299,3 +292,73 @@ class MLModel(
|
|
|
299
292
|
props.downstreamJobs = [
|
|
300
293
|
job for job in props.downstreamJobs if job != job_str
|
|
301
294
|
]
|
|
295
|
+
|
|
296
|
+
def _init_basic_properties(
|
|
297
|
+
self,
|
|
298
|
+
version: Optional[str] = None,
|
|
299
|
+
name: Optional[str] = None,
|
|
300
|
+
aliases: Optional[List[str]] = None,
|
|
301
|
+
description: Optional[str] = None,
|
|
302
|
+
external_url: Optional[str] = None,
|
|
303
|
+
custom_properties: Optional[Dict[str, str]] = None,
|
|
304
|
+
created: Optional[datetime] = None,
|
|
305
|
+
last_modified: Optional[datetime] = None,
|
|
306
|
+
) -> None:
|
|
307
|
+
if version is not None:
|
|
308
|
+
self.set_version(version)
|
|
309
|
+
if name is not None:
|
|
310
|
+
self.set_name(name)
|
|
311
|
+
if aliases is not None:
|
|
312
|
+
self.set_version_aliases(aliases)
|
|
313
|
+
if description is not None:
|
|
314
|
+
self.set_description(description)
|
|
315
|
+
if external_url is not None:
|
|
316
|
+
self.set_external_url(external_url)
|
|
317
|
+
if custom_properties is not None:
|
|
318
|
+
self.set_custom_properties(custom_properties)
|
|
319
|
+
if created is not None:
|
|
320
|
+
self.set_created(created)
|
|
321
|
+
if last_modified is not None:
|
|
322
|
+
self.set_last_modified(last_modified)
|
|
323
|
+
|
|
324
|
+
def _init_ml_specific_properties(
|
|
325
|
+
self,
|
|
326
|
+
training_metrics: Optional[TrainingMetricsInputType] = None,
|
|
327
|
+
hyper_params: Optional[HyperParamsInputType] = None,
|
|
328
|
+
model_group: Optional[Union[str, MlModelGroupUrn]] = None,
|
|
329
|
+
training_jobs: Optional[MLTrainingJobInputType] = None,
|
|
330
|
+
downstream_jobs: Optional[MLTrainingJobInputType] = None,
|
|
331
|
+
) -> None:
|
|
332
|
+
if training_metrics is not None:
|
|
333
|
+
self.set_training_metrics(training_metrics)
|
|
334
|
+
if hyper_params is not None:
|
|
335
|
+
self.set_hyper_params(hyper_params)
|
|
336
|
+
if model_group is not None:
|
|
337
|
+
self.set_model_group(model_group)
|
|
338
|
+
if training_jobs is not None:
|
|
339
|
+
self.set_training_jobs(training_jobs)
|
|
340
|
+
if downstream_jobs is not None:
|
|
341
|
+
self.set_downstream_jobs(downstream_jobs)
|
|
342
|
+
|
|
343
|
+
def _init_metadata_properties(
|
|
344
|
+
self,
|
|
345
|
+
owners: Optional[OwnersInputType] = None,
|
|
346
|
+
links: Optional[LinksInputType] = None,
|
|
347
|
+
tags: Optional[TagsInputType] = None,
|
|
348
|
+
terms: Optional[TermsInputType] = None,
|
|
349
|
+
domain: Optional[DomainInputType] = None,
|
|
350
|
+
structured_properties: Optional[StructuredPropertyInputType] = None,
|
|
351
|
+
) -> None:
|
|
352
|
+
if owners is not None:
|
|
353
|
+
self.set_owners(owners)
|
|
354
|
+
if links is not None:
|
|
355
|
+
self.set_links(links)
|
|
356
|
+
if tags is not None:
|
|
357
|
+
self.set_tags(tags)
|
|
358
|
+
if terms is not None:
|
|
359
|
+
self.set_terms(terms)
|
|
360
|
+
if domain is not None:
|
|
361
|
+
self.set_domain(domain)
|
|
362
|
+
if structured_properties is not None:
|
|
363
|
+
for key, value in structured_properties.items():
|
|
364
|
+
self.set_structured_property(property_urn=key, values=value)
|
datahub/sdk/mlmodelgroup.py
CHANGED
|
@@ -17,10 +17,12 @@ from datahub.sdk._shared import (
|
|
|
17
17
|
HasInstitutionalMemory,
|
|
18
18
|
HasOwnership,
|
|
19
19
|
HasPlatformInstance,
|
|
20
|
+
HasStructuredProperties,
|
|
20
21
|
HasTags,
|
|
21
22
|
HasTerms,
|
|
22
23
|
LinksInputType,
|
|
23
24
|
OwnersInputType,
|
|
25
|
+
StructuredPropertyInputType,
|
|
24
26
|
TagsInputType,
|
|
25
27
|
TermsInputType,
|
|
26
28
|
make_time_stamp,
|
|
@@ -36,6 +38,7 @@ class MLModelGroup(
|
|
|
36
38
|
HasTags,
|
|
37
39
|
HasTerms,
|
|
38
40
|
HasDomain,
|
|
41
|
+
HasStructuredProperties,
|
|
39
42
|
Entity,
|
|
40
43
|
):
|
|
41
44
|
__slots__ = ()
|
|
@@ -66,6 +69,7 @@ class MLModelGroup(
|
|
|
66
69
|
domain: Optional[DomainInputType] = None,
|
|
67
70
|
training_jobs: Optional[Sequence[Union[str, DataProcessInstanceUrn]]] = None,
|
|
68
71
|
downstream_jobs: Optional[Sequence[Union[str, DataProcessInstanceUrn]]] = None,
|
|
72
|
+
structured_properties: Optional[StructuredPropertyInputType] = None,
|
|
69
73
|
extra_aspects: ExtraAspectsType = None,
|
|
70
74
|
):
|
|
71
75
|
urn = MlModelGroupUrn(platform=platform, name=id, env=env)
|
|
@@ -105,6 +109,9 @@ class MLModelGroup(
|
|
|
105
109
|
self.set_training_jobs(training_jobs)
|
|
106
110
|
if downstream_jobs is not None:
|
|
107
111
|
self.set_downstream_jobs(downstream_jobs)
|
|
112
|
+
if structured_properties is not None:
|
|
113
|
+
for key, value in structured_properties.items():
|
|
114
|
+
self.set_structured_property(property_urn=key, values=value)
|
|
108
115
|
|
|
109
116
|
@classmethod
|
|
110
117
|
def _new_from_graph(cls, urn: Urn, current_aspects: AspectBag) -> Self:
|
datahub/sdk/search_client.py
CHANGED
|
@@ -19,6 +19,7 @@ from datahub.sdk.search_filters import (
|
|
|
19
19
|
_OrFilters,
|
|
20
20
|
_StatusFilter,
|
|
21
21
|
)
|
|
22
|
+
from datahub.utilities.ordered_set import OrderedSet
|
|
22
23
|
|
|
23
24
|
if TYPE_CHECKING:
|
|
24
25
|
from datahub.sdk.main_client import DataHubClient
|
|
@@ -80,7 +81,7 @@ def compute_entity_types(
|
|
|
80
81
|
) -> Optional[List[str]]:
|
|
81
82
|
found_filters = False
|
|
82
83
|
found_positive_filters = False
|
|
83
|
-
entity_types:
|
|
84
|
+
entity_types: OrderedSet[str] = OrderedSet()
|
|
84
85
|
for ands in filters:
|
|
85
86
|
for clause in ands["and"]:
|
|
86
87
|
if clause.field == _EntityTypeFilter.ENTITY_TYPE_FIELD:
|
|
@@ -88,7 +89,7 @@ def compute_entity_types(
|
|
|
88
89
|
if not clause.negated:
|
|
89
90
|
found_positive_filters = True
|
|
90
91
|
|
|
91
|
-
entity_types.
|
|
92
|
+
entity_types.update(clause.values)
|
|
92
93
|
|
|
93
94
|
if not found_filters:
|
|
94
95
|
# If we didn't find any filters, use None so we use the default set.
|
|
@@ -100,7 +101,7 @@ def compute_entity_types(
|
|
|
100
101
|
# still want to use the default set.
|
|
101
102
|
return None
|
|
102
103
|
|
|
103
|
-
return entity_types
|
|
104
|
+
return list(entity_types)
|
|
104
105
|
|
|
105
106
|
|
|
106
107
|
class SearchClient:
|
datahub/specific/chart.py
CHANGED
datahub/specific/dataproduct.py
CHANGED
|
@@ -9,6 +9,9 @@ from datahub.metadata.schema_classes import (
|
|
|
9
9
|
)
|
|
10
10
|
from datahub.specific.aspect_helpers.custom_properties import HasCustomPropertiesPatch
|
|
11
11
|
from datahub.specific.aspect_helpers.ownership import HasOwnershipPatch
|
|
12
|
+
from datahub.specific.aspect_helpers.structured_properties import (
|
|
13
|
+
HasStructuredPropertiesPatch,
|
|
14
|
+
)
|
|
12
15
|
from datahub.specific.aspect_helpers.tags import HasTagsPatch
|
|
13
16
|
from datahub.specific.aspect_helpers.terms import HasTermsPatch
|
|
14
17
|
|
|
@@ -16,6 +19,7 @@ from datahub.specific.aspect_helpers.terms import HasTermsPatch
|
|
|
16
19
|
class DataProductPatchBuilder(
|
|
17
20
|
HasOwnershipPatch,
|
|
18
21
|
HasCustomPropertiesPatch,
|
|
22
|
+
HasStructuredPropertiesPatch,
|
|
19
23
|
HasTagsPatch,
|
|
20
24
|
HasTermsPatch,
|
|
21
25
|
MetadataPatchProposal,
|
|
@@ -58,6 +58,7 @@ from datahub.sql_parsing.tool_meta_extractor import (
|
|
|
58
58
|
ToolMetaExtractorReport,
|
|
59
59
|
)
|
|
60
60
|
from datahub.utilities.cooperative_timeout import CooperativeTimeoutError
|
|
61
|
+
from datahub.utilities.dedup_list import deduplicate_list
|
|
61
62
|
from datahub.utilities.file_backed_collections import (
|
|
62
63
|
ConnectionWrapper,
|
|
63
64
|
FileBackedDict,
|
|
@@ -140,6 +141,7 @@ class QueryMetadata:
|
|
|
140
141
|
|
|
141
142
|
used_temp_tables: bool = True
|
|
142
143
|
|
|
144
|
+
extra_info: Optional[dict] = None
|
|
143
145
|
origin: Optional[Urn] = None
|
|
144
146
|
|
|
145
147
|
def make_created_audit_stamp(self) -> models.AuditStampClass:
|
|
@@ -263,7 +265,7 @@ class PreparsedQuery:
|
|
|
263
265
|
query_type_props: QueryTypeProps = dataclasses.field(
|
|
264
266
|
default_factory=lambda: QueryTypeProps()
|
|
265
267
|
)
|
|
266
|
-
# Use this to store
|
|
268
|
+
# Use this to store additional key-value information about the query for debugging.
|
|
267
269
|
extra_info: Optional[dict] = None
|
|
268
270
|
origin: Optional[Urn] = None
|
|
269
271
|
|
|
@@ -948,6 +950,7 @@ class SqlParsingAggregator(Closeable):
|
|
|
948
950
|
column_usage=parsed.column_usage or {},
|
|
949
951
|
confidence_score=parsed.confidence_score,
|
|
950
952
|
used_temp_tables=session_has_temp_tables,
|
|
953
|
+
extra_info=parsed.extra_info,
|
|
951
954
|
origin=parsed.origin,
|
|
952
955
|
)
|
|
953
956
|
)
|
|
@@ -1491,9 +1494,9 @@ class SqlParsingAggregator(Closeable):
|
|
|
1491
1494
|
return
|
|
1492
1495
|
|
|
1493
1496
|
# If a query doesn't involve any allowed tables, skip it.
|
|
1494
|
-
if
|
|
1495
|
-
self.is_allowed_table(
|
|
1496
|
-
):
|
|
1497
|
+
if (
|
|
1498
|
+
downstream_urn is None or not self.is_allowed_table(downstream_urn)
|
|
1499
|
+
) and not any(self.is_allowed_table(urn) for urn in query.upstreams):
|
|
1497
1500
|
self.report.num_queries_skipped_due_to_filters += 1
|
|
1498
1501
|
return
|
|
1499
1502
|
|
|
@@ -1574,27 +1577,33 @@ class SqlParsingAggregator(Closeable):
|
|
|
1574
1577
|
|
|
1575
1578
|
@dataclasses.dataclass
|
|
1576
1579
|
class QueryLineageInfo:
|
|
1577
|
-
upstreams:
|
|
1578
|
-
|
|
1580
|
+
upstreams: OrderedSet[
|
|
1581
|
+
UrnStr
|
|
1582
|
+
] # this is direct upstreams, with *no temp tables*
|
|
1583
|
+
column_lineage: OrderedSet[ColumnLineageInfo]
|
|
1579
1584
|
confidence_score: float
|
|
1580
1585
|
|
|
1581
1586
|
def _merge_lineage_from(self, other_query: "QueryLineageInfo") -> None:
|
|
1582
|
-
self.upstreams
|
|
1583
|
-
self.column_lineage
|
|
1587
|
+
self.upstreams.update(other_query.upstreams)
|
|
1588
|
+
self.column_lineage.update(other_query.column_lineage)
|
|
1584
1589
|
self.confidence_score = min(
|
|
1585
1590
|
self.confidence_score, other_query.confidence_score
|
|
1586
1591
|
)
|
|
1587
1592
|
|
|
1593
|
+
cache: Dict[str, QueryLineageInfo] = {}
|
|
1594
|
+
|
|
1588
1595
|
def _recurse_into_query(
|
|
1589
1596
|
query: QueryMetadata, recursion_path: List[QueryId]
|
|
1590
1597
|
) -> QueryLineageInfo:
|
|
1591
1598
|
if query.query_id in recursion_path:
|
|
1592
1599
|
# This is a cycle, so we just return the query as-is.
|
|
1593
1600
|
return QueryLineageInfo(
|
|
1594
|
-
upstreams=query.upstreams,
|
|
1595
|
-
column_lineage=query.column_lineage,
|
|
1601
|
+
upstreams=OrderedSet(query.upstreams),
|
|
1602
|
+
column_lineage=OrderedSet(query.column_lineage),
|
|
1596
1603
|
confidence_score=query.confidence_score,
|
|
1597
1604
|
)
|
|
1605
|
+
if query.query_id in cache:
|
|
1606
|
+
return cache[query.query_id]
|
|
1598
1607
|
recursion_path = [*recursion_path, query.query_id]
|
|
1599
1608
|
composed_of_queries.add(query.query_id)
|
|
1600
1609
|
|
|
@@ -1609,7 +1618,7 @@ class SqlParsingAggregator(Closeable):
|
|
|
1609
1618
|
upstream_query = self._query_map.get(upstream_query_id)
|
|
1610
1619
|
if (
|
|
1611
1620
|
upstream_query
|
|
1612
|
-
and upstream_query.query_id not in
|
|
1621
|
+
and upstream_query.query_id not in recursion_path
|
|
1613
1622
|
):
|
|
1614
1623
|
temp_query_lineage_info = _recurse_into_query(
|
|
1615
1624
|
upstream_query, recursion_path
|
|
@@ -1669,11 +1678,14 @@ class SqlParsingAggregator(Closeable):
|
|
|
1669
1678
|
]
|
|
1670
1679
|
)
|
|
1671
1680
|
|
|
1672
|
-
|
|
1673
|
-
upstreams=
|
|
1674
|
-
column_lineage=new_cll,
|
|
1681
|
+
ret = QueryLineageInfo(
|
|
1682
|
+
upstreams=new_upstreams,
|
|
1683
|
+
column_lineage=OrderedSet(new_cll),
|
|
1675
1684
|
confidence_score=new_confidence_score,
|
|
1676
1685
|
)
|
|
1686
|
+
cache[query.query_id] = ret
|
|
1687
|
+
|
|
1688
|
+
return ret
|
|
1677
1689
|
|
|
1678
1690
|
resolved_lineage_info = _recurse_into_query(base_query, [])
|
|
1679
1691
|
|
|
@@ -1706,15 +1718,15 @@ class SqlParsingAggregator(Closeable):
|
|
|
1706
1718
|
)
|
|
1707
1719
|
|
|
1708
1720
|
merged_query_text = ";\n\n".join(
|
|
1709
|
-
[q.formatted_query_string for q in ordered_queries]
|
|
1721
|
+
deduplicate_list([q.formatted_query_string for q in ordered_queries])
|
|
1710
1722
|
)
|
|
1711
1723
|
|
|
1712
1724
|
resolved_query = dataclasses.replace(
|
|
1713
1725
|
base_query,
|
|
1714
1726
|
query_id=composite_query_id,
|
|
1715
1727
|
formatted_query_string=merged_query_text,
|
|
1716
|
-
upstreams=resolved_lineage_info.upstreams,
|
|
1717
|
-
column_lineage=resolved_lineage_info.column_lineage,
|
|
1728
|
+
upstreams=list(resolved_lineage_info.upstreams),
|
|
1729
|
+
column_lineage=list(resolved_lineage_info.column_lineage),
|
|
1718
1730
|
confidence_score=resolved_lineage_info.confidence_score,
|
|
1719
1731
|
)
|
|
1720
1732
|
|