acryl-datahub 1.0.0.2rc4__py3-none-any.whl → 1.0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic. Click here for more details.
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/METADATA +2566 -2514
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/RECORD +159 -149
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/WHEEL +1 -1
- datahub/_version.py +1 -1
- datahub/api/circuit_breaker/operation_circuit_breaker.py +2 -2
- datahub/api/entities/datacontract/datacontract.py +35 -3
- datahub/api/entities/datajob/dataflow.py +3 -3
- datahub/api/entities/datajob/datajob.py +7 -4
- datahub/api/entities/dataset/dataset.py +9 -11
- datahub/api/entities/forms/forms.py +34 -34
- datahub/api/graphql/assertion.py +1 -1
- datahub/api/graphql/operation.py +4 -4
- datahub/cli/check_cli.py +3 -2
- datahub/cli/config_utils.py +2 -2
- datahub/cli/delete_cli.py +6 -5
- datahub/cli/docker_cli.py +2 -2
- datahub/cli/exists_cli.py +2 -1
- datahub/cli/get_cli.py +2 -1
- datahub/cli/iceberg_cli.py +6 -5
- datahub/cli/ingest_cli.py +9 -6
- datahub/cli/migrate.py +4 -3
- datahub/cli/migration_utils.py +4 -3
- datahub/cli/put_cli.py +3 -2
- datahub/cli/specific/assertions_cli.py +2 -1
- datahub/cli/specific/datacontract_cli.py +3 -2
- datahub/cli/specific/dataproduct_cli.py +10 -9
- datahub/cli/specific/dataset_cli.py +4 -3
- datahub/cli/specific/forms_cli.py +2 -1
- datahub/cli/specific/group_cli.py +2 -1
- datahub/cli/specific/structuredproperties_cli.py +4 -3
- datahub/cli/specific/user_cli.py +2 -1
- datahub/cli/state_cli.py +2 -1
- datahub/cli/timeline_cli.py +2 -1
- datahub/configuration/common.py +5 -0
- datahub/configuration/source_common.py +1 -1
- datahub/emitter/mcp.py +20 -5
- datahub/emitter/request_helper.py +116 -3
- datahub/emitter/rest_emitter.py +163 -93
- datahub/entrypoints.py +2 -1
- datahub/errors.py +4 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +2 -1
- datahub/ingestion/api/source.py +2 -5
- datahub/ingestion/api/source_helpers.py +1 -0
- datahub/ingestion/glossary/classification_mixin.py +4 -2
- datahub/ingestion/graph/client.py +33 -8
- datahub/ingestion/graph/config.py +14 -0
- datahub/ingestion/graph/filters.py +1 -1
- datahub/ingestion/graph/links.py +53 -0
- datahub/ingestion/run/pipeline.py +9 -6
- datahub/ingestion/run/pipeline_config.py +1 -1
- datahub/ingestion/sink/datahub_rest.py +5 -6
- datahub/ingestion/source/apply/datahub_apply.py +2 -1
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/bigquery_v2/bigquery.py +24 -23
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +4 -62
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +70 -0
- datahub/ingestion/source/bigquery_v2/bigquery_queries.py +3 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +25 -24
- datahub/ingestion/source/common/subtypes.py +3 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +12 -11
- datahub/ingestion/source/dbt/dbt_cloud.py +2 -6
- datahub/ingestion/source/dbt/dbt_common.py +10 -2
- datahub/ingestion/source/dbt/dbt_core.py +82 -42
- datahub/ingestion/source/dynamodb/dynamodb.py +7 -4
- datahub/ingestion/source/feast.py +4 -4
- datahub/ingestion/source/fivetran/config.py +1 -1
- datahub/ingestion/source/fivetran/fivetran_log_api.py +7 -3
- datahub/ingestion/source/fivetran/fivetran_query.py +16 -16
- datahub/ingestion/source/ge_data_profiler.py +27 -1
- datahub/ingestion/source/hex/api.py +1 -20
- datahub/ingestion/source/hex/query_fetcher.py +4 -1
- datahub/ingestion/source/iceberg/iceberg.py +20 -4
- datahub/ingestion/source/iceberg/iceberg_common.py +2 -2
- datahub/ingestion/source/ldap.py +1 -1
- datahub/ingestion/source/looker/looker_common.py +17 -2
- datahub/ingestion/source/looker/looker_lib_wrapper.py +1 -1
- datahub/ingestion/source/looker/looker_source.py +34 -5
- datahub/ingestion/source/looker/lookml_source.py +7 -1
- datahub/ingestion/source/metadata/lineage.py +2 -1
- datahub/ingestion/source/mlflow.py +19 -6
- datahub/ingestion/source/mode.py +74 -28
- datahub/ingestion/source/neo4j/neo4j_source.py +85 -55
- datahub/ingestion/source/powerbi/config.py +13 -1
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/odbc.py +185 -0
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +153 -0
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -2
- datahub/ingestion/source/redshift/usage.py +10 -9
- datahub/ingestion/source/sigma/config.py +74 -6
- datahub/ingestion/source/sigma/sigma.py +16 -1
- datahub/ingestion/source/sigma/sigma_api.py +99 -58
- datahub/ingestion/source/slack/slack.py +4 -52
- datahub/ingestion/source/snowflake/snowflake_config.py +2 -12
- datahub/ingestion/source/snowflake/snowflake_connection.py +24 -18
- datahub/ingestion/source/snowflake/snowflake_profiler.py +1 -6
- datahub/ingestion/source/snowflake/snowflake_queries.py +18 -4
- datahub/ingestion/source/snowflake/snowflake_query.py +9 -63
- datahub/ingestion/source/snowflake/snowflake_tag.py +4 -1
- datahub/ingestion/source/sql/athena.py +2 -1
- datahub/ingestion/source/sql/clickhouse.py +5 -1
- datahub/ingestion/source/sql/druid.py +7 -2
- datahub/ingestion/source/sql/hive.py +7 -2
- datahub/ingestion/source/sql/hive_metastore.py +5 -5
- datahub/ingestion/source/sql/mssql/source.py +1 -1
- datahub/ingestion/source/sql/oracle.py +6 -2
- datahub/ingestion/source/sql/sql_config.py +1 -34
- datahub/ingestion/source/sql/sqlalchemy_uri.py +36 -0
- datahub/ingestion/source/sql/stored_procedures/base.py +12 -1
- datahub/ingestion/source/sql/two_tier_sql_source.py +1 -1
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +2 -1
- datahub/ingestion/source/tableau/tableau.py +31 -6
- datahub/ingestion/source/tableau/tableau_validation.py +1 -1
- datahub/ingestion/source/unity/config.py +2 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +7 -3
- datahub/ingestion/source/usage/starburst_trino_usage.py +5 -3
- datahub/ingestion/source/vertexai/vertexai.py +316 -4
- datahub/ingestion/source/vertexai/vertexai_result_type_utils.py +23 -2
- datahub/integrations/assertion/common.py +3 -2
- datahub/metadata/{_schema_classes.py → _internal_schema_classes.py} +538 -493
- datahub/metadata/_urns/urn_defs.py +1819 -1763
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- datahub/metadata/schema.avsc +17296 -16883
- datahub/metadata/schema_classes.py +3 -3
- datahub/metadata/schemas/DataContractKey.avsc +2 -1
- datahub/metadata/schemas/DataHubOpenAPISchemaKey.avsc +22 -0
- datahub/metadata/schemas/DataTransformLogic.avsc +4 -2
- datahub/metadata/schemas/FormInfo.avsc +5 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +3 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +6 -0
- datahub/metadata/schemas/MetadataChangeLog.avsc +3 -0
- datahub/metadata/schemas/MetadataChangeProposal.avsc +3 -0
- datahub/metadata/schemas/QueryProperties.avsc +4 -2
- datahub/metadata/schemas/SystemMetadata.avsc +86 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/_all_entities.py +4 -0
- datahub/sdk/_shared.py +142 -4
- datahub/sdk/_utils.py +4 -0
- datahub/sdk/dataset.py +2 -2
- datahub/sdk/entity_client.py +8 -0
- datahub/sdk/lineage_client.py +235 -0
- datahub/sdk/main_client.py +6 -3
- datahub/sdk/mlmodel.py +301 -0
- datahub/sdk/mlmodelgroup.py +233 -0
- datahub/secret/datahub_secret_store.py +2 -1
- datahub/specific/dataset.py +12 -0
- datahub/sql_parsing/fingerprint_utils.py +6 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +48 -34
- datahub/sql_parsing/sqlglot_utils.py +18 -14
- datahub/telemetry/telemetry.py +2 -2
- datahub/testing/check_imports.py +1 -1
- datahub/testing/mcp_diff.py +15 -2
- datahub/upgrade/upgrade.py +10 -12
- datahub/utilities/logging_manager.py +8 -1
- datahub/utilities/server_config_util.py +350 -10
- datahub/utilities/sqlalchemy_query_combiner.py +4 -5
- datahub/utilities/urn_encoder.py +1 -1
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.0.0.2rc4.dist-info → acryl_datahub-1.0.0.3.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import difflib
|
|
4
|
+
import logging
|
|
5
|
+
from typing import TYPE_CHECKING, List, Literal, Optional, Set, Union
|
|
6
|
+
|
|
7
|
+
import datahub.metadata.schema_classes as models
|
|
8
|
+
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|
9
|
+
from datahub.errors import SdkUsageError
|
|
10
|
+
from datahub.metadata.schema_classes import SchemaMetadataClass
|
|
11
|
+
from datahub.metadata.urns import DatasetUrn, QueryUrn
|
|
12
|
+
from datahub.sdk._shared import DatasetUrnOrStr
|
|
13
|
+
from datahub.sdk._utils import DEFAULT_ACTOR_URN
|
|
14
|
+
from datahub.sdk.dataset import ColumnLineageMapping, parse_cll_mapping
|
|
15
|
+
from datahub.specific.dataset import DatasetPatchBuilder
|
|
16
|
+
from datahub.sql_parsing.fingerprint_utils import generate_hash
|
|
17
|
+
from datahub.utilities.ordered_set import OrderedSet
|
|
18
|
+
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
from datahub.sdk.main_client import DataHubClient
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
_empty_audit_stamp = models.AuditStampClass(
|
|
25
|
+
time=0,
|
|
26
|
+
actor=DEFAULT_ACTOR_URN,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class LineageClient:
    """Client for recording dataset-to-dataset lineage (with optional
    column-level lineage and query provenance) against a DataHub instance."""

    def __init__(self, client: DataHubClient):
        self._client = client

    def _get_fields_from_dataset_urn(self, dataset_urn: DatasetUrn) -> Set[str]:
        """Return the set of schema field paths for a dataset.

        Returns an empty set when the dataset has no schemaMetadata aspect.
        """
        schema_metadata = self._client._graph.get_aspect(
            str(dataset_urn), SchemaMetadataClass
        )
        if schema_metadata is None:
            # Bug fix: the original returned Set() — typing.Set is not
            # instantiable at runtime (raises TypeError); the builtin set()
            # is what was intended.
            return set()

        return {field.fieldPath for field in schema_metadata.fields}

    @classmethod
    def _get_strict_column_lineage(
        cls,
        upstream_fields: Set[str],
        downstream_fields: Set[str],
    ) -> ColumnLineageMapping:
        """Find matches between upstream and downstream fields with case-insensitive matching."""
        strict_column_lineage: ColumnLineageMapping = {}

        # Create case-insensitive mapping of upstream fields.
        case_insensitive_map = {field.lower(): field for field in upstream_fields}

        # Match downstream fields using case-insensitive comparison.
        for downstream_field in downstream_fields:
            lower_field = downstream_field.lower()
            if lower_field in case_insensitive_map:
                # Use the original case of the upstream field.
                strict_column_lineage[downstream_field] = [
                    case_insensitive_map[lower_field]
                ]

        return strict_column_lineage

    @classmethod
    def _get_fuzzy_column_lineage(
        cls,
        upstream_fields: Set[str],
        downstream_fields: Set[str],
    ) -> ColumnLineageMapping:
        """Generate fuzzy matches between upstream and downstream fields.

        Match order per downstream field: exact name, then case/underscore
        insensitive, then difflib similarity (cutoff 0.8, best match only).
        Unmatched downstream fields are omitted from the result.
        """

        # Simple normalization function for better matching.
        def normalize(s: str) -> str:
            return s.lower().replace("_", "")

        # Create normalized lookup for upstream fields.
        normalized_upstream = {normalize(field): field for field in upstream_fields}

        fuzzy_column_lineage = {}
        for downstream_field in downstream_fields:
            # Try exact match first.
            if downstream_field in upstream_fields:
                fuzzy_column_lineage[downstream_field] = [downstream_field]
                continue

            # Try normalized match.
            norm_downstream = normalize(downstream_field)
            if norm_downstream in normalized_upstream:
                fuzzy_column_lineage[downstream_field] = [
                    normalized_upstream[norm_downstream]
                ]
                continue

            # If no direct match, find closest match using similarity.
            matches = difflib.get_close_matches(
                norm_downstream,
                normalized_upstream.keys(),
                n=1,  # Return only the best match
                cutoff=0.8,  # Adjust cutoff for sensitivity
            )

            if matches:
                fuzzy_column_lineage[downstream_field] = [
                    normalized_upstream[matches[0]]
                ]

        return fuzzy_column_lineage

    def add_dataset_copy_lineage(
        self,
        *,
        upstream: DatasetUrnOrStr,
        downstream: DatasetUrnOrStr,
        column_lineage: Union[
            None, ColumnLineageMapping, Literal["auto_fuzzy", "auto_strict"]
        ] = "auto_fuzzy",
    ) -> None:
        """Record COPY-type lineage from `upstream` to `downstream`.

        column_lineage may be None (no column lineage), an explicit mapping,
        or "auto_fuzzy"/"auto_strict" to infer the mapping by comparing the
        two datasets' schemas.

        Raises:
            SdkUsageError: if column_lineage is an unrecognized value.
        """
        upstream = DatasetUrn.from_string(upstream)
        downstream = DatasetUrn.from_string(downstream)

        if column_lineage is None:
            cll = None
        elif column_lineage in ["auto_fuzzy", "auto_strict"]:
            upstream_schema = self._get_fields_from_dataset_urn(upstream)
            downstream_schema = self._get_fields_from_dataset_urn(downstream)
            if column_lineage == "auto_fuzzy":
                mapping = self._get_fuzzy_column_lineage(
                    upstream_schema, downstream_schema
                )
            else:
                mapping = self._get_strict_column_lineage(
                    upstream_schema, downstream_schema
                )
            cll = parse_cll_mapping(
                upstream=upstream,
                downstream=downstream,
                cll_mapping=mapping,
            )
        elif isinstance(column_lineage, dict):
            cll = parse_cll_mapping(
                upstream=upstream,
                downstream=downstream,
                cll_mapping=column_lineage,
            )
        else:
            # Robustness fix: previously an unrecognized value left `cll`
            # unbound and surfaced as an UnboundLocalError below.
            raise SdkUsageError(
                f"Invalid column_lineage value: {column_lineage!r}. Expected "
                'None, a column lineage mapping, "auto_fuzzy", or "auto_strict".'
            )

        updater = DatasetPatchBuilder(str(downstream))
        updater.add_upstream_lineage(
            models.UpstreamClass(
                dataset=str(upstream),
                type=models.DatasetLineageTypeClass.COPY,
            )
        )
        for cl in cll or []:
            updater.add_fine_grained_upstream_lineage(cl)

        self._client.entities.update(updater)

    def add_dataset_transform_lineage(
        self,
        *,
        upstream: DatasetUrnOrStr,
        downstream: DatasetUrnOrStr,
        column_lineage: Optional[ColumnLineageMapping] = None,
        query_text: Optional[str] = None,
    ) -> None:
        """Record TRANSFORMED-type lineage from `upstream` to `downstream`.

        When query_text is given, a Query entity is also created (keyed by a
        fingerprint of the text) and linked from the lineage edges.

        Raises:
            SdkUsageError: if the downstream dataset does not exist.
        """
        upstream = DatasetUrn.from_string(upstream)
        downstream = DatasetUrn.from_string(downstream)

        cll = None
        if column_lineage is not None:
            cll = parse_cll_mapping(
                upstream=upstream,
                downstream=downstream,
                cll_mapping=column_lineage,
            )

        fields_involved = OrderedSet([str(upstream), str(downstream)])
        if cll is not None:
            for c in cll:
                for field in c.upstreams or []:
                    fields_involved.add(field)
                for field in c.downstreams or []:
                    fields_involved.add(field)

        query_urn = None
        query_entity = None
        if query_text:
            # Eventually we might want to use our regex-based fingerprinting instead.
            fingerprint = generate_hash(query_text)
            query_urn = QueryUrn(fingerprint).urn()

            from datahub.sql_parsing.sql_parsing_aggregator import make_query_subjects

            query_entity = MetadataChangeProposalWrapper.construct_many(
                query_urn,
                aspects=[
                    models.QueryPropertiesClass(
                        statement=models.QueryStatementClass(
                            value=query_text, language=models.QueryLanguageClass.SQL
                        ),
                        source=models.QuerySourceClass.SYSTEM,
                        created=_empty_audit_stamp,
                        lastModified=_empty_audit_stamp,
                    ),
                    make_query_subjects(list(fields_involved)),
                ],
            )

        updater = DatasetPatchBuilder(str(downstream))
        updater.add_upstream_lineage(
            models.UpstreamClass(
                dataset=str(upstream),
                type=models.DatasetLineageTypeClass.TRANSFORMED,
                query=query_urn,
            )
        )
        for cl in cll or []:
            cl.query = query_urn
            updater.add_fine_grained_upstream_lineage(cl)

        # Throw if the dataset does not exist.
        # We need to manually call .build() instead of reusing client.update()
        # so that we make just one emit_mcps call.
        if not self._client._graph.exists(updater.urn):
            raise SdkUsageError(
                f"Dataset {updater.urn} does not exist, and hence cannot be updated."
            )
        mcps: List[
            Union[MetadataChangeProposalWrapper, models.MetadataChangeProposalClass]
        ] = list(updater.build())
        if query_entity:
            mcps.extend(query_entity)
        self._client._graph.emit_mcps(mcps)
|
datahub/sdk/main_client.py
CHANGED
|
@@ -4,8 +4,9 @@ from typing import Optional, overload
|
|
|
4
4
|
|
|
5
5
|
from datahub.errors import SdkUsageError
|
|
6
6
|
from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
|
|
7
|
-
from datahub.ingestion.graph.config import DatahubClientConfig
|
|
7
|
+
from datahub.ingestion.graph.config import ClientMode, DatahubClientConfig
|
|
8
8
|
from datahub.sdk.entity_client import EntityClient
|
|
9
|
+
from datahub.sdk.lineage_client import LineageClient
|
|
9
10
|
from datahub.sdk.resolver_client import ResolverClient
|
|
10
11
|
from datahub.sdk.search_client import SearchClient
|
|
11
12
|
|
|
@@ -83,7 +84,7 @@ class DataHubClient:
|
|
|
83
84
|
# Inspired by the DockerClient.from_env() method.
|
|
84
85
|
# TODO: This one also reads from ~/.datahubenv, so the "from_env" name might be a bit confusing.
|
|
85
86
|
# That file is part of the "environment", but is not a traditional "env variable".
|
|
86
|
-
graph = get_default_graph()
|
|
87
|
+
graph = get_default_graph(ClientMode.SDK)
|
|
87
88
|
|
|
88
89
|
return cls(graph=graph)
|
|
89
90
|
|
|
@@ -99,4 +100,6 @@ class DataHubClient:
|
|
|
99
100
|
def search(self) -> SearchClient:
|
|
100
101
|
return SearchClient(self)
|
|
101
102
|
|
|
102
|
-
|
|
103
|
+
@property
|
|
104
|
+
def lineage(self) -> LineageClient:
|
|
105
|
+
return LineageClient(self)
|
datahub/sdk/mlmodel.py
ADDED
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from typing import Dict, List, Optional, Sequence, Type, Union
|
|
5
|
+
|
|
6
|
+
from typing_extensions import Self
|
|
7
|
+
|
|
8
|
+
from datahub.emitter.mce_builder import DEFAULT_ENV
|
|
9
|
+
from datahub.metadata.schema_classes import (
|
|
10
|
+
AspectBag,
|
|
11
|
+
MLHyperParamClass,
|
|
12
|
+
MLMetricClass,
|
|
13
|
+
MLModelPropertiesClass,
|
|
14
|
+
)
|
|
15
|
+
from datahub.metadata.urns import (
|
|
16
|
+
DataProcessInstanceUrn,
|
|
17
|
+
MlModelGroupUrn,
|
|
18
|
+
MlModelUrn,
|
|
19
|
+
Urn,
|
|
20
|
+
)
|
|
21
|
+
from datahub.sdk._shared import (
|
|
22
|
+
DomainInputType,
|
|
23
|
+
HasDomain,
|
|
24
|
+
HasInstitutionalMemory,
|
|
25
|
+
HasOwnership,
|
|
26
|
+
HasPlatformInstance,
|
|
27
|
+
HasTags,
|
|
28
|
+
HasTerms,
|
|
29
|
+
HasVersion,
|
|
30
|
+
HyperParamsInputType,
|
|
31
|
+
LinksInputType,
|
|
32
|
+
MLTrainingJobInputType,
|
|
33
|
+
OwnersInputType,
|
|
34
|
+
TagsInputType,
|
|
35
|
+
TermsInputType,
|
|
36
|
+
TrainingMetricsInputType,
|
|
37
|
+
convert_hyper_params,
|
|
38
|
+
convert_training_metrics,
|
|
39
|
+
make_time_stamp,
|
|
40
|
+
parse_time_stamp,
|
|
41
|
+
)
|
|
42
|
+
from datahub.sdk.entity import Entity, ExtraAspectsType
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class MLModel(
    HasPlatformInstance,
    HasOwnership,
    HasInstitutionalMemory,
    HasTags,
    HasTerms,
    HasDomain,
    HasVersion,
    Entity,
):
    """SDK entity wrapper around an ML model.

    State lives in the MLModelProperties aspect (plus the mixin-managed
    aspects); setters mutate that aspect in place.
    """

    __slots__ = ()

    @classmethod
    def get_urn_type(cls) -> Type[MlModelUrn]:
        """URN class backing this entity type."""
        return MlModelUrn

    def __init__(
        self,
        id: str,
        platform: str,
        version: Optional[str] = None,
        aliases: Optional[List[str]] = None,
        platform_instance: Optional[str] = None,
        env: str = DEFAULT_ENV,
        name: Optional[str] = None,
        description: Optional[str] = None,
        training_metrics: Optional[TrainingMetricsInputType] = None,
        hyper_params: Optional[HyperParamsInputType] = None,
        external_url: Optional[str] = None,
        custom_properties: Optional[Dict[str, str]] = None,
        created: Optional[datetime] = None,
        last_modified: Optional[datetime] = None,
        owners: Optional[OwnersInputType] = None,
        links: Optional[LinksInputType] = None,
        tags: Optional[TagsInputType] = None,
        terms: Optional[TermsInputType] = None,
        domain: Optional[DomainInputType] = None,
        model_group: Optional[Union[str, MlModelGroupUrn]] = None,
        training_jobs: Optional[MLTrainingJobInputType] = None,
        downstream_jobs: Optional[MLTrainingJobInputType] = None,
        extra_aspects: ExtraAspectsType = None,
    ):
        """Build the entity URN from (platform, id, env) and apply every
        argument that was explicitly provided (non-None)."""
        urn = MlModelUrn(platform=platform, name=id, env=env)
        super().__init__(urn)
        self._set_extra_aspects(extra_aspects)

        self._set_platform_instance(urn.platform, platform_instance)

        # Materialize the properties aspect up front so all setters share it.
        self._ensure_model_props()

        # Apply optional inputs in a fixed order; None means "leave unset".
        for value, setter in (
            (version, self.set_version),
            (name, self.set_name),
            (aliases, self.set_version_aliases),
            (description, self.set_description),
            (training_metrics, self.set_training_metrics),
            (hyper_params, self.set_hyper_params),
            (external_url, self.set_external_url),
            (custom_properties, self.set_custom_properties),
            (created, self.set_created),
            (last_modified, self.set_last_modified),
            (owners, self.set_owners),
            (links, self.set_links),
            (tags, self.set_tags),
            (terms, self.set_terms),
            (domain, self.set_domain),
            (model_group, self.set_model_group),
            (training_jobs, self.set_training_jobs),
            (downstream_jobs, self.set_downstream_jobs),
        ):
            if value is not None:
                setter(value)

    @classmethod
    def _new_from_graph(cls, urn: Urn, current_aspects: AspectBag) -> Self:
        """Alternate constructor: hydrate an instance from server-side aspects."""
        assert isinstance(urn, MlModelUrn)
        entity = cls(id=urn.name, platform=urn.platform, env=urn.env)
        return entity._init_from_graph(current_aspects)

    @property
    def urn(self) -> MlModelUrn:
        return self._urn  # type: ignore

    def _ensure_model_props(self) -> MLModelPropertiesClass:
        """Fetch-or-create the MLModelProperties aspect."""
        return self._setdefault_aspect(MLModelPropertiesClass())

    @property
    def name(self) -> Optional[str]:
        """Display name, if set."""
        return self._ensure_model_props().name

    def set_name(self, name: str) -> None:
        self._ensure_model_props().name = name

    @property
    def description(self) -> Optional[str]:
        """Free-form description, if set."""
        return self._ensure_model_props().description

    def set_description(self, description: str) -> None:
        self._ensure_model_props().description = description

    @property
    def external_url(self) -> Optional[str]:
        """Link to the model in an external system, if set."""
        return self._ensure_model_props().externalUrl

    def set_external_url(self, external_url: str) -> None:
        self._ensure_model_props().externalUrl = external_url

    @property
    def custom_properties(self) -> Optional[Dict[str, str]]:
        """Arbitrary key/value properties attached to the model."""
        return self._ensure_model_props().customProperties

    def set_custom_properties(self, custom_properties: Dict[str, str]) -> None:
        self._ensure_model_props().customProperties = custom_properties

    @property
    def created(self) -> Optional[datetime]:
        """Creation timestamp, if set."""
        return parse_time_stamp(self._ensure_model_props().created)

    def set_created(self, created: datetime) -> None:
        self._ensure_model_props().created = make_time_stamp(created)

    @property
    def last_modified(self) -> Optional[datetime]:
        """Last-modified timestamp, if set."""
        return parse_time_stamp(self._ensure_model_props().lastModified)

    def set_last_modified(self, last_modified: datetime) -> None:
        self._ensure_model_props().lastModified = make_time_stamp(last_modified)

    @property
    def training_metrics(self) -> Optional[List[MLMetricClass]]:
        """Recorded training metrics, if any."""
        return self._ensure_model_props().trainingMetrics

    def set_training_metrics(self, metrics: TrainingMetricsInputType) -> None:
        """Replace all training metrics."""
        self._ensure_model_props().trainingMetrics = convert_training_metrics(metrics)

    def add_training_metrics(self, metrics: TrainingMetricsInputType) -> None:
        """Append metrics: a list of MLMetricClass, or a name -> value mapping."""
        mp = self._ensure_model_props()
        existing = mp.trainingMetrics if mp.trainingMetrics is not None else []
        if isinstance(metrics, list):
            existing.extend(
                MLMetricClass(name=metric.name, value=metric.value)
                for metric in metrics
            )
        else:
            # Mapping form: key is the metric name, value is the metric value.
            existing.extend(
                MLMetricClass(name=metric_name, value=metric_value)
                for metric_name, metric_value in metrics.items()
            )
        mp.trainingMetrics = existing

    @property
    def hyper_params(self) -> Optional[List[MLHyperParamClass]]:
        """Recorded hyper-parameters, if any."""
        return self._ensure_model_props().hyperParams

    def set_hyper_params(self, params: HyperParamsInputType) -> None:
        """Replace all hyper-parameters."""
        self._ensure_model_props().hyperParams = convert_hyper_params(params)

    def add_hyper_params(self, params: HyperParamsInputType) -> None:
        """Append hyper-params: a list of MLHyperParamClass, or a name -> value mapping."""
        mp = self._ensure_model_props()
        existing = mp.hyperParams if mp.hyperParams is not None else []
        if isinstance(params, list):
            existing.extend(
                MLHyperParamClass(name=param.name, value=param.value)
                for param in params
            )
        else:
            # Mapping form: key is the parameter name, value is its value.
            existing.extend(
                MLHyperParamClass(name=param_name, value=param_value)
                for param_name, param_value in params.items()
            )
        mp.hyperParams = existing

    @property
    def model_group(self) -> Optional[str]:
        """URN of the model's (single) group, or None when unset/empty."""
        groups = self._ensure_model_props().groups
        return groups[0] if groups else None

    def set_model_group(self, group: Union[str, MlModelGroupUrn]) -> None:
        self._ensure_model_props().groups = [str(group)]

    @property
    def training_jobs(self) -> Optional[List[str]]:
        """URNs of jobs that produced this model, if any."""
        return self._ensure_model_props().trainingJobs

    def set_training_jobs(self, training_jobs: MLTrainingJobInputType) -> None:
        self._ensure_model_props().trainingJobs = [
            str(job) for job in training_jobs
        ]

    def add_training_job(
        self, training_job: Union[str, DataProcessInstanceUrn]
    ) -> None:
        mp = self._ensure_model_props()
        jobs = mp.trainingJobs if mp.trainingJobs is not None else []
        jobs.append(str(training_job))
        mp.trainingJobs = jobs

    def remove_training_job(
        self, training_job: Union[str, DataProcessInstanceUrn]
    ) -> None:
        """Remove all occurrences of the given job URN; no-op when unset."""
        mp = self._ensure_model_props()
        if mp.trainingJobs is None:
            return
        target = str(training_job)
        mp.trainingJobs = [job for job in mp.trainingJobs if job != target]

    @property
    def downstream_jobs(self) -> Optional[List[str]]:
        """URNs of jobs that consume this model, if any."""
        return self._ensure_model_props().downstreamJobs

    def set_downstream_jobs(
        self, downstream_jobs: Sequence[Union[str, DataProcessInstanceUrn]]
    ) -> None:
        self._ensure_model_props().downstreamJobs = [
            str(job) for job in downstream_jobs
        ]

    def add_downstream_job(
        self, downstream_job: Union[str, DataProcessInstanceUrn]
    ) -> None:
        mp = self._ensure_model_props()
        jobs = mp.downstreamJobs if mp.downstreamJobs is not None else []
        jobs.append(str(downstream_job))
        mp.downstreamJobs = jobs

    def remove_downstream_job(
        self, downstream_job: Union[str, DataProcessInstanceUrn]
    ) -> None:
        """Remove all occurrences of the given job URN; no-op when unset."""
        mp = self._ensure_model_props()
        if mp.downstreamJobs is None:
            return
        target = str(downstream_job)
        mp.downstreamJobs = [job for job in mp.downstreamJobs if job != target]