acryl-datahub 0.14.1.13rc9__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/METADATA +2348 -2298
- {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/RECORD +130 -125
- {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt +2 -1
- datahub/__init__.py +1 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +123 -146
- datahub/cli/cli_utils.py +2 -0
- datahub/cli/delete_cli.py +103 -24
- datahub/cli/ingest_cli.py +110 -0
- datahub/cli/put_cli.py +1 -1
- datahub/cli/specific/dataproduct_cli.py +1 -1
- datahub/cli/specific/structuredproperties_cli.py +2 -1
- datahub/configuration/common.py +3 -3
- datahub/configuration/git.py +7 -1
- datahub/configuration/kafka_consumer_config.py +31 -1
- datahub/emitter/mcp_patch_builder.py +43 -0
- datahub/emitter/rest_emitter.py +17 -4
- datahub/ingestion/api/incremental_properties_helper.py +69 -0
- datahub/ingestion/api/source.py +6 -1
- datahub/ingestion/api/source_helpers.py +4 -2
- datahub/ingestion/graph/client.py +2 -0
- datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +2 -2
- datahub/ingestion/run/pipeline.py +6 -5
- datahub/ingestion/run/pipeline_config.py +6 -0
- datahub/ingestion/sink/datahub_rest.py +15 -4
- datahub/ingestion/source/abs/source.py +4 -0
- datahub/ingestion/source/aws/aws_common.py +13 -1
- datahub/ingestion/source/aws/sagemaker.py +8 -0
- datahub/ingestion/source/aws/sagemaker_processors/common.py +6 -0
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +9 -4
- datahub/ingestion/source/aws/sagemaker_processors/jobs.py +12 -1
- datahub/ingestion/source/aws/sagemaker_processors/lineage.py +11 -4
- datahub/ingestion/source/aws/sagemaker_processors/models.py +30 -1
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
- datahub/ingestion/source/common/subtypes.py +2 -0
- datahub/ingestion/source/csv_enricher.py +1 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +41 -21
- datahub/ingestion/source/datahub/datahub_source.py +8 -1
- datahub/ingestion/source/dbt/dbt_common.py +7 -61
- datahub/ingestion/source/dremio/dremio_api.py +204 -86
- datahub/ingestion/source/dremio/dremio_aspects.py +19 -15
- datahub/ingestion/source/dremio/dremio_config.py +5 -0
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +2 -0
- datahub/ingestion/source/dremio/dremio_entities.py +4 -0
- datahub/ingestion/source/dremio/dremio_reporting.py +15 -0
- datahub/ingestion/source/dremio/dremio_source.py +7 -2
- datahub/ingestion/source/elastic_search.py +1 -1
- datahub/ingestion/source/feast.py +97 -6
- datahub/ingestion/source/gc/datahub_gc.py +46 -35
- datahub/ingestion/source/gc/dataprocess_cleanup.py +110 -50
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +7 -2
- datahub/ingestion/source/ge_data_profiler.py +23 -1
- datahub/ingestion/source/iceberg/iceberg.py +12 -5
- datahub/ingestion/source/kafka/kafka.py +39 -19
- datahub/ingestion/source/kafka/kafka_connect.py +81 -51
- datahub/ingestion/source/looker/looker_liquid_tag.py +8 -1
- datahub/ingestion/source/looker/lookml_concept_context.py +1 -2
- datahub/ingestion/source/looker/view_upstream.py +65 -30
- datahub/ingestion/source/metadata/business_glossary.py +35 -18
- datahub/ingestion/source/mode.py +0 -23
- datahub/ingestion/source/neo4j/__init__.py +0 -0
- datahub/ingestion/source/neo4j/neo4j_source.py +331 -0
- datahub/ingestion/source/powerbi/__init__.py +0 -1
- datahub/ingestion/source/powerbi/config.py +3 -3
- datahub/ingestion/source/powerbi/m_query/data_classes.py +36 -15
- datahub/ingestion/source/powerbi/m_query/parser.py +6 -3
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +912 -0
- datahub/ingestion/source/powerbi/m_query/resolver.py +23 -947
- datahub/ingestion/source/powerbi/m_query/tree_function.py +3 -3
- datahub/ingestion/source/powerbi/m_query/validator.py +9 -3
- datahub/ingestion/source/powerbi/powerbi.py +12 -6
- datahub/ingestion/source/preset.py +1 -0
- datahub/ingestion/source/pulsar.py +21 -2
- datahub/ingestion/source/qlik_sense/data_classes.py +1 -0
- datahub/ingestion/source/redash.py +13 -63
- datahub/ingestion/source/redshift/config.py +1 -0
- datahub/ingestion/source/redshift/redshift.py +3 -0
- datahub/ingestion/source/s3/source.py +2 -3
- datahub/ingestion/source/snowflake/snowflake_config.py +8 -3
- datahub/ingestion/source/snowflake/snowflake_connection.py +28 -0
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +6 -1
- datahub/ingestion/source/snowflake/snowflake_query.py +21 -4
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +28 -0
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +41 -2
- datahub/ingestion/source/snowflake/snowflake_utils.py +46 -6
- datahub/ingestion/source/snowflake/snowflake_v2.py +6 -0
- datahub/ingestion/source/sql/athena.py +46 -22
- datahub/ingestion/source/sql/mssql/source.py +0 -2
- datahub/ingestion/source/sql/sql_common.py +34 -21
- datahub/ingestion/source/sql/sql_report.py +1 -0
- datahub/ingestion/source/sql/sql_types.py +85 -8
- datahub/ingestion/source/state/redundant_run_skip_handler.py +1 -1
- datahub/ingestion/source/superset.py +215 -65
- datahub/ingestion/source/tableau/tableau.py +237 -76
- datahub/ingestion/source/tableau/tableau_common.py +12 -6
- datahub/ingestion/source/tableau/tableau_constant.py +2 -0
- datahub/ingestion/source/tableau/tableau_server_wrapper.py +33 -0
- datahub/ingestion/source/tableau/tableau_validation.py +48 -0
- datahub/ingestion/source/unity/proxy_types.py +1 -0
- datahub/ingestion/source/unity/source.py +4 -0
- datahub/ingestion/source/unity/usage.py +20 -11
- datahub/ingestion/transformer/add_dataset_tags.py +1 -1
- datahub/ingestion/transformer/generic_aspect_transformer.py +1 -1
- datahub/integrations/assertion/common.py +1 -1
- datahub/lite/duckdb_lite.py +12 -17
- datahub/metadata/_schema_classes.py +512 -392
- datahub/metadata/_urns/urn_defs.py +1355 -1355
- datahub/metadata/com/linkedin/pegasus2avro/structured/__init__.py +2 -0
- datahub/metadata/schema.avsc +17222 -17499
- datahub/metadata/schemas/FormInfo.avsc +4 -0
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +1 -1
- datahub/metadata/schemas/StructuredPropertyKey.avsc +1 -0
- datahub/metadata/schemas/StructuredPropertySettings.avsc +114 -0
- datahub/specific/chart.py +0 -39
- datahub/specific/dashboard.py +0 -39
- datahub/specific/datajob.py +7 -57
- datahub/sql_parsing/schema_resolver.py +23 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +1 -2
- datahub/sql_parsing/sqlglot_lineage.py +55 -14
- datahub/sql_parsing/sqlglot_utils.py +8 -2
- datahub/telemetry/telemetry.py +23 -9
- datahub/testing/compare_metadata_json.py +1 -1
- datahub/testing/doctest.py +12 -0
- datahub/utilities/file_backed_collections.py +35 -2
- datahub/utilities/partition_executor.py +1 -1
- datahub/utilities/urn_encoder.py +2 -1
- datahub/utilities/urns/_urn_base.py +1 -1
- datahub/utilities/urns/structured_properties_urn.py +1 -1
- datahub/utilities/sql_lineage_parser_impl.py +0 -160
- datahub/utilities/sql_parser.py +0 -94
- datahub/utilities/sql_parser_base.py +0 -21
- {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/top_level.txt +0 -0
{acryl_datahub-0.14.1.13rc9.dist-info → acryl_datahub-0.15.0.dist-info}/entry_points.txt
CHANGED

@@ -68,12 +68,13 @@ mode = datahub.ingestion.source.mode:ModeSource
 mongodb = datahub.ingestion.source.mongodb:MongoDBSource
 mssql = datahub.ingestion.source.sql.mssql:SQLServerSource
 mysql = datahub.ingestion.source.sql.mysql:MySQLSource
+neo4j = datahub.ingestion.source.neo4j.neo4j_source:Neo4jSource
 nifi = datahub.ingestion.source.nifi:NifiSource
 okta = datahub.ingestion.source.identity.okta:OktaSource
 openapi = datahub.ingestion.source.openapi:OpenApiSource
 oracle = datahub.ingestion.source.sql.oracle:OracleSource
 postgres = datahub.ingestion.source.sql.postgres:PostgresSource
-powerbi = datahub.ingestion.source.powerbi:PowerBiDashboardSource
+powerbi = datahub.ingestion.source.powerbi.powerbi:PowerBiDashboardSource
 powerbi-report-server = datahub.ingestion.source.powerbi_report_server:PowerBiReportServerDashboardSource
 preset = datahub.ingestion.source.preset:PresetSource
 presto = datahub.ingestion.source.sql.presto:PrestoSource
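
For context, DataHub discovers ingestion sources through these setuptools entry points, so the new neo4j registration and the relocated powerbi class path above are what the CLI resolves at run time. A minimal sketch of that resolution, assuming the group is named "datahub.ingestion.source.plugins" as in the project's setup script and that acryl-datahub 0.15.0 is installed:

    # Sketch only: resolving the new neo4j entry point by hand (Python 3.10+).
    from importlib.metadata import entry_points

    source_plugins = entry_points().select(group="datahub.ingestion.source.plugins")
    neo4j_ep = next(ep for ep in source_plugins if ep.name == "neo4j")
    Neo4jSource = neo4j_ep.load()  # imports datahub.ingestion.source.neo4j.neo4j_source
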
datahub/api/entities/structuredproperties/structuredproperties.py
CHANGED

@@ -1,8 +1,7 @@
 import logging
-from contextlib import contextmanager
 from enum import Enum
 from pathlib import Path
-from typing import Generator, List, Optional
+from typing import List, Optional
 
 import yaml
 from pydantic import validator
@@ -10,39 +9,18 @@ from ruamel.yaml import YAML
 
 from datahub.configuration.common import ConfigModel
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.ingestion.graph.client import DataHubGraph, get_default_graph
+from datahub.ingestion.graph.client import DataHubGraph
 from datahub.metadata.schema_classes import (
     PropertyValueClass,
     StructuredPropertyDefinitionClass,
 )
-from datahub.
+from datahub.metadata.urns import StructuredPropertyUrn, Urn
+from datahub.utilities.urns._urn_base import URN_TYPES
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 
-class StructuredPropertiesConfig:
-    """Configuration class to hold the graph client"""
-
-    _graph: Optional[DataHubGraph] = None
-
-    @classmethod
-    @contextmanager
-    def use_graph(cls, graph: DataHubGraph) -> Generator[None, None, None]:
-        """Context manager to temporarily set a custom graph"""
-        previous_graph = cls._graph
-        cls._graph = graph
-        try:
-            yield
-        finally:
-            cls._graph = previous_graph
-
-    @classmethod
-    def get_graph(cls) -> DataHubGraph:
-        """Get the current graph, falling back to default if none set"""
-        return cls._graph if cls._graph is not None else get_default_graph()
-
-
 class AllowedTypes(Enum):
     STRING = "string"
     RICH_TEXT = "rich_text"
@@ -64,29 +42,28 @@ class AllowedValue(ConfigModel):
     description: Optional[str] = None
 
 
+VALID_ENTITY_TYPE_URNS = [
+    Urn.make_entity_type_urn(entity_type) for entity_type in URN_TYPES.keys()
+]
+_VALID_ENTITY_TYPES_STRING = f"Valid entity type urns are {', '.join(VALID_ENTITY_TYPE_URNS)}, etc... Ensure that the entity type is valid."
+
+
+def _validate_entity_type_urn(v: str) -> str:
+    urn = Urn.make_entity_type_urn(v)
+    if urn not in VALID_ENTITY_TYPE_URNS:
+        raise ValueError(
+            f"Input {v} is not a valid entity type urn. {_VALID_ENTITY_TYPES_STRING}"
+        )
+    v = str(urn)
+    return v
 
 
 class TypeQualifierAllowedTypes(ConfigModel):
     allowed_types: List[str]
 
-        graph = StructuredPropertiesConfig.get_graph()
-        validated_urn = Urn.make_entity_type_urn(v)
-        if not graph.exists(validated_urn):
-            raise ValueError(
-                f"Input {v} is not a valid entity type urn. {VALID_ENTITY_TYPES_STRING}"
-            )
-        v = str(validated_urn)
-        return v
+    _check_allowed_types = validator("allowed_types", each_item=True, allow_reuse=True)(
+        _validate_entity_type_urn
+    )
@@ -103,26 +80,36 @@ class StructuredProperties(ConfigModel):
     type_qualifier: Optional[TypeQualifierAllowedTypes] = None
     immutable: Optional[bool] = False
 
+    _check_entity_types = validator("entity_types", each_item=True, allow_reuse=True)(
+        _validate_entity_type_urn
+    )
+
+    @validator("type")
+    def validate_type(cls, v: str) -> str:
+        # Convert to lowercase if needed
+        if not v.islower():
+            logger.warning(
+                f"Structured property type should be lowercase. Updated to {v.lower()}"
+            )
+            v = v.lower()
+
+        # Check if type is allowed
+        if not AllowedTypes.check_allowed_type(v):
+            raise ValueError(
+                f"Type {v} is not allowed. Allowed types are {AllowedTypes.values()}"
+            )
         return v
 
     @property
     def fqn(self) -> str:
         assert self.urn is not None
+        id = StructuredPropertyUrn.from_string(self.urn).id
+        if self.qualified_name is not None:
+            # ensure that qualified name and ID match
+            assert (
+                self.qualified_name == id
+            ), "ID in the urn and the qualified_name must match"
+        return id
 
     @validator("urn", pre=True, always=True)
     def urn_must_be_present(cls, v, values):
@@ -133,100 +120,90 @@ class StructuredProperties(ConfigModel):
         return v
 
     @staticmethod
-    def
-            displayName=structuredproperty.display_name,
-            description=structuredproperty.description,
-            entityTypes=[
-                Urn.make_entity_type_urn(entity_type)
-                for entity_type in structuredproperty.entity_types or []
-            ],
-            cardinality=structuredproperty.cardinality,
-            immutable=structuredproperty.immutable,
-            allowedValues=(
-                [
-                    PropertyValueClass(
-                        value=v.value, description=v.description
-                    )
-                    for v in structuredproperty.allowed_values
-                ]
-                if structuredproperty.allowed_values
-                else None
-            ),
-            typeQualifier=(
-                {
-                    "allowedTypes": structuredproperty.type_qualifier.allowed_types
-                }
-                if structuredproperty.type_qualifier
-                else None
-            ),
-        ),
-    )
-    emitter.emit_mcp(mcp)
-
-    logger.info(f"Created structured property {structuredproperty.urn}")
-
-    @classmethod
-    def from_datahub(cls, graph: DataHubGraph, urn: str) -> "StructuredProperties":
-        with StructuredPropertiesConfig.use_graph(graph):
-            structured_property: Optional[
-                StructuredPropertyDefinitionClass
-            ] = graph.get_aspect(urn, StructuredPropertyDefinitionClass)
-            if structured_property is None:
-                raise Exception(
-                    "StructuredPropertyDefinition aspect is None. Unable to create structured property."
-                )
-            return StructuredProperties(
-                urn=urn,
-                qualified_name=structured_property.qualifiedName,
-                display_name=structured_property.displayName,
-                type=structured_property.valueType,
-                description=structured_property.description,
-                entity_types=structured_property.entityTypes,
-                cardinality=structured_property.cardinality,
-                allowed_values=(
-                    [
-                        description=av.description,
-                    )
-                    for av in structured_property.allowedValues or []
-                    ]
-                    if structured_property.typeQualifier
-                    else None
-                ),
-            )
+    def from_yaml(file: str) -> List["StructuredProperties"]:
+        with open(file) as fp:
+            structuredproperties: List[dict] = yaml.safe_load(fp)
+
+            result: List[StructuredProperties] = []
+            for structuredproperty_raw in structuredproperties:
+                result.append(StructuredProperties.parse_obj(structuredproperty_raw))
+        return result
+
+    def generate_mcps(self) -> List[MetadataChangeProposalWrapper]:
+        mcp = MetadataChangeProposalWrapper(
+            entityUrn=self.urn,
+            aspect=StructuredPropertyDefinitionClass(
+                qualifiedName=self.fqn,
+                valueType=Urn.make_data_type_urn(self.type),
+                displayName=self.display_name,
+                description=self.description,
+                entityTypes=[
+                    Urn.make_entity_type_urn(entity_type)
+                    for entity_type in self.entity_types or []
+                ],
+                cardinality=self.cardinality,
+                immutable=self.immutable,
+                allowedValues=(
+                    [
+                        PropertyValueClass(value=v.value, description=v.description)
+                        for v in self.allowed_values
+                    ]
+                    if self.allowed_values
+                    else None
+                ),
+                typeQualifier=(
+                    {"allowedTypes": self.type_qualifier.allowed_types}
+                    if self.type_qualifier
+                    else None
+                ),
+            ),
+        )
+        return [mcp]
+
+    @staticmethod
+    def create(file: str, graph: DataHubGraph) -> None:
+        # TODO: Deprecate this method.
+        structuredproperties = StructuredProperties.from_yaml(file)
+        for structuredproperty in structuredproperties:
+            for mcp in structuredproperty.generate_mcps():
+                graph.emit_mcp(mcp)
+
+            logger.info(f"Created structured property {structuredproperty.urn}")
+
+    @classmethod
+    def from_datahub(cls, graph: DataHubGraph, urn: str) -> "StructuredProperties":
+        structured_property: Optional[
+            StructuredPropertyDefinitionClass
+        ] = graph.get_aspect(urn, StructuredPropertyDefinitionClass)
+        if structured_property is None:
+            raise Exception(
+                "StructuredPropertyDefinition aspect is None. Unable to create structured property."
+            )
+        return StructuredProperties(
+            urn=urn,
+            qualified_name=structured_property.qualifiedName,
+            display_name=structured_property.displayName,
+            type=structured_property.valueType,
+            description=structured_property.description,
+            entity_types=structured_property.entityTypes,
+            cardinality=structured_property.cardinality,
+            allowed_values=(
+                [
+                    AllowedValue(
+                        value=av.value,
+                        description=av.description,
+                    )
+                    for av in structured_property.allowedValues or []
+                ]
+                if structured_property.allowedValues is not None
+                else None
+            ),
+            type_qualifier=(
+                {"allowed_types": structured_property.typeQualifier.get("allowedTypes")}
+                if structured_property.typeQualifier
+                else None
+            ),
+        )
 
     def to_yaml(
         self,
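
The net effect of the rewrite above is that StructuredProperties no longer needs a process-global graph client: entity-type validation runs against the static URN_TYPES registry, and emission is split into from_yaml/generate_mcps, with create() kept as a thin wrapper. A hedged usage sketch of the new flow (the YAML file name and server URL are assumptions):

    from datahub.api.entities.structuredproperties.structuredproperties import (
        StructuredProperties,
    )
    from datahub.emitter.rest_emitter import DatahubRestEmitter

    emitter = DatahubRestEmitter(gms_server="http://localhost:8080")  # assumed local GMS
    for prop in StructuredProperties.from_yaml("structured_properties.yaml"):
        for mcp in prop.generate_mcps():
            emitter.emit_mcp(mcp)
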
datahub/cli/cli_utils.py
CHANGED

@@ -327,6 +327,8 @@ def _ensure_valid_gms_url_acryl_cloud(url: str) -> str:
         url = f"{url}/gms"
     elif url.endswith("acryl.io/"):
         url = f"{url}gms"
+    if url.endswith("acryl.io/api/gms"):
+        url = url.replace("acryl.io/api/gms", "acryl.io/gms")
 
     return url
 
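
The added branch normalizes hosted DataHub URLs that point at the API proxy path. An illustration of just that rewrite rule, outside the surrounding function (the hostname is a made-up example):

    url = "https://customer.acryl.io/api/gms"
    if url.endswith("acryl.io/api/gms"):
        url = url.replace("acryl.io/api/gms", "acryl.io/gms")
    assert url == "https://customer.acryl.io/gms"
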
datahub/cli/delete_cli.py
CHANGED

@@ -1,4 +1,5 @@
 import logging
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass
 from datetime import datetime
 from random import choices
@@ -214,14 +215,47 @@ def references(urn: str, dry_run: bool, force: bool) -> None:
 
 
 @delete.command()
-@click.option("--urn", required=
-def undo_by_filter(urn: str) -> None:
+@click.option("--urn", required=False, type=str, help="the urn of the entity")
+@click.option(
+    "-p",
+    "--platform",
+    required=False,
+    type=str,
+    help="Platform filter (e.g. snowflake)",
+)
+@click.option(
+    "-b",
+    "--batch-size",
+    required=False,
+    default=3000,
+    type=int,
+    help="Batch size when querying for entities to un-soft delete."
+    "Maximum 10000. Large batch sizes may cause timeouts.",
+)
+def undo_by_filter(
+    urn: Optional[str], platform: Optional[str], batch_size: int
+) -> None:
     """
-    Undo
+    Undo soft deletion by filters
     """
     graph = get_default_graph()
     logger.info(f"Using {graph}")
-    graph.set_soft_delete_status(urn=urn, delete=False)
+    if urn:
+        graph.set_soft_delete_status(urn=urn, delete=False)
+    else:
+        urns = list(
+            graph.get_urns_by_filter(
+                platform=platform,
+                query="*",
+                status=RemovedStatusFilter.ONLY_SOFT_DELETED,
+                batch_size=batch_size,
+            )
+        )
+        logger.info(f"Going to un-soft delete {len(urns)} urns")
+        urns_iter = progressbar.progressbar(urns, redirect_stdout=True)
+        for urn in urns_iter:
+            assert urn
+            graph.set_soft_delete_status(urn=urn, delete=False)
@@ -312,6 +346,9 @@ def undo_by_filter(urn: str) -> None:
     default=False,
     help="Only delete soft-deleted entities, for hard deletion",
 )
+@click.option(
+    "--workers", type=int, default=1, help="Num of workers to use for deletion."
+)
 @upgrade.check_upgrade
 @telemetry.with_telemetry()
 def by_filter(
@@ -329,6 +366,7 @@ def by_filter(
     batch_size: int,
     dry_run: bool,
     only_soft_deleted: bool,
+    workers: int = 1,
 ) -> None:
     """Delete metadata from datahub using a single urn or a combination of filters."""
 
@@ -349,16 +387,19 @@ def by_filter(
     # TODO: add some validation on entity_type
 
     if not force and not soft and not dry_run:
+        message = (
+            "Hard deletion will permanently delete data from DataHub and can be slow. "
+            "We generally recommend using soft deletes instead. "
+            "Do you want to continue?"
+        )
         if only_soft_deleted:
             click.confirm(
+                message,
                 abort=True,
             )
         else:
             click.confirm(
-                "We generally recommend using soft deletes instead. "
-                "Do you want to continue?",
+                message,
                 abort=True,
             )
@@ -429,26 +470,64 @@ def by_filter(
             abort=True,
         )
 
+    _delete_urns_parallel(
+        graph=graph,
+        urns=urns,
+        aspect_name=aspect,
+        soft=soft,
+        dry_run=dry_run,
+        delete_by_urn=delete_by_urn,
+        start_time=start_time,
+        end_time=end_time,
+        workers=workers,
+    )
+
+
+def _delete_urns_parallel(
+    graph: DataHubGraph,
+    urns: List[str],
+    delete_by_urn: bool,
+    start_time: Optional[datetime],
+    end_time: Optional[datetime],
+    aspect_name: Optional[str] = None,
+    soft: bool = True,
+    dry_run: bool = False,
+    workers: int = 1,
+) -> None:
     deletion_result = DeletionResult()
+
+    def process_urn(urn):
+        return _delete_one_urn(
+            graph=graph,
+            urn=urn,
+            aspect_name=aspect_name,
+            soft=soft,
+            dry_run=dry_run,
+            start_time=start_time,
+            end_time=end_time,
+        )
+
+    with PerfTimer() as timer, ThreadPoolExecutor(max_workers=workers) as executor:
+        future_to_urn = {executor.submit(process_urn, urn): urn for urn in urns}
+
+        completed_futures = as_completed(future_to_urn)
+        if not delete_by_urn and not dry_run:
+            futures_iter = progressbar.progressbar(
+                as_completed(future_to_urn),
+                max_value=len(future_to_urn),
+                redirect_stdout=True,
             )
+        else:
+            futures_iter = completed_futures
+
+        for future in futures_iter:
+            try:
+                one_result = future.result()
+                deletion_result.merge(one_result)
+            except Exception as e:
+                urn = future_to_urn[future]
+                click.secho(f"Error processing URN {urn}: {e}", fg="red")
 
-    # Report out a summary of the deletion result.
     click.echo(
         deletion_result.format_message(
            dry_run=dry_run, soft=soft, time_sec=timer.elapsed_seconds()
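
The new --workers option fans deletion out over a thread pool while keeping per-urn error reporting. A self-contained sketch of the same fan-out/fan-in pattern, with process() standing in for _delete_one_urn:

    from concurrent.futures import ThreadPoolExecutor, as_completed

    def process(urn: str) -> str:
        return f"processed {urn}"  # stand-in for _delete_one_urn

    urns = ["urn:li:dataset:a", "urn:li:dataset:b", "urn:li:dataset:c"]
    with ThreadPoolExecutor(max_workers=4) as executor:
        future_to_urn = {executor.submit(process, urn): urn for urn in urns}
        for future in as_completed(future_to_urn):
            try:
                print(future.result())
            except Exception as e:  # mirrors the CLI's per-urn error handling
                print(f"Error processing URN {future_to_urn[future]}: {e}")
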
datahub/cli/ingest_cli.py
CHANGED

@@ -27,6 +27,7 @@ from datahub.utilities.perf_timer import PerfTimer
 
 logger = logging.getLogger(__name__)
 
+INGEST_SRC_TABLE_COLUMNS = ["runId", "source", "startTime", "status", "URN"]
 RUNS_TABLE_COLUMNS = ["runId", "rows", "created at"]
 RUN_TABLE_COLUMNS = ["urn", "aspect name", "created at"]
 
@@ -437,6 +438,115 @@ def mcps(path: str) -> None:
     sys.exit(ret)
 
 
+@ingest.command()
+@click.argument("page_offset", type=int, default=0)
+@click.argument("page_size", type=int, default=100)
+@click.option("--urn", type=str, default=None, help="Filter by ingestion source URN.")
+@click.option(
+    "--source", type=str, default=None, help="Filter by ingestion source name."
+)
+@upgrade.check_upgrade
+@telemetry.with_telemetry()
+def list_source_runs(page_offset: int, page_size: int, urn: str, source: str) -> None:
+    """List ingestion source runs with their details, optionally filtered by URN or source."""
+
+    query = """
+        query listIngestionRuns($input: ListIngestionSourcesInput!) {
+            listIngestionSources(input: $input) {
+                ingestionSources {
+                    urn
+                    name
+                    executions {
+                        executionRequests {
+                            id
+                            result {
+                                startTimeMs
+                                status
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    """
+
+    # filter by urn and/or source using CONTAINS
+    filters = []
+    if urn:
+        filters.append({"field": "urn", "values": [urn], "condition": "CONTAIN"})
+    if source:
+        filters.append({"field": "name", "values": [source], "condition": "CONTAIN"})
+
+    variables = {
+        "input": {
+            "start": page_offset,
+            "count": page_size,
+            "filters": filters,
+        }
+    }
+
+    client = get_default_graph()
+    session = client._session
+    gms_host = client.config.server
+
+    url = f"{gms_host}/api/graphql"
+    try:
+        response = session.post(url, json={"query": query, "variables": variables})
+        response.raise_for_status()
+    except Exception as e:
+        click.echo(f"Error fetching data: {str(e)}")
+        return
+
+    try:
+        data = response.json()
+    except ValueError:
+        click.echo("Failed to parse JSON response from server.")
+        return
+
+    if not data:
+        click.echo("No response received from the server.")
+        return
+
+    # when urn or source filter does not match, exit gracefully
+    if (
+        not isinstance(data.get("data"), dict)
+        or "listIngestionSources" not in data["data"]
+    ):
+        click.echo("No matching ingestion sources found. Please check your filters.")
+        return
+
+    ingestion_sources = data["data"]["listIngestionSources"]["ingestionSources"]
+    if not ingestion_sources:
+        click.echo("No ingestion sources or executions found.")
+        return
+
+    rows = []
+    for ingestion_source in ingestion_sources:
+        urn = ingestion_source.get("urn", "N/A")
+        name = ingestion_source.get("name", "N/A")
+
+        executions = ingestion_source.get("executions", {}).get("executionRequests", [])
+        for execution in executions:
+            execution_id = execution.get("id", "N/A")
+            start_time = execution.get("result", {}).get("startTimeMs", "N/A")
+            start_time = (
+                datetime.fromtimestamp(start_time / 1000).strftime("%Y-%m-%d %H:%M:%S")
+                if start_time != "N/A"
+                else "N/A"
+            )
+            status = execution.get("result", {}).get("status", "N/A")
+
+            rows.append([execution_id, name, start_time, status, urn])
+
+    click.echo(
+        tabulate(
+            rows,
+            headers=INGEST_SRC_TABLE_COLUMNS,
+            tablefmt="grid",
+        )
+    )
+
+
 @ingest.command()
 @click.argument("page_offset", type=int, default=0)
 @click.argument("page_size", type=int, default=100)
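
The new list_source_runs command is a thin wrapper over the GraphQL listIngestionSources API shown above. A hedged standalone equivalent of its request; the server URL and the absence of auth are assumptions (a real deployment typically needs an Authorization header):

    import requests

    query = """
    query listIngestionRuns($input: ListIngestionSourcesInput!) {
      listIngestionSources(input: $input) {
        ingestionSources { urn name }
      }
    }
    """
    variables = {"input": {"start": 0, "count": 10, "filters": []}}
    resp = requests.post(
        "http://localhost:8080/api/graphql",  # assumed local quickstart GMS
        json={"query": query, "variables": variables},
    )
    resp.raise_for_status()
    print(resp.json()["data"]["listIngestionSources"]["ingestionSources"])
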
datahub/cli/put_cli.py
CHANGED

@@ -105,7 +105,7 @@ def platform(
     """
 
     if name.startswith(f"urn:li:{DataPlatformUrn.ENTITY_TYPE}"):
-        platform_urn = DataPlatformUrn.
+        platform_urn = DataPlatformUrn.from_string(name)
        platform_name = platform_urn.get_entity_id_as_string()
     else:
         platform_name = name.lower()
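
The change above moves platform-urn parsing to from_string. A small sketch of the round-trip the command relies on (the platform value is illustrative):

    from datahub.metadata.urns import DataPlatformUrn

    platform_urn = DataPlatformUrn.from_string("urn:li:dataPlatform:snowflake")
    assert platform_urn.get_entity_id_as_string() == "snowflake"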