acryl-datahub 0.15.0.1rc16__py3-none-any.whl → 0.15.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/METADATA +2462 -2460
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/RECORD +214 -210
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/WHEEL +1 -1
- datahub/__init__.py +1 -1
- datahub/api/entities/assertion/assertion_operator.py +3 -5
- datahub/api/entities/corpgroup/corpgroup.py +1 -1
- datahub/api/entities/datacontract/assertion_operator.py +3 -5
- datahub/api/entities/dataproduct/dataproduct.py +4 -4
- datahub/api/entities/dataset/dataset.py +2 -1
- datahub/api/entities/structuredproperties/structuredproperties.py +18 -7
- datahub/cli/cli_utils.py +13 -2
- datahub/cli/delete_cli.py +3 -3
- datahub/cli/docker_cli.py +6 -6
- datahub/cli/ingest_cli.py +25 -15
- datahub/cli/lite_cli.py +2 -2
- datahub/cli/migrate.py +5 -5
- datahub/cli/specific/assertions_cli.py +3 -3
- datahub/cli/specific/structuredproperties_cli.py +84 -0
- datahub/cli/timeline_cli.py +1 -1
- datahub/configuration/common.py +1 -2
- datahub/configuration/config_loader.py +73 -50
- datahub/configuration/git.py +2 -2
- datahub/configuration/time_window_config.py +10 -5
- datahub/emitter/mce_builder.py +4 -8
- datahub/emitter/mcp_builder.py +27 -0
- datahub/emitter/mcp_patch_builder.py +1 -2
- datahub/emitter/rest_emitter.py +141 -93
- datahub/entrypoints.py +6 -0
- datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +5 -3
- datahub/ingestion/api/incremental_lineage_helper.py +2 -8
- datahub/ingestion/api/report.py +1 -2
- datahub/ingestion/api/source.py +8 -2
- datahub/ingestion/api/source_helpers.py +1 -1
- datahub/ingestion/extractor/json_schema_util.py +3 -3
- datahub/ingestion/extractor/schema_util.py +3 -5
- datahub/ingestion/fs/s3_fs.py +3 -3
- datahub/ingestion/glossary/classifier.py +2 -3
- datahub/ingestion/glossary/datahub_classifier.py +6 -4
- datahub/ingestion/graph/client.py +22 -19
- datahub/ingestion/graph/config.py +1 -1
- datahub/ingestion/run/pipeline.py +8 -7
- datahub/ingestion/run/pipeline_config.py +3 -3
- datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/abs/source.py +19 -8
- datahub/ingestion/source/aws/glue.py +77 -47
- datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
- datahub/ingestion/source/aws/s3_util.py +24 -1
- datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
- datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery.py +34 -34
- datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
- datahub/ingestion/source/bigquery_v2/bigquery_config.py +14 -6
- datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +22 -16
- datahub/ingestion/source/bigquery_v2/lineage.py +16 -16
- datahub/ingestion/source/bigquery_v2/queries.py +1 -3
- datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
- datahub/ingestion/source/bigquery_v2/usage.py +60 -60
- datahub/ingestion/source/cassandra/cassandra.py +0 -1
- datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
- datahub/ingestion/source/cassandra/cassandra_utils.py +4 -7
- datahub/ingestion/source/confluent_schema_registry.py +6 -6
- datahub/ingestion/source/csv_enricher.py +29 -29
- datahub/ingestion/source/datahub/config.py +20 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +7 -19
- datahub/ingestion/source/datahub/datahub_source.py +13 -3
- datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
- datahub/ingestion/source/dbt/dbt_common.py +9 -7
- datahub/ingestion/source/delta_lake/source.py +0 -5
- datahub/ingestion/source/demo_data.py +1 -1
- datahub/ingestion/source/dremio/dremio_api.py +4 -4
- datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
- datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
- datahub/ingestion/source/dremio/dremio_source.py +2 -2
- datahub/ingestion/source/elastic_search.py +4 -4
- datahub/ingestion/source/fivetran/fivetran.py +1 -6
- datahub/ingestion/source/gc/datahub_gc.py +11 -14
- datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
- datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +48 -15
- datahub/ingestion/source/gcs/gcs_source.py +3 -2
- datahub/ingestion/source/ge_data_profiler.py +2 -5
- datahub/ingestion/source/ge_profiling_config.py +3 -3
- datahub/ingestion/source/iceberg/iceberg.py +13 -6
- datahub/ingestion/source/iceberg/iceberg_common.py +49 -9
- datahub/ingestion/source/iceberg/iceberg_profiler.py +3 -1
- datahub/ingestion/source/identity/azure_ad.py +3 -3
- datahub/ingestion/source/identity/okta.py +3 -3
- datahub/ingestion/source/kafka/kafka.py +11 -9
- datahub/ingestion/source/kafka_connect/kafka_connect.py +3 -9
- datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
- datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
- datahub/ingestion/source/looker/looker_common.py +19 -19
- datahub/ingestion/source/looker/looker_config.py +11 -6
- datahub/ingestion/source/looker/looker_source.py +25 -25
- datahub/ingestion/source/looker/looker_template_language.py +3 -3
- datahub/ingestion/source/looker/looker_usage.py +5 -7
- datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
- datahub/ingestion/source/looker/lookml_source.py +13 -15
- datahub/ingestion/source/looker/view_upstream.py +5 -5
- datahub/ingestion/source/metabase.py +1 -6
- datahub/ingestion/source/mlflow.py +4 -9
- datahub/ingestion/source/mode.py +5 -5
- datahub/ingestion/source/mongodb.py +6 -4
- datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
- datahub/ingestion/source/nifi.py +24 -31
- datahub/ingestion/source/openapi.py +9 -9
- datahub/ingestion/source/powerbi/config.py +12 -12
- datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
- datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
- datahub/ingestion/source/powerbi/powerbi.py +6 -6
- datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
- datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
- datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
- datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
- datahub/ingestion/source/redash.py +0 -5
- datahub/ingestion/source/redshift/config.py +3 -3
- datahub/ingestion/source/redshift/redshift.py +45 -46
- datahub/ingestion/source/redshift/usage.py +33 -33
- datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
- datahub/ingestion/source/s3/source.py +11 -15
- datahub/ingestion/source/salesforce.py +26 -25
- datahub/ingestion/source/schema/json_schema.py +1 -1
- datahub/ingestion/source/sigma/sigma.py +3 -3
- datahub/ingestion/source/sigma/sigma_api.py +12 -10
- datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
- datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
- datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -0
- datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
- datahub/ingestion/source/snowflake/snowflake_schema.py +8 -5
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +136 -42
- datahub/ingestion/source/snowflake/snowflake_tag.py +21 -11
- datahub/ingestion/source/snowflake/snowflake_usage_v2.py +49 -50
- datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
- datahub/ingestion/source/snowflake/snowflake_v2.py +51 -47
- datahub/ingestion/source/sql/athena.py +1 -3
- datahub/ingestion/source/sql/clickhouse.py +8 -14
- datahub/ingestion/source/sql/oracle.py +1 -3
- datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
- datahub/ingestion/source/sql/sql_types.py +1 -2
- datahub/ingestion/source/sql/sql_utils.py +5 -0
- datahub/ingestion/source/sql/teradata.py +18 -5
- datahub/ingestion/source/state/profiling_state_handler.py +3 -3
- datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
- datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
- datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
- datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
- datahub/ingestion/source/superset.py +1 -6
- datahub/ingestion/source/tableau/tableau.py +343 -117
- datahub/ingestion/source/tableau/tableau_common.py +5 -2
- datahub/ingestion/source/unity/config.py +3 -1
- datahub/ingestion/source/unity/proxy.py +1 -1
- datahub/ingestion/source/unity/source.py +74 -78
- datahub/ingestion/source/unity/usage.py +3 -1
- datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
- datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
- datahub/ingestion/source/usage/usage_common.py +1 -1
- datahub/ingestion/source_report/ingestion_stage.py +24 -20
- datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
- datahub/ingestion/transformer/add_dataset_properties.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
- datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
- datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
- datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
- datahub/ingestion/transformer/tags_to_terms.py +7 -7
- datahub/integrations/assertion/snowflake/compiler.py +10 -10
- datahub/lite/duckdb_lite.py +12 -10
- datahub/metadata/_schema_classes.py +317 -44
- datahub/metadata/_urns/urn_defs.py +69 -15
- datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
- datahub/metadata/com/linkedin/pegasus2avro/versionset/__init__.py +17 -0
- datahub/metadata/schema.avsc +302 -89
- datahub/metadata/schemas/DataFlowKey.avsc +1 -0
- datahub/metadata/schemas/DataJobKey.avsc +1 -0
- datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
- datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
- datahub/metadata/schemas/DatasetKey.avsc +2 -1
- datahub/metadata/schemas/MLFeatureProperties.avsc +51 -0
- datahub/metadata/schemas/MLModelDeploymentProperties.avsc +51 -0
- datahub/metadata/schemas/MLModelGroupProperties.avsc +96 -23
- datahub/metadata/schemas/MLModelKey.avsc +2 -1
- datahub/metadata/schemas/MLModelProperties.avsc +96 -48
- datahub/metadata/schemas/MLPrimaryKeyProperties.avsc +51 -0
- datahub/metadata/schemas/MetadataChangeEvent.avsc +98 -71
- datahub/metadata/schemas/VersionProperties.avsc +216 -0
- datahub/metadata/schemas/VersionSetKey.avsc +26 -0
- datahub/metadata/schemas/VersionSetProperties.avsc +49 -0
- datahub/secret/datahub_secrets_client.py +12 -21
- datahub/secret/secret_common.py +14 -8
- datahub/specific/aspect_helpers/custom_properties.py +1 -2
- datahub/sql_parsing/schema_resolver.py +5 -10
- datahub/sql_parsing/sql_parsing_aggregator.py +26 -21
- datahub/sql_parsing/sqlglot_lineage.py +3 -3
- datahub/sql_parsing/sqlglot_utils.py +1 -1
- datahub/telemetry/stats.py +1 -2
- datahub/testing/mcp_diff.py +1 -1
- datahub/utilities/file_backed_collections.py +11 -11
- datahub/utilities/hive_schema_to_avro.py +2 -2
- datahub/utilities/logging_manager.py +2 -2
- datahub/utilities/lossy_collections.py +3 -3
- datahub/utilities/mapping.py +3 -3
- datahub/utilities/memory_footprint.py +3 -2
- datahub/utilities/perf_timer.py +11 -6
- datahub/utilities/serialized_lru_cache.py +3 -1
- datahub/utilities/sqlalchemy_query_combiner.py +6 -6
- datahub/utilities/sqllineage_patch.py +1 -1
- datahub/utilities/stats_collections.py +3 -1
- datahub/utilities/urns/_urn_base.py +28 -5
- datahub/utilities/urns/urn_iter.py +2 -2
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2.dist-info}/top_level.txt +0 -0
datahub/__init__.py
CHANGED
datahub/api/entities/assertion/assertion_operator.py
CHANGED
@@ -20,15 +20,13 @@ class Operator(Protocol):
 
     operator: str
 
-    def id(self) -> str:
-        ...
+    def id(self) -> str: ...
 
-    def generate_parameters(self) -> AssertionStdParametersClass:
-        ...
+    def generate_parameters(self) -> AssertionStdParametersClass: ...
 
 
 def _generate_assertion_std_parameter(
-    value: Union[str, int, float, list]
+    value: Union[str, int, float, list],
 ) -> AssertionStdParameterClass:
     if isinstance(value, str):
         return AssertionStdParameterClass(
datahub/api/entities/datacontract/assertion_operator.py
CHANGED
@@ -19,15 +19,13 @@ class Operator(Protocol):
 
     operator: str
 
-    def id(self) -> str:
-        ...
+    def id(self) -> str: ...
 
-    def generate_parameters(self) -> AssertionStdParametersClass:
-        ...
+    def generate_parameters(self) -> AssertionStdParametersClass: ...
 
 
 def _generate_assertion_std_parameter(
-    value: Union[str, int, float]
+    value: Union[str, int, float],
 ) -> AssertionStdParameterClass:
     if isinstance(value, str):
         return AssertionStdParameterClass(
datahub/api/entities/dataproduct/dataproduct.py
CHANGED
@@ -321,9 +321,9 @@ class DataProduct(ConfigModel):
 
     @classmethod
     def from_datahub(cls, graph: DataHubGraph, id: str) -> DataProduct:
-        data_product_properties: Optional[
-            DataProductPropertiesClass
-        ] = graph.get_aspect(id, DataProductPropertiesClass)
+        data_product_properties: Optional[DataProductPropertiesClass] = (
+            graph.get_aspect(id, DataProductPropertiesClass)
+        )
         domains: Optional[DomainsClass] = graph.get_aspect(id, DomainsClass)
         assert domains, "Data Product must have an associated domain. Found none."
         owners: Optional[OwnershipClass] = graph.get_aspect(id, OwnershipClass)
@@ -438,7 +438,7 @@ class DataProduct(ConfigModel):
         for replace_index, replace_value in patches_replace.items():
             list_to_manipulate[replace_index] = replace_value
 
-        for drop_index, drop_value in patches_drop.items():
+        for drop_value in patches_drop.values():
            list_to_manipulate.remove(drop_value)
 
         for add_value in patches_add:
datahub/api/entities/dataset/dataset.py
CHANGED
@@ -266,7 +266,8 @@ class Dataset(BaseModel):
         if self.schema_metadata.fields:
             for field in self.schema_metadata.fields:
                 field_urn = field.urn or make_schema_field_urn(
-                    self.urn, field.id  # type: ignore[arg-type]
+                    self.urn,  # type: ignore[arg-type]
+                    field.id,  # type: ignore[arg-type]
                 )
                 assert field_urn.startswith("urn:li:schemaField:")
 
datahub/api/entities/structuredproperties/structuredproperties.py
CHANGED
@@ -1,7 +1,7 @@
 import logging
 from enum import Enum
 from pathlib import Path
-from typing import List, Optional
+from typing import Iterable, List, Optional
 
 import yaml
 from pydantic import validator
@@ -118,9 +118,9 @@ class StructuredProperties(ConfigModel):
         id = StructuredPropertyUrn.from_string(self.urn).id
         if self.qualified_name is not None:
             # ensure that qualified name and ID match
-            assert (
-                self.qualified_name == id
-            ), "ID in the urn and the qualified_name must match"
+            assert self.qualified_name == id, (
+                "ID in the urn and the qualified_name must match"
+            )
         return id
 
     @validator("urn", pre=True, always=True)
@@ -184,9 +184,9 @@ class StructuredProperties(ConfigModel):
 
     @classmethod
     def from_datahub(cls, graph: DataHubGraph, urn: str) -> "StructuredProperties":
-        structured_property: Optional[
-            StructuredPropertyDefinitionClass
-        ] = graph.get_aspect(urn, StructuredPropertyDefinitionClass)
+        structured_property: Optional[StructuredPropertyDefinitionClass] = (
+            graph.get_aspect(urn, StructuredPropertyDefinitionClass)
+        )
         if structured_property is None:
             raise Exception(
                 "StructuredPropertyDefinition aspect is None. Unable to create structured property."
@@ -226,3 +226,14 @@ class StructuredProperties(ConfigModel):
         yaml.indent(mapping=2, sequence=4, offset=2)
         yaml.default_flow_style = False
         yaml.dump(self.dict(), fp)
+
+    @staticmethod
+    def list_urns(graph: DataHubGraph) -> Iterable[str]:
+        return graph.get_urns_by_filter(
+            entity_types=["structuredProperty"],
+        )
+
+    @staticmethod
+    def list(graph: DataHubGraph) -> Iterable["StructuredProperties"]:
+        for urn in StructuredProperties.list_urns(graph):
+            yield StructuredProperties.from_datahub(graph, urn)
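
The two new static methods are thin wrappers over the graph client, which makes the listing flow easy to see in isolation. A minimal usage sketch, assuming a reachable DataHub instance (the server URL and import paths here are illustrative):

from datahub.api.entities.structuredproperties.structuredproperties import (
    StructuredProperties,
)
from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig

graph = DataHubGraph(DatahubClientConfig(server="http://localhost:8080"))

# Urns only: a single filtered search against the graph.
for urn in StructuredProperties.list_urns(graph):
    print(urn)

# Full definitions: list() hydrates each urn via from_datahub, so it costs
# one aspect fetch per property on top of the urn listing.
for prop in StructuredProperties.list(graph):
    print(prop.qualified_name)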
datahub/cli/cli_utils.py
CHANGED
@@ -3,7 +3,7 @@ import logging
 import time
 import typing
 from datetime import datetime
-from typing import Any, Dict, List, Optional, Tuple, Type, Union
+from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar, Union
 
 import click
 import requests
@@ -33,6 +33,15 @@ def first_non_null(ls: List[Optional[str]]) -> Optional[str]:
     return next((el for el in ls if el is not None and el.strip() != ""), None)
 
 
+_T = TypeVar("_T")
+
+
+def get_or_else(value: Optional[_T], default: _T) -> _T:
+    # Normally we'd use `value or default`. However, that runs into issues
+    # when value is falsey but not None.
+    return value if value is not None else default
+
+
 def parse_run_restli_response(response: requests.Response) -> dict:
     response_json = response.json()
     if response.status_code != 200:
@@ -321,6 +330,8 @@ def get_frontend_session_login_as(
 def _ensure_valid_gms_url_acryl_cloud(url: str) -> str:
     if "acryl.io" not in url:
         return url
+    if url.endswith(":8080"):
+        url = url.replace(":8080", "")
     if url.startswith("http://"):
         url = url.replace("http://", "https://")
     if url.endswith("acryl.io"):
@@ -401,7 +412,7 @@ def generate_access_token(
 def ensure_has_system_metadata(
     event: Union[
         MetadataChangeProposal, MetadataChangeProposalWrapper, MetadataChangeEvent
-    ]
+    ],
 ) -> None:
     if event.systemMetadata is None:
         event.systemMetadata = SystemMetadataClass()
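
The comment inside get_or_else is the whole point of the helper and is worth a concrete illustration: `value or default` treats every falsey value as missing, while the new helper only falls back on None. A small sketch:

timeout = 0  # falsey, but a deliberate, valid setting

print(timeout or 30)             # 30 -- the 0 is silently discarded
print(get_or_else(timeout, 30))  # 0  -- only None triggers the default
print(get_or_else(None, 30))     # 30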
datahub/cli/delete_cli.py
CHANGED
@@ -1,8 +1,8 @@
 import logging
+import random
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass
 from datetime import datetime
-from random import choices
 from typing import Dict, List, Optional
 
 import click
@@ -457,11 +457,11 @@ def by_filter(
         click.echo("Found urns of multiple entity types")
         for entity_type, entity_urns in urns_by_type.items():
             click.echo(
-                f"- {len(entity_urns)} {entity_type} urn(s). Sample: {choices(entity_urns, k=min(5, len(entity_urns)))}"
+                f"- {len(entity_urns)} {entity_type} urn(s). Sample: {random.sample(entity_urns, k=min(5, len(entity_urns)))}"
             )
     else:
         click.echo(
-            f"Found {len(urns)} {entity_type} urn(s). Sample: {choices(urns, k=min(5, len(urns)))}"
+            f"Found {len(urns)} {entity_type} urn(s). Sample: {random.sample(urns, k=min(5, len(urns)))}"
         )
 
     if not force and not dry_run:
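
The switch away from `from random import choices` is behavioral, not cosmetic: `random.choices` draws with replacement, so the printed "sample" of urns could contain duplicates, while `random.sample` returns distinct entries (and the `min()` guard keeps k within the population size, as `sample` requires). For example:

import random

urns = ["urn:li:dataset:a", "urn:li:dataset:b", "urn:li:dataset:c"]

random.choices(urns, k=3)                 # may repeat, e.g. ['...a', '...a', '...c']
random.sample(urns, k=min(5, len(urns)))  # distinct urns, mirroring the new CLI output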
datahub/cli/docker_cli.py
CHANGED
@@ -296,9 +296,9 @@ def _restore(
     restore_indices: Optional[bool],
     primary_restore_file: Optional[str],
 ) -> int:
-    assert (
-        restore_primary or restore_indices
-    ), "Either restore_primary or restore_indices must be set"
+    assert restore_primary or restore_indices, (
+        "Either restore_primary or restore_indices must be set"
+    )
     msg = "datahub> "
     if restore_primary:
         msg += f"Will restore primary database from {primary_restore_file}. "
@@ -314,9 +314,9 @@ def _restore(
         assert primary_restore_file
         resolved_restore_file = os.path.expanduser(primary_restore_file)
         logger.info(f"Restoring primary db from backup at {resolved_restore_file}")
-        assert os.path.exists(
-            resolved_restore_file
-        ), f"File {resolved_restore_file} does not exist"
+        assert os.path.exists(resolved_restore_file), (
+            f"File {resolved_restore_file} does not exist"
+        )
         with open(resolved_restore_file) as fp:
             result = subprocess.run(
                 [
datahub/cli/ingest_cli.py
CHANGED
@@ -507,15 +507,11 @@ def list_source_runs(page_offset: int, page_size: int, urn: str, source: str) ->
         click.echo("No response received from the server.")
         return
 
-    #
-    ):
-        click.echo("No matching ingestion sources found. Please check your filters.")
-        return
+    # a lot of responses can be null if there's errors in the run
+    ingestion_sources = (
+        data.get("data", {}).get("listIngestionSources", {}).get("ingestionSources", [])
+    )
 
-    ingestion_sources = data["data"]["listIngestionSources"]["ingestionSources"]
     if not ingestion_sources:
         click.echo("No ingestion sources or executions found.")
         return
@@ -526,18 +522,32 @@ def list_source_runs(page_offset: int, page_size: int, urn: str, source: str) ->
         name = ingestion_source.get("name", "N/A")
 
         executions = ingestion_source.get("executions", {}).get("executionRequests", [])
+
         for execution in executions:
+            if execution is None:
+                continue
+
             execution_id = execution.get("id", "N/A")
+            result = execution.get("result") or {}
+            status = result.get("status", "N/A")
+
+            try:
+                start_time = (
+                    datetime.fromtimestamp(
+                        result.get("startTimeMs", 0) / 1000
+                    ).strftime("%Y-%m-%d %H:%M:%S")
+                    if status != "DUPLICATE" and result.get("startTimeMs") is not None
+                    else "N/A"
+                )
+            except (TypeError, ValueError):
+                start_time = "N/A"
 
             rows.append([execution_id, name, start_time, status, urn])
 
+    if not rows:
+        click.echo("No execution data found.")
+        return
+
     click.echo(
         tabulate(
             rows,
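
Both new patterns in this hunk are defensive lookups. Chained `.get(..., {})` calls degrade gracefully when a key is absent, and `or {}` additionally covers keys that are present but explicitly null, as partially-failed GraphQL responses often are. A reduced sketch of the two cases:

# Absent key: each level falls back to {} (or [] at the leaf) instead of raising.
data = {}
sources = (
    data.get("data", {}).get("listIngestionSources", {}).get("ingestionSources", [])
)
assert sources == []

# Present-but-null key: .get() returns the None, so `or {}` is still needed.
execution = {"result": None}
status = (execution.get("result") or {}).get("status", "N/A")
assert status == "N/A"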
datahub/cli/lite_cli.py
CHANGED
@@ -176,7 +176,7 @@ def get(
             )
         )
         end_time = time.time()
-        logger.debug(f"Time taken: {int((end_time - start_time)*1000.0)} millis")
+        logger.debug(f"Time taken: {int((end_time - start_time) * 1000.0)} millis")
 
 
 @lite.command()
@@ -228,7 +228,7 @@ def ls(path: Optional[str]) -> None:
     try:
         browseables = lite.ls(path)
         end_time = time.time()
-        logger.debug(f"Time taken: {int((end_time - start_time)*1000.0)} millis")
+        logger.debug(f"Time taken: {int((end_time - start_time) * 1000.0)} millis")
         auto_complete: List[AutoComplete] = [
             b.auto_complete for b in browseables if b.auto_complete is not None
         ]
datahub/cli/migrate.py
CHANGED
@@ -179,7 +179,7 @@ def dataplatform2instance_func(
 
     if not force and not dry_run:
         # get a confirmation from the operator before proceeding if this is not a dry run
-        sampled_urns_to_migrate = random.choices(
+        sampled_urns_to_migrate = random.sample(
             urns_to_migrate, k=min(10, len(urns_to_migrate))
         )
         sampled_new_urns: List[str] = [
@@ -193,7 +193,7 @@ def dataplatform2instance_func(
             if key
         ]
         click.echo(
-            f"Will migrate {len(urns_to_migrate)} urns such as {random.choices(urns_to_migrate, k=min(10, len(urns_to_migrate)))}"
+            f"Will migrate {len(urns_to_migrate)} urns such as {random.sample(urns_to_migrate, k=min(10, len(urns_to_migrate)))}"
         )
         click.echo(f"New urns will look like {sampled_new_urns}")
         click.confirm("Ok to proceed?", abort=True)
@@ -426,9 +426,9 @@ def batch_get_ids(
             entities_yielded += 1
             log.debug(f"yielding {x}")
             yield x
-        assert (
-            entities_yielded == num_entities
-        ), "Did not delete all entities, try running this command again!"
+        assert entities_yielded == num_entities, (
+            "Did not delete all entities, try running this command again!"
+        )
     else:
         log.error(f"Failed to execute batch get with {str(response.content)}")
         response.raise_for_status()
datahub/cli/specific/assertions_cli.py
CHANGED
@@ -136,9 +136,9 @@ def extras_list_to_dict(extras: List[str]) -> Dict[str, str]:
     extra_properties: Dict[str, str] = dict()
     for x in extras:
         parts = x.split("=")
-        assert (
-            len(parts) == 2
-        ), f"Invalid value for extras {x}, should be in format key=value"
+        assert len(parts) == 2, (
+            f"Invalid value for extras {x}, should be in format key=value"
+        )
         extra_properties[parts[0]] = parts[1]
     return extra_properties
 
datahub/cli/specific/structuredproperties_cli.py
CHANGED
@@ -1,9 +1,11 @@
 import json
 import logging
 from pathlib import Path
+from typing import Iterable
 
 import click
 from click_default_group import DefaultGroup
+from ruamel.yaml import YAML
 
 from datahub.api.entities.structuredproperties.structuredproperties import (
     StructuredProperties,
@@ -61,3 +63,85 @@ def get(urn: str, to_file: str) -> None:
         )
     else:
         click.secho(f"Structured property {urn} does not exist")
+
+
+@properties.command(
+    name="list",
+)
+@click.option("--details/--no-details", is_flag=True, default=True)
+@click.option("--to-file", required=False, type=str)
+@telemetry.with_telemetry()
+def list(details: bool, to_file: str) -> None:
+    """List structured properties in DataHub"""
+
+    def to_yaml_list(
+        objects: Iterable[StructuredProperties],  # iterable of objects to dump
+        file: Path,
+    ) -> None:
+        # if file exists, first we read it
+        yaml = YAML(typ="rt")  # default, if not specfied, is 'rt' (round-trip)
+        yaml.indent(mapping=2, sequence=4, offset=2)
+        yaml.default_flow_style = False
+        serialized_objects = []
+        if file.exists():
+            with open(file, "r") as fp:
+                existing_objects = yaml.load(fp)  # this is a list of dicts
+                existing_objects = [
+                    StructuredProperties.parse_obj(obj) for obj in existing_objects
+                ]
+            objects = [obj for obj in objects]
+            # do a positional update of the existing objects
+            existing_urns = {obj.urn for obj in existing_objects}
+            # existing_urns = {obj["urn"] if "urn" in obj else f"urn:li:structuredProperty:{obj['id']}" for obj in existing_objects}
+            for i, obj in enumerate(existing_objects):
+                # existing_urn = obj["urn"] if "urn" in obj else f"urn:li:structuredProperty:{obj['id']}"
+                existing_urn = obj.urn
+                # breakpoint()
+                if existing_urn in {obj.urn for obj in objects}:
+                    existing_objects[i] = next(
+                        obj.dict(exclude_unset=True, exclude_none=True)
+                        for obj in objects
+                        if obj.urn == existing_urn
+                    )
+            new_objects = [
+                obj.dict(exclude_unset=True, exclude_none=True)
+                for obj in objects
+                if obj.urn not in existing_urns
+            ]
+            serialized_objects = existing_objects + new_objects
+        else:
+            serialized_objects = [
+                obj.dict(exclude_unset=True, exclude_none=True) for obj in objects
+            ]
+
+        with open(file, "w") as fp:
+            yaml.dump(serialized_objects, fp)
+
+    with get_default_graph() as graph:
+        if details:
+            logger.info(
+                "Listing structured properties with details. Use --no-details for urns only"
+            )
+            structuredproperties = StructuredProperties.list(graph)
+            if to_file:
+                to_yaml_list(structuredproperties, Path(to_file))
+            else:
+                for structuredproperty in structuredproperties:
+                    click.secho(
+                        f"{json.dumps(structuredproperty.dict(exclude_unset=True, exclude_none=True), indent=2)}"
+                    )
+        else:
+            logger.info(
+                "Listing structured property urns only, use --details for more information"
+            )
+            structured_property_urns = StructuredProperties.list_urns(graph)
+            if to_file:
+                with open(to_file, "w") as f:
+                    for urn in structured_property_urns:
+                        f.write(f"{urn}\n")
+                click.secho(
+                    f"Structured property urns written to {to_file}", fg="green"
+                )
+            else:
+                for urn in structured_property_urns:
+                    click.secho(f"{urn}")
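
The merge performed by to_yaml_list is positional: entries already in the file keep their slot and are overwritten in place when the server returns the same urn, and urns the file has not seen are appended. A reduced sketch of that merge, with plain dicts standing in for StructuredProperties objects:

def merge_by_urn(existing: list, incoming: list) -> list:
    incoming_by_urn = {obj["urn"]: obj for obj in incoming}
    # Overwrite in place, preserving the file's ordering.
    merged = [incoming_by_urn.get(obj["urn"], obj) for obj in existing]
    existing_urns = {obj["urn"] for obj in existing}
    # Append anything not already in the file.
    merged += [obj for obj in incoming if obj["urn"] not in existing_urns]
    return merged

old = [{"urn": "urn:li:structuredProperty:io.acryl.a", "version": 1}]
new = [
    {"urn": "urn:li:structuredProperty:io.acryl.a", "version": 2},
    {"urn": "urn:li:structuredProperty:io.acryl.b", "version": 1},
]
assert merge_by_urn(old, new)[0]["version"] == 2
assert len(merge_by_urn(old, new)) == 2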
datahub/cli/timeline_cli.py
CHANGED
@@ -50,7 +50,7 @@ def pretty_id(id: Optional[str]) -> str:
     if id.startswith("urn:li:dataset"):
         dataset_key = dataset_urn_to_key(id)
         if dataset_key:
-            return f"{click.style('dataset', fg='cyan')}:{click.style(dataset_key.platform[len('urn:li:dataPlatform:'):], fg='white')}:{click.style(dataset_key.name, fg='white')}"
+            return f"{click.style('dataset', fg='cyan')}:{click.style(dataset_key.platform[len('urn:li:dataPlatform:') :], fg='white')}:{click.style(dataset_key.name, fg='white')}"
     # failed to prettify, return original
     return id
 
datahub/configuration/common.py
CHANGED
@@ -200,8 +200,7 @@ class IgnorableError(MetaError):
 
 @runtime_checkable
 class ExceptionWithProps(Protocol):
-    def get_telemetry_props(self) -> Dict[str, Any]:
-        ...
+    def get_telemetry_props(self) -> Dict[str, Any]: ...
 
 
 def should_show_stack_trace(exc: Exception) -> bool:
datahub/configuration/config_loader.py
CHANGED
@@ -19,64 +19,87 @@ from datahub.configuration.yaml import YamlConfigurationMechanism
 Environ = Mapping[str, str]
 
 
-def _resolve_element(element: str, environ: Environ) -> str:
-    if re.search(r"(\$\{).+(\})", element):
-        return expand(element, nounset=True, environ=environ)
-    elif element.startswith("$"):
-        try:
-            return expand(element, nounset=True, environ=environ)
-        except UnboundVariable:
-            return element
-    else:
-        return element
-
-
-def _resolve_list(ele_list: list, environ: Environ) -> list:
-    new_v: list = []
-    for ele in ele_list:
-        if isinstance(ele, str):
-            new_v.append(_resolve_element(ele, environ=environ))
-        elif isinstance(ele, list):
-            new_v.append(_resolve_list(ele, environ=environ))
-        elif isinstance(ele, dict):
-            new_v.append(resolve_env_variables(ele, environ=environ))
-        else:
-            new_v.append(ele)
-    return new_v
-
-
 def resolve_env_variables(config: dict, environ: Environ) -> dict:
-    new_dict: Dict[Any, Any] = {}
-    for k, v in config.items():
-        if isinstance(v, dict):
-            new_dict[k] = resolve_env_variables(v, environ=environ)
-        elif isinstance(v, list):
-            new_dict[k] = _resolve_list(v, environ=environ)
-        elif isinstance(v, str):
-            new_dict[k] = _resolve_element(v, environ=environ)
-        else:
-            new_dict[k] = v
-    return new_dict
+    # TODO: This is kept around for backwards compatibility.
+    return EnvResolver(environ).resolve(config)
 
 
 def list_referenced_env_variables(config: dict) -> Set[str]:
-    # This is a bit of a hack, but expandvars does a bunch of escaping
-    # and other logic that we don't want to duplicate here.
-
-    mock.get.side_effect = mock_get_env
-
+    # TODO: This is kept around for backwards compatibility.
+    return EnvResolver(environ=os.environ).list_referenced_variables(config)
+
+
+class EnvResolver:
+    def __init__(self, environ: Environ, strict_env_syntax: bool = False):
+        self.environ = environ
+        self.strict_env_syntax = strict_env_syntax
+
+    def resolve(self, config: dict) -> dict:
+        return self._resolve_dict(config)
+
+    @classmethod
+    def list_referenced_variables(
+        cls,
+        config: dict,
+        strict_env_syntax: bool = False,
+    ) -> Set[str]:
+        # This is a bit of a hack, but expandvars does a bunch of escaping
+        # and other logic that we don't want to duplicate here.
+
+        vars = set()
+
+        def mock_get_env(key: str, default: Optional[str] = None) -> str:
+            vars.add(key)
+            if default is not None:
+                return default
+            return "mocked_value"
+
+        mock = unittest.mock.MagicMock()
+        mock.get.side_effect = mock_get_env
+
+        resolver = EnvResolver(environ=mock, strict_env_syntax=strict_env_syntax)
+        resolver._resolve_dict(config)
+
+        return vars
+
+    def _resolve_element(self, element: str) -> str:
+        if re.search(r"(\$\{).+(\})", element):
+            return expand(element, nounset=True, environ=self.environ)
+        elif not self.strict_env_syntax and element.startswith("$"):
+            try:
+                return expand(element, nounset=True, environ=self.environ)
+            except UnboundVariable:
+                # TODO: This fallback is kept around for backwards compatibility, but
+                # doesn't make a ton of sense from first principles.
+                return element
+        else:
+            return element
+
+    def _resolve_list(self, ele_list: list) -> list:
+        new_v: list = []
+        for ele in ele_list:
+            if isinstance(ele, str):
+                new_v.append(self._resolve_element(ele))
+            elif isinstance(ele, list):
+                new_v.append(self._resolve_list(ele))
+            elif isinstance(ele, dict):
+                new_v.append(self._resolve_dict(ele))
+            else:
+                new_v.append(ele)
+        return new_v
+
+    def _resolve_dict(self, config: dict) -> dict:
+        new_dict: Dict[Any, Any] = {}
+        for k, v in config.items():
+            if isinstance(v, dict):
+                new_dict[k] = self._resolve_dict(v)
+            elif isinstance(v, list):
+                new_dict[k] = self._resolve_list(v)
+            elif isinstance(v, str):
+                new_dict[k] = self._resolve_element(v)
+            else:
+                new_dict[k] = v
+        return new_dict
 
 
 WRITE_TO_FILE_DIRECTIVE_PREFIX = "__DATAHUB_TO_FILE_"
@@ -159,7 +182,7 @@ def load_config_file(
 
     config = raw_config.copy()
    if resolve_env_vars:
-        config = resolve_env_variables(config, environ=os.environ)
+        config = EnvResolver(environ=os.environ).resolve(config)
     if process_directives:
         config = _process_directives(config)
 
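
The refactor keeps resolve_env_variables and list_referenced_env_variables as thin shims while moving the logic onto EnvResolver, which also gains the strict_env_syntax flag (when set, bare `$VAR` strings are left untouched and only `${VAR}` forms are expanded). A usage sketch, assuming the expandvars-style `${VAR:-default}` syntax the loader relies on:

import os

config = {
    "sink": {
        "type": "datahub-rest",
        "config": {"server": "${DATAHUB_GMS_URL}", "token": "${DATAHUB_TOKEN:-}"},
    }
}

# Static inspection: which variables does this recipe reference?
refs = EnvResolver.list_referenced_variables(config)
assert refs == {"DATAHUB_GMS_URL", "DATAHUB_TOKEN"}

# Resolution against the real environment; expand(nounset=True) raises
# UnboundVariable for unset variables that have no default.
resolved = EnvResolver(environ=os.environ).resolve(config)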
datahub/configuration/git.py
CHANGED
@@ -121,9 +121,9 @@ class GitInfo(GitReference):
 
         repo: str = values["repo"]
         if repo.startswith(_GITHUB_PREFIX):
-            return f"git@github.com:{repo[len(_GITHUB_PREFIX):]}.git"
+            return f"git@github.com:{repo[len(_GITHUB_PREFIX) :]}.git"
         elif repo.startswith(_GITLAB_PREFIX):
-            return f"git@gitlab.com:{repo[len(_GITLAB_PREFIX):]}.git"
+            return f"git@gitlab.com:{repo[len(_GITLAB_PREFIX) :]}.git"
         else:
             raise ValueError(
                 "Unable to infer repo_ssh_locator from repo. Please set repo_ssh_locator manually."