acryl-datahub 0.15.0.1rc17__py3-none-any.whl → 0.15.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic; see the registry page for details.

Files changed (211)
  1. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/METADATA +2440 -2438
  2. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/RECORD +211 -207
  3. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/WHEEL +1 -1
  4. datahub/__init__.py +1 -1
  5. datahub/api/entities/assertion/assertion_operator.py +3 -5
  6. datahub/api/entities/corpgroup/corpgroup.py +1 -1
  7. datahub/api/entities/datacontract/assertion_operator.py +3 -5
  8. datahub/api/entities/dataproduct/dataproduct.py +4 -4
  9. datahub/api/entities/dataset/dataset.py +2 -1
  10. datahub/api/entities/structuredproperties/structuredproperties.py +18 -7
  11. datahub/cli/cli_utils.py +13 -2
  12. datahub/cli/delete_cli.py +3 -3
  13. datahub/cli/docker_cli.py +6 -6
  14. datahub/cli/ingest_cli.py +25 -15
  15. datahub/cli/lite_cli.py +2 -2
  16. datahub/cli/migrate.py +5 -5
  17. datahub/cli/specific/assertions_cli.py +3 -3
  18. datahub/cli/specific/structuredproperties_cli.py +84 -0
  19. datahub/cli/timeline_cli.py +1 -1
  20. datahub/configuration/common.py +1 -2
  21. datahub/configuration/config_loader.py +73 -50
  22. datahub/configuration/git.py +2 -2
  23. datahub/configuration/time_window_config.py +10 -5
  24. datahub/emitter/mce_builder.py +4 -8
  25. datahub/emitter/mcp_builder.py +27 -0
  26. datahub/emitter/mcp_patch_builder.py +1 -2
  27. datahub/emitter/rest_emitter.py +126 -85
  28. datahub/entrypoints.py +6 -0
  29. datahub/ingestion/api/incremental_lineage_helper.py +2 -8
  30. datahub/ingestion/api/report.py +1 -2
  31. datahub/ingestion/api/source.py +4 -2
  32. datahub/ingestion/api/source_helpers.py +1 -1
  33. datahub/ingestion/extractor/json_schema_util.py +3 -3
  34. datahub/ingestion/extractor/schema_util.py +3 -5
  35. datahub/ingestion/fs/s3_fs.py +3 -3
  36. datahub/ingestion/glossary/datahub_classifier.py +6 -4
  37. datahub/ingestion/graph/client.py +22 -19
  38. datahub/ingestion/graph/config.py +1 -1
  39. datahub/ingestion/run/pipeline.py +8 -7
  40. datahub/ingestion/run/pipeline_config.py +3 -3
  41. datahub/ingestion/source/abs/datalake_profiler_config.py +3 -3
  42. datahub/ingestion/source/abs/source.py +19 -8
  43. datahub/ingestion/source/aws/glue.py +77 -47
  44. datahub/ingestion/source/aws/s3_boto_utils.py +3 -3
  45. datahub/ingestion/source/aws/s3_util.py +24 -1
  46. datahub/ingestion/source/aws/sagemaker_processors/feature_groups.py +1 -1
  47. datahub/ingestion/source/aws/sagemaker_processors/models.py +2 -2
  48. datahub/ingestion/source/bigquery_v2/bigquery.py +34 -34
  49. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +3 -3
  50. datahub/ingestion/source/bigquery_v2/bigquery_config.py +14 -6
  51. datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +8 -4
  52. datahub/ingestion/source/bigquery_v2/bigquery_report.py +0 -3
  53. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +22 -16
  54. datahub/ingestion/source/bigquery_v2/lineage.py +16 -16
  55. datahub/ingestion/source/bigquery_v2/queries.py +1 -3
  56. datahub/ingestion/source/bigquery_v2/queries_extractor.py +3 -3
  57. datahub/ingestion/source/bigquery_v2/usage.py +60 -60
  58. datahub/ingestion/source/cassandra/cassandra.py +0 -1
  59. datahub/ingestion/source/cassandra/cassandra_profiling.py +24 -24
  60. datahub/ingestion/source/cassandra/cassandra_utils.py +4 -7
  61. datahub/ingestion/source/confluent_schema_registry.py +6 -6
  62. datahub/ingestion/source/csv_enricher.py +29 -29
  63. datahub/ingestion/source/datahub/config.py +10 -0
  64. datahub/ingestion/source/datahub/datahub_database_reader.py +4 -2
  65. datahub/ingestion/source/datahub/datahub_source.py +12 -2
  66. datahub/ingestion/source/dbt/dbt_cloud.py +13 -13
  67. datahub/ingestion/source/dbt/dbt_common.py +9 -7
  68. datahub/ingestion/source/delta_lake/source.py +0 -5
  69. datahub/ingestion/source/demo_data.py +1 -1
  70. datahub/ingestion/source/dremio/dremio_api.py +4 -4
  71. datahub/ingestion/source/dremio/dremio_datahub_source_mapping.py +3 -3
  72. datahub/ingestion/source/dremio/dremio_reporting.py +0 -3
  73. datahub/ingestion/source/dremio/dremio_source.py +2 -2
  74. datahub/ingestion/source/elastic_search.py +4 -4
  75. datahub/ingestion/source/fivetran/fivetran.py +1 -6
  76. datahub/ingestion/source/gc/datahub_gc.py +11 -14
  77. datahub/ingestion/source/gc/execution_request_cleanup.py +31 -6
  78. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +48 -15
  79. datahub/ingestion/source/gcs/gcs_source.py +3 -2
  80. datahub/ingestion/source/ge_data_profiler.py +2 -5
  81. datahub/ingestion/source/ge_profiling_config.py +3 -3
  82. datahub/ingestion/source/iceberg/iceberg.py +13 -6
  83. datahub/ingestion/source/iceberg/iceberg_common.py +49 -9
  84. datahub/ingestion/source/iceberg/iceberg_profiler.py +3 -1
  85. datahub/ingestion/source/identity/azure_ad.py +3 -3
  86. datahub/ingestion/source/identity/okta.py +3 -3
  87. datahub/ingestion/source/kafka/kafka.py +11 -9
  88. datahub/ingestion/source/kafka_connect/kafka_connect.py +3 -9
  89. datahub/ingestion/source/kafka_connect/sink_connectors.py +3 -3
  90. datahub/ingestion/source/kafka_connect/source_connectors.py +3 -3
  91. datahub/ingestion/source/looker/looker_common.py +19 -19
  92. datahub/ingestion/source/looker/looker_config.py +11 -6
  93. datahub/ingestion/source/looker/looker_source.py +25 -25
  94. datahub/ingestion/source/looker/looker_template_language.py +3 -3
  95. datahub/ingestion/source/looker/looker_usage.py +5 -7
  96. datahub/ingestion/source/looker/lookml_concept_context.py +6 -6
  97. datahub/ingestion/source/looker/lookml_source.py +13 -15
  98. datahub/ingestion/source/looker/view_upstream.py +5 -5
  99. datahub/ingestion/source/metabase.py +1 -6
  100. datahub/ingestion/source/mlflow.py +4 -9
  101. datahub/ingestion/source/mode.py +5 -5
  102. datahub/ingestion/source/mongodb.py +6 -4
  103. datahub/ingestion/source/neo4j/neo4j_source.py +1 -1
  104. datahub/ingestion/source/nifi.py +24 -31
  105. datahub/ingestion/source/openapi.py +9 -9
  106. datahub/ingestion/source/powerbi/config.py +12 -12
  107. datahub/ingestion/source/powerbi/m_query/parser.py +11 -11
  108. datahub/ingestion/source/powerbi/m_query/pattern_handler.py +26 -24
  109. datahub/ingestion/source/powerbi/m_query/resolver.py +13 -13
  110. datahub/ingestion/source/powerbi/powerbi.py +6 -6
  111. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +9 -9
  112. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +7 -7
  113. datahub/ingestion/source/powerbi_report_server/report_server.py +1 -1
  114. datahub/ingestion/source/qlik_sense/qlik_api.py +1 -1
  115. datahub/ingestion/source/redash.py +0 -5
  116. datahub/ingestion/source/redshift/config.py +3 -3
  117. datahub/ingestion/source/redshift/redshift.py +45 -46
  118. datahub/ingestion/source/redshift/usage.py +33 -33
  119. datahub/ingestion/source/s3/datalake_profiler_config.py +3 -3
  120. datahub/ingestion/source/s3/source.py +11 -15
  121. datahub/ingestion/source/salesforce.py +26 -25
  122. datahub/ingestion/source/schema/json_schema.py +1 -1
  123. datahub/ingestion/source/sigma/sigma.py +3 -3
  124. datahub/ingestion/source/sigma/sigma_api.py +12 -10
  125. datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
  126. datahub/ingestion/source/snowflake/snowflake_connection.py +6 -6
  127. datahub/ingestion/source/snowflake/snowflake_queries.py +2 -2
  128. datahub/ingestion/source/snowflake/snowflake_report.py +0 -3
  129. datahub/ingestion/source/snowflake/snowflake_schema.py +8 -5
  130. datahub/ingestion/source/snowflake/snowflake_schema_gen.py +136 -42
  131. datahub/ingestion/source/snowflake/snowflake_tag.py +21 -11
  132. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +49 -50
  133. datahub/ingestion/source/snowflake/snowflake_utils.py +1 -2
  134. datahub/ingestion/source/snowflake/snowflake_v2.py +51 -47
  135. datahub/ingestion/source/sql/athena.py +1 -3
  136. datahub/ingestion/source/sql/clickhouse.py +8 -14
  137. datahub/ingestion/source/sql/oracle.py +1 -3
  138. datahub/ingestion/source/sql/sql_generic_profiler.py +1 -2
  139. datahub/ingestion/source/sql/sql_types.py +1 -2
  140. datahub/ingestion/source/sql/sql_utils.py +5 -0
  141. datahub/ingestion/source/sql/teradata.py +18 -5
  142. datahub/ingestion/source/state/profiling_state_handler.py +3 -3
  143. datahub/ingestion/source/state/redundant_run_skip_handler.py +5 -7
  144. datahub/ingestion/source/state/stale_entity_removal_handler.py +3 -3
  145. datahub/ingestion/source/state_provider/datahub_ingestion_checkpointing_provider.py +9 -9
  146. datahub/ingestion/source/state_provider/file_ingestion_checkpointing_provider.py +1 -1
  147. datahub/ingestion/source/superset.py +1 -6
  148. datahub/ingestion/source/tableau/tableau.py +343 -117
  149. datahub/ingestion/source/tableau/tableau_common.py +5 -2
  150. datahub/ingestion/source/unity/config.py +3 -1
  151. datahub/ingestion/source/unity/proxy.py +1 -1
  152. datahub/ingestion/source/unity/source.py +74 -74
  153. datahub/ingestion/source/unity/usage.py +3 -1
  154. datahub/ingestion/source/usage/clickhouse_usage.py +4 -4
  155. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -3
  156. datahub/ingestion/source/usage/usage_common.py +1 -1
  157. datahub/ingestion/source_report/ingestion_stage.py +24 -20
  158. datahub/ingestion/transformer/add_dataset_dataproduct.py +4 -4
  159. datahub/ingestion/transformer/add_dataset_properties.py +3 -3
  160. datahub/ingestion/transformer/add_dataset_schema_tags.py +3 -3
  161. datahub/ingestion/transformer/add_dataset_schema_terms.py +3 -3
  162. datahub/ingestion/transformer/dataset_domain_based_on_tags.py +4 -4
  163. datahub/ingestion/transformer/extract_ownership_from_tags.py +3 -3
  164. datahub/ingestion/transformer/tags_to_terms.py +7 -7
  165. datahub/integrations/assertion/snowflake/compiler.py +10 -10
  166. datahub/lite/duckdb_lite.py +12 -10
  167. datahub/metadata/_schema_classes.py +317 -44
  168. datahub/metadata/_urns/urn_defs.py +69 -15
  169. datahub/metadata/com/linkedin/pegasus2avro/common/__init__.py +2 -0
  170. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +2 -0
  171. datahub/metadata/com/linkedin/pegasus2avro/versionset/__init__.py +17 -0
  172. datahub/metadata/schema.avsc +302 -89
  173. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  174. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  175. datahub/metadata/schemas/DataProcessInstanceInput.avsc +4 -2
  176. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +2 -0
  177. datahub/metadata/schemas/DatasetKey.avsc +2 -1
  178. datahub/metadata/schemas/MLFeatureProperties.avsc +51 -0
  179. datahub/metadata/schemas/MLModelDeploymentProperties.avsc +51 -0
  180. datahub/metadata/schemas/MLModelGroupProperties.avsc +96 -23
  181. datahub/metadata/schemas/MLModelKey.avsc +2 -1
  182. datahub/metadata/schemas/MLModelProperties.avsc +96 -48
  183. datahub/metadata/schemas/MLPrimaryKeyProperties.avsc +51 -0
  184. datahub/metadata/schemas/MetadataChangeEvent.avsc +98 -71
  185. datahub/metadata/schemas/VersionProperties.avsc +216 -0
  186. datahub/metadata/schemas/VersionSetKey.avsc +26 -0
  187. datahub/metadata/schemas/VersionSetProperties.avsc +49 -0
  188. datahub/secret/datahub_secrets_client.py +12 -21
  189. datahub/secret/secret_common.py +14 -8
  190. datahub/specific/aspect_helpers/custom_properties.py +1 -2
  191. datahub/sql_parsing/schema_resolver.py +5 -10
  192. datahub/sql_parsing/sql_parsing_aggregator.py +18 -16
  193. datahub/sql_parsing/sqlglot_lineage.py +3 -3
  194. datahub/sql_parsing/sqlglot_utils.py +1 -1
  195. datahub/telemetry/stats.py +1 -2
  196. datahub/testing/mcp_diff.py +1 -1
  197. datahub/utilities/file_backed_collections.py +11 -11
  198. datahub/utilities/hive_schema_to_avro.py +2 -2
  199. datahub/utilities/logging_manager.py +2 -2
  200. datahub/utilities/lossy_collections.py +3 -3
  201. datahub/utilities/mapping.py +3 -3
  202. datahub/utilities/memory_footprint.py +3 -2
  203. datahub/utilities/perf_timer.py +11 -6
  204. datahub/utilities/serialized_lru_cache.py +3 -1
  205. datahub/utilities/sqlalchemy_query_combiner.py +6 -6
  206. datahub/utilities/sqllineage_patch.py +1 -1
  207. datahub/utilities/stats_collections.py +3 -1
  208. datahub/utilities/urns/_urn_base.py +28 -5
  209. datahub/utilities/urns/urn_iter.py +2 -2
  210. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/entry_points.txt +0 -0
  211. {acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/top_level.txt +0 -0
{acryl_datahub-0.15.0.1rc17.dist-info → acryl_datahub-0.15.0.2.dist-info}/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.7.0)
+Generator: setuptools (75.8.0)
 Root-Is-Purelib: true
 Tag: py3-none-any
 
datahub/__init__.py CHANGED
@@ -3,7 +3,7 @@ import warnings
 
 # Published at https://pypi.org/project/acryl-datahub/.
 __package_name__ = "acryl-datahub"
-__version__ = "0.15.0.1rc17"
+__version__ = "0.15.0.2"
 
 
 def is_dev_mode() -> bool:
datahub/api/entities/assertion/assertion_operator.py CHANGED
@@ -20,15 +20,13 @@ class Operator(Protocol):
 
     operator: str
 
-    def id(self) -> str:
-        ...
+    def id(self) -> str: ...
 
-    def generate_parameters(self) -> AssertionStdParametersClass:
-        ...
+    def generate_parameters(self) -> AssertionStdParametersClass: ...
 
 
 def _generate_assertion_std_parameter(
-    value: Union[str, int, float, list]
+    value: Union[str, int, float, list],
 ) -> AssertionStdParameterClass:
     if isinstance(value, str):
         return AssertionStdParameterClass(
datahub/api/entities/corpgroup/corpgroup.py CHANGED
@@ -114,7 +114,7 @@ class CorpGroup(BaseModel):
                 )
                 urns_created.add(m.urn)
             else:
-                logger.warn(
+                logger.warning(
                     f"Suppressing emission of member {m.urn} before we already emitted metadata for it"
                 )
 
datahub/api/entities/datacontract/assertion_operator.py CHANGED
@@ -19,15 +19,13 @@ class Operator(Protocol):
 
     operator: str
 
-    def id(self) -> str:
-        ...
+    def id(self) -> str: ...
 
-    def generate_parameters(self) -> AssertionStdParametersClass:
-        ...
+    def generate_parameters(self) -> AssertionStdParametersClass: ...
 
 
 def _generate_assertion_std_parameter(
-    value: Union[str, int, float]
+    value: Union[str, int, float],
 ) -> AssertionStdParameterClass:
     if isinstance(value, str):
         return AssertionStdParameterClass(
datahub/api/entities/dataproduct/dataproduct.py CHANGED
@@ -321,9 +321,9 @@ class DataProduct(ConfigModel):
 
     @classmethod
     def from_datahub(cls, graph: DataHubGraph, id: str) -> DataProduct:
-        data_product_properties: Optional[
-            DataProductPropertiesClass
-        ] = graph.get_aspect(id, DataProductPropertiesClass)
+        data_product_properties: Optional[DataProductPropertiesClass] = (
+            graph.get_aspect(id, DataProductPropertiesClass)
+        )
         domains: Optional[DomainsClass] = graph.get_aspect(id, DomainsClass)
         assert domains, "Data Product must have an associated domain. Found none."
         owners: Optional[OwnershipClass] = graph.get_aspect(id, OwnershipClass)
@@ -438,7 +438,7 @@ class DataProduct(ConfigModel):
         for replace_index, replace_value in patches_replace.items():
            list_to_manipulate[replace_index] = replace_value
 
-        for drop_index, drop_value in patches_drop.items():
+        for drop_value in patches_drop.values():
            list_to_manipulate.remove(drop_value)
 
        for add_value in patches_add:
datahub/api/entities/dataset/dataset.py CHANGED
@@ -266,7 +266,8 @@ class Dataset(BaseModel):
         if self.schema_metadata.fields:
             for field in self.schema_metadata.fields:
                 field_urn = field.urn or make_schema_field_urn(
-                    self.urn, field.id  # type: ignore[arg-type]
+                    self.urn,  # type: ignore[arg-type]
+                    field.id,  # type: ignore[arg-type]
                 )
                 assert field_urn.startswith("urn:li:schemaField:")
 
datahub/api/entities/structuredproperties/structuredproperties.py CHANGED
@@ -1,7 +1,7 @@
 import logging
 from enum import Enum
 from pathlib import Path
-from typing import List, Optional
+from typing import Iterable, List, Optional
 
 import yaml
 from pydantic import validator
@@ -118,9 +118,9 @@ class StructuredProperties(ConfigModel):
         id = StructuredPropertyUrn.from_string(self.urn).id
         if self.qualified_name is not None:
             # ensure that qualified name and ID match
-            assert (
-                self.qualified_name == id
-            ), "ID in the urn and the qualified_name must match"
+            assert self.qualified_name == id, (
+                "ID in the urn and the qualified_name must match"
+            )
         return id
 
     @validator("urn", pre=True, always=True)
@@ -184,9 +184,9 @@ class StructuredProperties(ConfigModel):
 
     @classmethod
     def from_datahub(cls, graph: DataHubGraph, urn: str) -> "StructuredProperties":
-        structured_property: Optional[
-            StructuredPropertyDefinitionClass
-        ] = graph.get_aspect(urn, StructuredPropertyDefinitionClass)
+        structured_property: Optional[StructuredPropertyDefinitionClass] = (
+            graph.get_aspect(urn, StructuredPropertyDefinitionClass)
+        )
         if structured_property is None:
             raise Exception(
                 "StructuredPropertyDefinition aspect is None. Unable to create structured property."
@@ -226,3 +226,14 @@ class StructuredProperties(ConfigModel):
         yaml.indent(mapping=2, sequence=4, offset=2)
         yaml.default_flow_style = False
         yaml.dump(self.dict(), fp)
+
+    @staticmethod
+    def list_urns(graph: DataHubGraph) -> Iterable[str]:
+        return graph.get_urns_by_filter(
+            entity_types=["structuredProperty"],
+        )
+
+    @staticmethod
+    def list(graph: DataHubGraph) -> Iterable["StructuredProperties"]:
+        for urn in StructuredProperties.list_urns(graph):
+            yield StructuredProperties.from_datahub(graph, urn)
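
Note on usage: the new list_urns/list helpers make the set of structured properties enumerable from code. A minimal sketch, assuming a reachable DataHub instance and the get_default_graph client factory that the CLI modules in this release import from datahub.ingestion.graph.client:

    from datahub.api.entities.structuredproperties.structuredproperties import (
        StructuredProperties,
    )
    from datahub.ingestion.graph.client import get_default_graph

    with get_default_graph() as graph:
        # list_urns() yields bare urns; list() additionally fetches the
        # StructuredPropertyDefinition aspect for each urn via from_datahub().
        for urn in StructuredProperties.list_urns(graph):
            print(urn)
        for prop in StructuredProperties.list(graph):
            print(prop.dict(exclude_unset=True, exclude_none=True))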
datahub/cli/cli_utils.py CHANGED
@@ -3,7 +3,7 @@ import logging
 import time
 import typing
 from datetime import datetime
-from typing import Any, Dict, List, Optional, Tuple, Type, Union
+from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar, Union
 
 import click
 import requests
@@ -33,6 +33,15 @@ def first_non_null(ls: List[Optional[str]]) -> Optional[str]:
     return next((el for el in ls if el is not None and el.strip() != ""), None)
 
 
+_T = TypeVar("_T")
+
+
+def get_or_else(value: Optional[_T], default: _T) -> _T:
+    # Normally we'd use `value or default`. However, that runs into issues
+    # when value is falsey but not None.
+    return value if value is not None else default
+
+
 def parse_run_restli_response(response: requests.Response) -> dict:
     response_json = response.json()
     if response.status_code != 200:
@@ -321,6 +330,8 @@ def get_frontend_session_login_as(
 def _ensure_valid_gms_url_acryl_cloud(url: str) -> str:
     if "acryl.io" not in url:
         return url
+    if url.endswith(":8080"):
+        url = url.replace(":8080", "")
     if url.startswith("http://"):
         url = url.replace("http://", "https://")
     if url.endswith("acryl.io"):
@@ -401,7 +412,7 @@ def generate_access_token(
 def ensure_has_system_metadata(
     event: Union[
         MetadataChangeProposal, MetadataChangeProposalWrapper, MetadataChangeEvent
-    ]
+    ],
 ) -> None:
     if event.systemMetadata is None:
         event.systemMetadata = SystemMetadataClass()
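
The new get_or_else helper exists because the idiomatic "value or default" falls back whenever value is falsey, not only when it is None. A standalone sketch of the distinction (plain Python, no DataHub imports needed):

    # `or` replaces any falsey value; the None-check keeps legitimate
    # falsey values like 0 and "" intact.
    def get_or_else(value, default):
        return value if value is not None else default

    assert (0 or 10) == 10             # 0 silently replaced by `or`
    assert get_or_else(0, 10) == 0     # 0 preserved
    assert ("" or "x") == "x"          # empty string silently replaced
    assert get_or_else("", "x") == ""  # empty string preserved
    assert get_or_else(None, 10) == 10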
datahub/cli/delete_cli.py CHANGED
@@ -1,8 +1,8 @@
 import logging
+import random
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass
 from datetime import datetime
-from random import choices
 from typing import Dict, List, Optional
 
 import click
@@ -457,11 +457,11 @@ def by_filter(
        click.echo("Found urns of multiple entity types")
        for entity_type, entity_urns in urns_by_type.items():
            click.echo(
-                f"- {len(entity_urns)} {entity_type} urn(s). Sample: {choices(entity_urns, k=min(5, len(entity_urns)))}"
+                f"- {len(entity_urns)} {entity_type} urn(s). Sample: {random.sample(entity_urns, k=min(5, len(entity_urns)))}"
            )
    else:
        click.echo(
-            f"Found {len(urns)} {entity_type} urn(s). Sample: {choices(urns, k=min(5, len(urns)))}"
+            f"Found {len(urns)} {entity_type} urn(s). Sample: {random.sample(urns, k=min(5, len(urns)))}"
        )
 
    if not force and not dry_run:
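
The choices-to-sample switch above is behavioral, not cosmetic: random.choices draws with replacement, so the printed "sample" of urns could contain the same urn more than once, while random.sample draws without replacement. A quick illustration:

    import random

    urns = ["urn:li:dataset:a", "urn:li:dataset:b", "urn:li:dataset:c"]
    random.seed(7)  # seeded only to make the run repeatable
    print(random.choices(urns, k=3))  # drawn with replacement; may repeat
    print(random.sample(urns, k=3))   # always three distinct urns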
datahub/cli/docker_cli.py CHANGED
@@ -296,9 +296,9 @@
     restore_indices: Optional[bool],
     primary_restore_file: Optional[str],
 ) -> int:
-    assert (
-        restore_primary or restore_indices
-    ), "Either restore_primary or restore_indices must be set"
+    assert restore_primary or restore_indices, (
+        "Either restore_primary or restore_indices must be set"
+    )
     msg = "datahub> "
     if restore_primary:
         msg += f"Will restore primary database from {primary_restore_file}. "
@@ -314,9 +314,9 @@
         assert primary_restore_file
         resolved_restore_file = os.path.expanduser(primary_restore_file)
         logger.info(f"Restoring primary db from backup at {resolved_restore_file}")
-        assert os.path.exists(
-            resolved_restore_file
-        ), f"File {resolved_restore_file} does not exist"
+        assert os.path.exists(resolved_restore_file), (
+            f"File {resolved_restore_file} does not exist"
+        )
         with open(resolved_restore_file) as fp:
             result = subprocess.run(
                 [
datahub/cli/ingest_cli.py CHANGED
@@ -507,15 +507,11 @@ def list_source_runs(page_offset: int, page_size: int, urn: str, source: str) ->
        click.echo("No response received from the server.")
        return
 
-    # when urn or source filter does not match, exit gracefully
-    if (
-        not isinstance(data.get("data"), dict)
-        or "listIngestionSources" not in data["data"]
-    ):
-        click.echo("No matching ingestion sources found. Please check your filters.")
-        return
+    # a lot of responses can be null if there's errors in the run
+    ingestion_sources = (
+        data.get("data", {}).get("listIngestionSources", {}).get("ingestionSources", [])
+    )
 
-    ingestion_sources = data["data"]["listIngestionSources"]["ingestionSources"]
    if not ingestion_sources:
        click.echo("No ingestion sources or executions found.")
        return
@@ -526,18 +522,32 @@ def list_source_runs(page_offset: int, page_size: int, urn: str, source: str) ->
        name = ingestion_source.get("name", "N/A")
 
        executions = ingestion_source.get("executions", {}).get("executionRequests", [])
+
        for execution in executions:
+            if execution is None:
+                continue
+
            execution_id = execution.get("id", "N/A")
-            start_time = execution.get("result", {}).get("startTimeMs", "N/A")
-            start_time = (
-                datetime.fromtimestamp(start_time / 1000).strftime("%Y-%m-%d %H:%M:%S")
-                if start_time != "N/A"
-                else "N/A"
-            )
-            status = execution.get("result", {}).get("status", "N/A")
+            result = execution.get("result") or {}
+            status = result.get("status", "N/A")
+
+            try:
+                start_time = (
+                    datetime.fromtimestamp(
+                        result.get("startTimeMs", 0) / 1000
+                    ).strftime("%Y-%m-%d %H:%M:%S")
+                    if status != "DUPLICATE" and result.get("startTimeMs") is not None
+                    else "N/A"
+                )
+            except (TypeError, ValueError):
+                start_time = "N/A"
 
            rows.append([execution_id, name, start_time, status, urn])
 
+    if not rows:
+        click.echo("No execution data found.")
+        return
+
    click.echo(
        tabulate(
            rows,
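
The rewritten loop's execution.get("result") or {} is the key defensive change: dict.get(key, default) only applies the default when the key is absent, so a key that is present with an explicit null (as GraphQL returns for failed runs) comes back as None. A small demonstration of why the old chained .get() could crash:

    execution = {"id": "abc", "result": None}  # server returned a null result

    broken = execution.get("result", {})  # -> None; the default is NOT used
    safe = execution.get("result") or {}  # -> {}

    print(safe.get("status", "N/A"))  # prints "N/A"
    # broken.get("status") would raise AttributeError: 'NoneType' object
    # has no attribute 'get'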
datahub/cli/lite_cli.py CHANGED
@@ -176,7 +176,7 @@ def get(
        )
    )
    end_time = time.time()
-    logger.debug(f"Time taken: {int((end_time - start_time)*1000.0)} millis")
+    logger.debug(f"Time taken: {int((end_time - start_time) * 1000.0)} millis")
 
 
 @lite.command()
@@ -228,7 +228,7 @@ def ls(path: Optional[str]) -> None:
    try:
        browseables = lite.ls(path)
        end_time = time.time()
-        logger.debug(f"Time taken: {int((end_time - start_time)*1000.0)} millis")
+        logger.debug(f"Time taken: {int((end_time - start_time) * 1000.0)} millis")
        auto_complete: List[AutoComplete] = [
            b.auto_complete for b in browseables if b.auto_complete is not None
        ]
datahub/cli/migrate.py CHANGED
@@ -179,7 +179,7 @@ def dataplatform2instance_func(
 
    if not force and not dry_run:
        # get a confirmation from the operator before proceeding if this is not a dry run
-        sampled_urns_to_migrate = random.choices(
+        sampled_urns_to_migrate = random.sample(
            urns_to_migrate, k=min(10, len(urns_to_migrate))
        )
        sampled_new_urns: List[str] = [
@@ -193,7 +193,7 @@ def dataplatform2instance_func(
            if key
        ]
        click.echo(
-            f"Will migrate {len(urns_to_migrate)} urns such as {random.choices(urns_to_migrate, k=min(10, len(urns_to_migrate)))}"
+            f"Will migrate {len(urns_to_migrate)} urns such as {random.sample(urns_to_migrate, k=min(10, len(urns_to_migrate)))}"
        )
        click.echo(f"New urns will look like {sampled_new_urns}")
        click.confirm("Ok to proceed?", abort=True)
@@ -426,9 +426,9 @@ def batch_get_ids(
            entities_yielded += 1
            log.debug(f"yielding {x}")
            yield x
-        assert (
-            entities_yielded == num_entities
-        ), "Did not delete all entities, try running this command again!"
+        assert entities_yielded == num_entities, (
+            "Did not delete all entities, try running this command again!"
+        )
    else:
        log.error(f"Failed to execute batch get with {str(response.content)}")
        response.raise_for_status()
datahub/cli/specific/assertions_cli.py CHANGED
@@ -136,9 +136,9 @@ def extras_list_to_dict(extras: List[str]) -> Dict[str, str]:
    extra_properties: Dict[str, str] = dict()
    for x in extras:
        parts = x.split("=")
-        assert (
-            len(parts) == 2
-        ), f"Invalid value for extras {x}, should be in format key=value"
+        assert len(parts) == 2, (
+            f"Invalid value for extras {x}, should be in format key=value"
+        )
        extra_properties[parts[0]] = parts[1]
    return extra_properties
 
datahub/cli/specific/structuredproperties_cli.py CHANGED
@@ -1,9 +1,11 @@
 import json
 import logging
 from pathlib import Path
+from typing import Iterable
 
 import click
 from click_default_group import DefaultGroup
+from ruamel.yaml import YAML
 
 from datahub.api.entities.structuredproperties.structuredproperties import (
     StructuredProperties,
@@ -61,3 +63,85 @@ def get(urn: str, to_file: str) -> None:
        )
    else:
        click.secho(f"Structured property {urn} does not exist")
+
+
+@properties.command(
+    name="list",
+)
+@click.option("--details/--no-details", is_flag=True, default=True)
+@click.option("--to-file", required=False, type=str)
+@telemetry.with_telemetry()
+def list(details: bool, to_file: str) -> None:
+    """List structured properties in DataHub"""
+
+    def to_yaml_list(
+        objects: Iterable[StructuredProperties],  # iterable of objects to dump
+        file: Path,
+    ) -> None:
+        # if file exists, first we read it
+        yaml = YAML(typ="rt")  # default, if not specfied, is 'rt' (round-trip)
+        yaml.indent(mapping=2, sequence=4, offset=2)
+        yaml.default_flow_style = False
+        serialized_objects = []
+        if file.exists():
+            with open(file, "r") as fp:
+                existing_objects = yaml.load(fp)  # this is a list of dicts
+                existing_objects = [
+                    StructuredProperties.parse_obj(obj) for obj in existing_objects
+                ]
+            objects = [obj for obj in objects]
+            # do a positional update of the existing objects
+            existing_urns = {obj.urn for obj in existing_objects}
+            # existing_urns = {obj["urn"] if "urn" in obj else f"urn:li:structuredProperty:{obj['id']}" for obj in existing_objects}
+            for i, obj in enumerate(existing_objects):
+                # existing_urn = obj["urn"] if "urn" in obj else f"urn:li:structuredProperty:{obj['id']}"
+                existing_urn = obj.urn
+                # breakpoint()
+                if existing_urn in {obj.urn for obj in objects}:
+                    existing_objects[i] = next(
+                        obj.dict(exclude_unset=True, exclude_none=True)
+                        for obj in objects
+                        if obj.urn == existing_urn
+                    )
+            new_objects = [
+                obj.dict(exclude_unset=True, exclude_none=True)
+                for obj in objects
+                if obj.urn not in existing_urns
+            ]
+            serialized_objects = existing_objects + new_objects
+        else:
+            serialized_objects = [
+                obj.dict(exclude_unset=True, exclude_none=True) for obj in objects
+            ]
+
+        with open(file, "w") as fp:
+            yaml.dump(serialized_objects, fp)
+
+    with get_default_graph() as graph:
+        if details:
+            logger.info(
+                "Listing structured properties with details. Use --no-details for urns only"
+            )
+            structuredproperties = StructuredProperties.list(graph)
+            if to_file:
+                to_yaml_list(structuredproperties, Path(to_file))
+            else:
+                for structuredproperty in structuredproperties:
+                    click.secho(
+                        f"{json.dumps(structuredproperty.dict(exclude_unset=True, exclude_none=True), indent=2)}"
+                    )
+        else:
+            logger.info(
+                "Listing structured property urns only, use --details for more information"
+            )
+            structured_property_urns = StructuredProperties.list_urns(graph)
+            if to_file:
+                with open(to_file, "w") as f:
+                    for urn in structured_property_urns:
+                        f.write(f"{urn}\n")
+                click.secho(
+                    f"Structured property urns written to {to_file}", fg="green"
+                )
+            else:
+                for urn in structured_property_urns:
+                    click.secho(f"{urn}")
datahub/cli/timeline_cli.py CHANGED
@@ -50,7 +50,7 @@ def pretty_id(id: Optional[str]) -> str:
    if id.startswith("urn:li:dataset"):
        dataset_key = dataset_urn_to_key(id)
        if dataset_key:
-            return f"{click.style('dataset', fg='cyan')}:{click.style(dataset_key.platform[len('urn:li:dataPlatform:'):], fg='white')}:{click.style(dataset_key.name, fg='white')}"
+            return f"{click.style('dataset', fg='cyan')}:{click.style(dataset_key.platform[len('urn:li:dataPlatform:') :], fg='white')}:{click.style(dataset_key.name, fg='white')}"
    # failed to prettify, return original
    return id
 
datahub/configuration/common.py CHANGED
@@ -200,8 +200,7 @@ class IgnorableError(MetaError):
 
 @runtime_checkable
 class ExceptionWithProps(Protocol):
-    def get_telemetry_props(self) -> Dict[str, Any]:
-        ...
+    def get_telemetry_props(self) -> Dict[str, Any]: ...
 
 
 def should_show_stack_trace(exc: Exception) -> bool:
datahub/configuration/config_loader.py CHANGED
@@ -19,64 +19,87 @@ from datahub.configuration.yaml import YamlConfigurationMechanism
 Environ = Mapping[str, str]
 
 
-def _resolve_element(element: str, environ: Environ) -> str:
-    if re.search(r"(\$\{).+(\})", element):
-        return expand(element, nounset=True, environ=environ)
-    elif element.startswith("$"):
-        try:
-            return expand(element, nounset=True, environ=environ)
-        except UnboundVariable:
-            return element
-    else:
-        return element
-
-
-def _resolve_list(ele_list: list, environ: Environ) -> list:
-    new_v: list = []
-    for ele in ele_list:
-        if isinstance(ele, str):
-            new_v.append(_resolve_element(ele, environ=environ))
-        elif isinstance(ele, list):
-            new_v.append(_resolve_list(ele, environ=environ))
-        elif isinstance(ele, dict):
-            new_v.append(resolve_env_variables(ele, environ=environ))
-        else:
-            new_v.append(ele)
-    return new_v
-
-
 def resolve_env_variables(config: dict, environ: Environ) -> dict:
-    new_dict: Dict[Any, Any] = {}
-    for k, v in config.items():
-        if isinstance(v, dict):
-            new_dict[k] = resolve_env_variables(v, environ=environ)
-        elif isinstance(v, list):
-            new_dict[k] = _resolve_list(v, environ=environ)
-        elif isinstance(v, str):
-            new_dict[k] = _resolve_element(v, environ=environ)
-        else:
-            new_dict[k] = v
-    return new_dict
+    # TODO: This is kept around for backwards compatibility.
+    return EnvResolver(environ).resolve(config)
 
 
 def list_referenced_env_variables(config: dict) -> Set[str]:
-    # This is a bit of a hack, but expandvars does a bunch of escaping
-    # and other logic that we don't want to duplicate here.
+    # TODO: This is kept around for backwards compatibility.
+    return EnvResolver(environ=os.environ).list_referenced_variables(config)
+
+
+class EnvResolver:
+    def __init__(self, environ: Environ, strict_env_syntax: bool = False):
+        self.environ = environ
+        self.strict_env_syntax = strict_env_syntax
 
-    vars = set()
+    def resolve(self, config: dict) -> dict:
+        return self._resolve_dict(config)
 
-    def mock_get_env(key: str, default: Optional[str] = None) -> str:
-        vars.add(key)
-        if default is not None:
-            return default
-        return "mocked_value"
+    @classmethod
+    def list_referenced_variables(
+        cls,
+        config: dict,
+        strict_env_syntax: bool = False,
+    ) -> Set[str]:
+        # This is a bit of a hack, but expandvars does a bunch of escaping
+        # and other logic that we don't want to duplicate here.
 
-    mock = unittest.mock.MagicMock()
-    mock.get.side_effect = mock_get_env
+        vars = set()
 
-    resolve_env_variables(config, environ=mock)
+        def mock_get_env(key: str, default: Optional[str] = None) -> str:
+            vars.add(key)
+            if default is not None:
+                return default
+            return "mocked_value"
+
+        mock = unittest.mock.MagicMock()
+        mock.get.side_effect = mock_get_env
+
+        resolver = EnvResolver(environ=mock, strict_env_syntax=strict_env_syntax)
+        resolver._resolve_dict(config)
+
+        return vars
+
+    def _resolve_element(self, element: str) -> str:
+        if re.search(r"(\$\{).+(\})", element):
+            return expand(element, nounset=True, environ=self.environ)
+        elif not self.strict_env_syntax and element.startswith("$"):
+            try:
+                return expand(element, nounset=True, environ=self.environ)
+            except UnboundVariable:
+                # TODO: This fallback is kept around for backwards compatibility, but
+                # doesn't make a ton of sense from first principles.
+                return element
+        else:
+            return element
 
-    return vars
+    def _resolve_list(self, ele_list: list) -> list:
+        new_v: list = []
+        for ele in ele_list:
+            if isinstance(ele, str):
+                new_v.append(self._resolve_element(ele))
+            elif isinstance(ele, list):
+                new_v.append(self._resolve_list(ele))
+            elif isinstance(ele, dict):
+                new_v.append(self._resolve_dict(ele))
+            else:
+                new_v.append(ele)
+        return new_v
+
+    def _resolve_dict(self, config: dict) -> dict:
+        new_dict: Dict[Any, Any] = {}
+        for k, v in config.items():
+            if isinstance(v, dict):
+                new_dict[k] = self._resolve_dict(v)
+            elif isinstance(v, list):
+                new_dict[k] = self._resolve_list(v)
+            elif isinstance(v, str):
+                new_dict[k] = self._resolve_element(v)
+            else:
+                new_dict[k] = v
+        return new_dict
 
 
 WRITE_TO_FILE_DIRECTIVE_PREFIX = "__DATAHUB_TO_FILE_"
@@ -159,7 +182,7 @@ def load_config_file(
 
    config = raw_config.copy()
    if resolve_env_vars:
-        config = resolve_env_variables(config, environ=os.environ)
+        config = EnvResolver(environ=os.environ).resolve(config)
    if process_directives:
        config = _process_directives(config)
 
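The refactor folds the three module-level resolver functions into an EnvResolver class, keeps resolve_env_variables and list_referenced_env_variables as thin backwards-compatible wrappers, and adds an opt-in strict_env_syntax flag: when set, only ${VAR} references are expanded and bare $VAR strings pass through untouched. A usage sketch against a hypothetical recipe dict (it assumes DATAHUB_GMS_URL is set in the environment, since expand(..., nounset=True) raises for unset ${...} references):

    import os

    from datahub.configuration.config_loader import EnvResolver

    config = {"sink": {"server": "${DATAHUB_GMS_URL}", "note": "$PRICE"}}

    # Lenient (default): both ${...} and bare $VAR are treated as references;
    # an unset bare $VAR falls back to the literal string.
    resolved = EnvResolver(environ=os.environ).resolve(config)

    # Strict: only ${...} is expanded; "$PRICE" passes through verbatim.
    strict = EnvResolver(environ=os.environ, strict_env_syntax=True).resolve(config)

    # Static inspection, without reading the real environment:
    print(EnvResolver.list_referenced_variables(config))
    # -> {'DATAHUB_GMS_URL', 'PRICE'} under the default lenient syntax
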
datahub/configuration/git.py CHANGED
@@ -121,9 +121,9 @@ class GitInfo(GitReference):
 
        repo: str = values["repo"]
        if repo.startswith(_GITHUB_PREFIX):
-            return f"git@github.com:{repo[len(_GITHUB_PREFIX):]}.git"
+            return f"git@github.com:{repo[len(_GITHUB_PREFIX) :]}.git"
        elif repo.startswith(_GITLAB_PREFIX):
-            return f"git@gitlab.com:{repo[len(_GITLAB_PREFIX):]}.git"
+            return f"git@gitlab.com:{repo[len(_GITLAB_PREFIX) :]}.git"
        else:
            raise ValueError(
                "Unable to infer repo_ssh_locator from repo. Please set repo_ssh_locator manually."