acryl-datahub 0.15.0.6rc3 → 1.0.0 (py3-none-any.whl)

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (204)
  1. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2552 -2523
  2. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +204 -191
  3. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +4 -3
  7. datahub/api/entities/dataset/dataset.py +731 -42
  8. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  9. datahub/cli/check_cli.py +72 -19
  10. datahub/cli/docker_cli.py +3 -3
  11. datahub/cli/iceberg_cli.py +1 -1
  12. datahub/cli/ingest_cli.py +30 -93
  13. datahub/cli/lite_cli.py +4 -2
  14. datahub/cli/specific/dataproduct_cli.py +1 -1
  15. datahub/cli/specific/dataset_cli.py +128 -14
  16. datahub/configuration/common.py +10 -2
  17. datahub/configuration/git.py +1 -3
  18. datahub/configuration/kafka.py +1 -1
  19. datahub/emitter/mce_builder.py +28 -13
  20. datahub/emitter/mcp_builder.py +4 -1
  21. datahub/emitter/response_helper.py +145 -0
  22. datahub/emitter/rest_emitter.py +323 -10
  23. datahub/ingestion/api/decorators.py +1 -1
  24. datahub/ingestion/api/source_helpers.py +4 -0
  25. datahub/ingestion/fs/s3_fs.py +2 -2
  26. datahub/ingestion/glossary/classification_mixin.py +1 -5
  27. datahub/ingestion/graph/client.py +41 -22
  28. datahub/ingestion/graph/entity_versioning.py +3 -3
  29. datahub/ingestion/graph/filters.py +64 -37
  30. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
  31. datahub/ingestion/run/pipeline.py +112 -148
  32. datahub/ingestion/run/sink_callback.py +77 -0
  33. datahub/ingestion/sink/datahub_rest.py +8 -0
  34. datahub/ingestion/source/abs/config.py +2 -4
  35. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  36. datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
  37. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
  38. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
  39. datahub/ingestion/source/cassandra/cassandra.py +152 -233
  40. datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
  41. datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
  42. datahub/ingestion/source/common/subtypes.py +12 -0
  43. datahub/ingestion/source/csv_enricher.py +3 -3
  44. datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
  45. datahub/ingestion/source/dbt/dbt_common.py +3 -5
  46. datahub/ingestion/source/dbt/dbt_tests.py +4 -8
  47. datahub/ingestion/source/delta_lake/config.py +8 -1
  48. datahub/ingestion/source/delta_lake/report.py +4 -2
  49. datahub/ingestion/source/delta_lake/source.py +20 -5
  50. datahub/ingestion/source/dremio/dremio_api.py +4 -8
  51. datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
  52. datahub/ingestion/source/dynamodb/dynamodb.py +1 -0
  53. datahub/ingestion/source/elastic_search.py +26 -6
  54. datahub/ingestion/source/feast.py +27 -8
  55. datahub/ingestion/source/file.py +6 -3
  56. datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
  57. datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
  58. datahub/ingestion/source/ge_data_profiler.py +12 -15
  59. datahub/ingestion/source/iceberg/iceberg.py +46 -12
  60. datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
  61. datahub/ingestion/source/identity/okta.py +37 -7
  62. datahub/ingestion/source/kafka/kafka.py +1 -1
  63. datahub/ingestion/source/kafka_connect/common.py +2 -7
  64. datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
  65. datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
  66. datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
  67. datahub/ingestion/source/looker/looker_common.py +3 -3
  68. datahub/ingestion/source/looker/looker_file_loader.py +2 -2
  69. datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
  70. datahub/ingestion/source/looker/looker_source.py +1 -1
  71. datahub/ingestion/source/looker/looker_template_language.py +4 -2
  72. datahub/ingestion/source/looker/lookml_source.py +3 -2
  73. datahub/ingestion/source/metabase.py +57 -35
  74. datahub/ingestion/source/metadata/business_glossary.py +45 -3
  75. datahub/ingestion/source/metadata/lineage.py +2 -2
  76. datahub/ingestion/source/mlflow.py +365 -35
  77. datahub/ingestion/source/mode.py +18 -8
  78. datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
  79. datahub/ingestion/source/nifi.py +37 -11
  80. datahub/ingestion/source/openapi.py +1 -1
  81. datahub/ingestion/source/openapi_parser.py +49 -17
  82. datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
  83. datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
  84. datahub/ingestion/source/powerbi/powerbi.py +1 -3
  85. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
  86. datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
  87. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
  88. datahub/ingestion/source/preset.py +7 -4
  89. datahub/ingestion/source/pulsar.py +3 -2
  90. datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
  91. datahub/ingestion/source/redash.py +31 -7
  92. datahub/ingestion/source/redshift/config.py +4 -0
  93. datahub/ingestion/source/redshift/datashares.py +236 -0
  94. datahub/ingestion/source/redshift/lineage.py +6 -2
  95. datahub/ingestion/source/redshift/lineage_v2.py +24 -9
  96. datahub/ingestion/source/redshift/profile.py +1 -1
  97. datahub/ingestion/source/redshift/query.py +133 -33
  98. datahub/ingestion/source/redshift/redshift.py +46 -73
  99. datahub/ingestion/source/redshift/redshift_schema.py +186 -6
  100. datahub/ingestion/source/redshift/report.py +3 -0
  101. datahub/ingestion/source/s3/config.py +5 -5
  102. datahub/ingestion/source/s3/source.py +20 -41
  103. datahub/ingestion/source/salesforce.py +550 -275
  104. datahub/ingestion/source/schema_inference/object.py +1 -1
  105. datahub/ingestion/source/sigma/sigma.py +1 -1
  106. datahub/ingestion/source/slack/slack.py +31 -10
  107. datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
  108. datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
  109. datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
  110. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
  111. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
  112. datahub/ingestion/source/sql/athena.py +10 -16
  113. datahub/ingestion/source/sql/druid.py +1 -5
  114. datahub/ingestion/source/sql/hive.py +15 -6
  115. datahub/ingestion/source/sql/hive_metastore.py +3 -2
  116. datahub/ingestion/source/sql/mssql/job_models.py +29 -0
  117. datahub/ingestion/source/sql/mssql/source.py +11 -5
  118. datahub/ingestion/source/sql/oracle.py +127 -63
  119. datahub/ingestion/source/sql/sql_common.py +6 -12
  120. datahub/ingestion/source/sql/sql_types.py +2 -2
  121. datahub/ingestion/source/sql/teradata.py +7 -5
  122. datahub/ingestion/source/sql/trino.py +2 -2
  123. datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
  124. datahub/ingestion/source/superset.py +222 -62
  125. datahub/ingestion/source/tableau/tableau.py +22 -6
  126. datahub/ingestion/source/tableau/tableau_common.py +3 -2
  127. datahub/ingestion/source/unity/ge_profiler.py +2 -1
  128. datahub/ingestion/source/unity/source.py +11 -1
  129. datahub/ingestion/source/vertexai.py +697 -0
  130. datahub/ingestion/source_config/pulsar.py +3 -1
  131. datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
  132. datahub/lite/duckdb_lite.py +3 -10
  133. datahub/lite/lite_local.py +1 -1
  134. datahub/lite/lite_util.py +4 -3
  135. datahub/metadata/_schema_classes.py +714 -417
  136. datahub/metadata/_urns/urn_defs.py +1673 -1649
  137. datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
  138. datahub/metadata/schema.avsc +16438 -16603
  139. datahub/metadata/schemas/AssertionInfo.avsc +3 -1
  140. datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
  141. datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
  142. datahub/metadata/schemas/ChartInfo.avsc +1 -0
  143. datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
  144. datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
  145. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  146. datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
  147. datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
  148. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
  149. datahub/metadata/schemas/DataProcessKey.avsc +2 -1
  150. datahub/metadata/schemas/DataProductKey.avsc +2 -1
  151. datahub/metadata/schemas/DomainKey.avsc +2 -1
  152. datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
  153. datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
  154. datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
  155. datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
  156. datahub/metadata/schemas/IncidentInfo.avsc +130 -46
  157. datahub/metadata/schemas/InputFields.avsc +3 -1
  158. datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
  159. datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
  160. datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
  161. datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
  162. datahub/metadata/schemas/MLModelKey.avsc +3 -1
  163. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
  164. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
  165. datahub/metadata/schemas/PostKey.avsc +2 -1
  166. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  167. datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
  168. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
  169. datahub/metadata/schemas/VersionProperties.avsc +18 -0
  170. datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
  171. datahub/pydantic/__init__.py +0 -0
  172. datahub/pydantic/compat.py +58 -0
  173. datahub/sdk/__init__.py +30 -12
  174. datahub/sdk/_all_entities.py +1 -1
  175. datahub/sdk/_attribution.py +4 -0
  176. datahub/sdk/_shared.py +251 -16
  177. datahub/sdk/_utils.py +35 -0
  178. datahub/sdk/container.py +29 -5
  179. datahub/sdk/dataset.py +118 -20
  180. datahub/sdk/{_entity.py → entity.py} +24 -1
  181. datahub/sdk/entity_client.py +1 -1
  182. datahub/sdk/main_client.py +23 -0
  183. datahub/sdk/resolver_client.py +17 -29
  184. datahub/sdk/search_client.py +50 -0
  185. datahub/sdk/search_filters.py +374 -0
  186. datahub/specific/dataset.py +3 -4
  187. datahub/sql_parsing/_sqlglot_patch.py +2 -10
  188. datahub/sql_parsing/schema_resolver.py +1 -1
  189. datahub/sql_parsing/split_statements.py +20 -13
  190. datahub/sql_parsing/sql_parsing_common.py +7 -0
  191. datahub/sql_parsing/sqlglot_lineage.py +1 -1
  192. datahub/sql_parsing/sqlglot_utils.py +1 -4
  193. datahub/testing/check_sql_parser_result.py +5 -6
  194. datahub/testing/compare_metadata_json.py +7 -6
  195. datahub/testing/pytest_hooks.py +56 -0
  196. datahub/upgrade/upgrade.py +2 -2
  197. datahub/utilities/file_backed_collections.py +3 -14
  198. datahub/utilities/ingest_utils.py +106 -0
  199. datahub/utilities/mapping.py +1 -1
  200. datahub/utilities/memory_footprint.py +3 -2
  201. datahub/utilities/sentinels.py +22 -0
  202. datahub/utilities/unified_diff.py +5 -1
  203. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
  204. {acryl_datahub-0.15.0.6rc3.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0

datahub/sdk/search_filters.py (new file, +374 -0)
@@ -0,0 +1,374 @@
+ from __future__ import annotations
+
+ import abc
+ from typing import (
+     Any,
+     List,
+     Sequence,
+     TypedDict,
+     Union,
+ )
+
+ import pydantic
+
+ from datahub.configuration.common import ConfigModel
+ from datahub.configuration.pydantic_migration_helpers import PYDANTIC_VERSION_2
+ from datahub.ingestion.graph.client import entity_type_to_graphql
+ from datahub.ingestion.graph.filters import SearchFilterRule
+ from datahub.metadata.schema_classes import EntityTypeName
+ from datahub.metadata.urns import DataPlatformUrn, DomainUrn
+
+ _AndSearchFilterRule = TypedDict(
+     "_AndSearchFilterRule", {"and": List[SearchFilterRule]}
+ )
+ _OrFilters = List[_AndSearchFilterRule]
+
+
+ class _BaseFilter(ConfigModel):
+     class Config:
+         # We can't wrap this in a TYPE_CHECKING block because the pydantic plugin
+         # doesn't recognize it properly. So unfortunately we'll need to live
+         # with the deprecation warning w/ pydantic v2.
+         allow_population_by_field_name = True
+         if PYDANTIC_VERSION_2:
+             populate_by_name = True
+
+     @abc.abstractmethod
+     def compile(self) -> _OrFilters:
+         pass
+
+
+ def _flexible_entity_type_to_graphql(entity_type: str) -> str:
+     if entity_type.upper() == entity_type:
+         # Assume that we were passed a graphql EntityType enum value,
+         # so no conversion is needed.
+         return entity_type
+     return entity_type_to_graphql(entity_type)
+
+
+ class _EntityTypeFilter(_BaseFilter):
+     entity_type: List[str] = pydantic.Field(
+         description="The entity type to filter on. Can be 'dataset', 'chart', 'dashboard', 'corpuser', etc.",
+     )
+
+     def _build_rule(self) -> SearchFilterRule:
+         return SearchFilterRule(
+             field="_entityType",
+             condition="EQUAL",
+             values=[_flexible_entity_type_to_graphql(t) for t in self.entity_type],
+         )
+
+     def compile(self) -> _OrFilters:
+         return [{"and": [self._build_rule()]}]
+
+
+ class _EntitySubtypeFilter(_BaseFilter):
+     entity_type: str
+     entity_subtype: str = pydantic.Field(
+         description="The entity subtype to filter on. Can be 'Table', 'View', 'Source', etc. depending on the native platform's concepts.",
+     )
+
+     def compile(self) -> _OrFilters:
+         rules = [
+             SearchFilterRule(
+                 field="_entityType",
+                 condition="EQUAL",
+                 values=[_flexible_entity_type_to_graphql(self.entity_type)],
+             ),
+             SearchFilterRule(
+                 field="typeNames",
+                 condition="EQUAL",
+                 values=[self.entity_subtype],
+             ),
+         ]
+         return [{"and": rules}]
+
+
+ class _PlatformFilter(_BaseFilter):
+     platform: List[str]
+     # TODO: Add validator to convert string -> list of strings
+
+     @pydantic.validator("platform", each_item=True)
+     def validate_platform(cls, v: str) -> str:
+         # Subtle - we use the constructor instead of the from_string method
+         # because coercion is acceptable here.
+         return str(DataPlatformUrn(v))
+
+     def _build_rule(self) -> SearchFilterRule:
+         return SearchFilterRule(
+             field="platform.keyword",
+             condition="EQUAL",
+             values=self.platform,
+         )
+
+     def compile(self) -> _OrFilters:
+         return [{"and": [self._build_rule()]}]
+
+
+ class _DomainFilter(_BaseFilter):
+     domain: List[str]
+
+     @pydantic.validator("domain", each_item=True)
+     def validate_domain(cls, v: str) -> str:
+         return str(DomainUrn.from_string(v))
+
+     def _build_rule(self) -> SearchFilterRule:
+         return SearchFilterRule(
+             field="domains",
+             condition="EQUAL",
+             values=self.domain,
+         )
+
+     def compile(self) -> _OrFilters:
+         return [{"and": [self._build_rule()]}]
+
+
+ class _EnvFilter(_BaseFilter):
+     # Note that not all entity types have an env (e.g. dashboards / charts).
+     # If the env filter is specified, these will be excluded.
+     env: List[str]
+
+     def compile(self) -> _OrFilters:
+         return [
+             # For most entity types, we look at the origin field.
+             {
+                 "and": [
+                     SearchFilterRule(
+                         field="origin",
+                         condition="EQUAL",
+                         values=self.env,
+                     ),
+                 ]
+             },
+             # For containers, we now have an "env" property as of
+             # https://github.com/datahub-project/datahub/pull/11214
+             # Prior to this, we put "env" in the customProperties. But we're
+             # not bothering with that here.
+             {
+                 "and": [
+                     SearchFilterRule(
+                         field="env",
+                         condition="EQUAL",
+                         values=self.env,
+                     ),
+                 ]
+             },
+         ]
+
+
+ class _CustomCondition(_BaseFilter):
+     """Represents a single field condition"""
+
+     field: str
+     condition: str
+     values: List[str]
+
+     def compile(self) -> _OrFilters:
+         rule = SearchFilterRule(
+             field=self.field,
+             condition=self.condition,
+             values=self.values,
+         )
+         return [{"and": [rule]}]
+
+
+ class _And(_BaseFilter):
+     """Represents an AND conjunction of filters"""
+
+     and_: Sequence["Filter"] = pydantic.Field(alias="and")
+     # TODO: Add validator to ensure that the "and" field is not empty
+
+     def compile(self) -> _OrFilters:
+         # The "and" operator must be implemented by doing a Cartesian product
+         # of the OR clauses.
+         # Example 1:
+         # (A or B) and (C or D) ->
+         # (A and C) or (A and D) or (B and C) or (B and D)
+         # Example 2:
+         # (A or B) and (C or D) and (E or F) ->
+         # (A and C and E) or (A and C and F) or (A and D and E) or (A and D and F) or
+         # (B and C and E) or (B and C and F) or (B and D and E) or (B and D and F)
+
+         # Start with the first filter's OR clauses
+         result = self.and_[0].compile()
+
+         # For each subsequent filter
+         for filter in self.and_[1:]:
+             new_result = []
+             # Get its OR clauses
+             other_clauses = filter.compile()
+
+             # Create Cartesian product
+             for existing_clause in result:
+                 for other_clause in other_clauses:
+                     # Merge the AND conditions from both clauses
+                     new_result.append(self._merge_ands(existing_clause, other_clause))
+
+             result = new_result
+
+         return result
+
+     @classmethod
+     def _merge_ands(
+         cls, a: _AndSearchFilterRule, b: _AndSearchFilterRule
+     ) -> _AndSearchFilterRule:
+         return {
+             "and": [
+                 *a["and"],
+                 *b["and"],
+             ]
+         }
+
+
+ class _Or(_BaseFilter):
+     """Represents an OR conjunction of filters"""
+
+     or_: Sequence["Filter"] = pydantic.Field(alias="or")
+     # TODO: Add validator to ensure that the "or" field is not empty
+
+     def compile(self) -> _OrFilters:
+         merged_filter = []
+         for filter in self.or_:
+             merged_filter.extend(filter.compile())
+         return merged_filter
+
+
+ class _Not(_BaseFilter):
+     """Represents a NOT filter"""
+
+     not_: "Filter" = pydantic.Field(alias="not")
+
+     @pydantic.validator("not_", pre=False)
+     def validate_not(cls, v: "Filter") -> "Filter":
+         inner_filter = v.compile()
+         if len(inner_filter) != 1:
+             raise ValueError(
+                 "Cannot negate a filter with multiple OR clauses [not yet supported]"
+             )
+         return v
+
+     def compile(self) -> _OrFilters:
+         # TODO: Eventually we'll want to implement a full DNF normalizer.
+         # https://en.wikipedia.org/wiki/Disjunctive_normal_form#Conversion_to_DNF
+
+         inner_filter = self.not_.compile()
+         assert len(inner_filter) == 1  # validated above
+
+         # ¬(A and B) -> (¬A) OR (¬B)
+         and_filters = inner_filter[0]["and"]
+         final_filters: _OrFilters = []
+         for rule in and_filters:
+             final_filters.append({"and": [rule.negate()]})
+
+         return final_filters
+
+
+ # TODO: With pydantic 2, we can use a RootModel with a
+ # discriminated union to make the error messages more informative.
+ Filter = Union[
+     _And,
+     _Or,
+     _Not,
+     _EntityTypeFilter,
+     _EntitySubtypeFilter,
+     _PlatformFilter,
+     _DomainFilter,
+     _EnvFilter,
+     _CustomCondition,
+ ]
+
+
+ # Required to resolve forward references to "Filter"
+ if PYDANTIC_VERSION_2:
+     _And.model_rebuild()  # type: ignore
+     _Or.model_rebuild()  # type: ignore
+     _Not.model_rebuild()  # type: ignore
+ else:
+     _And.update_forward_refs()
+     _Or.update_forward_refs()
+     _Not.update_forward_refs()
+
+
+ def load_filters(obj: Any) -> Filter:
+     if PYDANTIC_VERSION_2:
+         return pydantic.TypeAdapter(Filter).validate_python(obj)  # type: ignore
+     else:
+         return pydantic.parse_obj_as(Filter, obj)  # type: ignore
+
+
+ # We need FilterDsl for two reasons:
+ # 1. To provide wrapper methods around lots of filters while avoid bloating the
+ #    yaml spec.
+ # 2. Pydantic models in general don't support positional arguments, making the
+ #    calls feel repetitive (e.g. Platform(platform=...)).
+ #    See https://github.com/pydantic/pydantic/issues/6792
+ # We also considered using dataclasses / pydantic dataclasses, but
+ # ultimately decided that they didn't quite suit our requirements,
+ # particularly with regards to the field aliases for and/or/not.
+ class FilterDsl:
+     @staticmethod
+     def and_(*args: "Filter") -> _And:
+         return _And(and_=list(args))
+
+     @staticmethod
+     def or_(*args: "Filter") -> _Or:
+         return _Or(or_=list(args))
+
+     @staticmethod
+     def not_(arg: "Filter") -> _Not:
+         return _Not(not_=arg)
+
+     @staticmethod
+     def entity_type(
+         entity_type: Union[EntityTypeName, Sequence[EntityTypeName]],
+     ) -> _EntityTypeFilter:
+         return _EntityTypeFilter(
+             entity_type=(
+                 [entity_type] if isinstance(entity_type, str) else list(entity_type)
+             )
+         )
+
+     @staticmethod
+     def entity_subtype(entity_type: str, subtype: str) -> _EntitySubtypeFilter:
+         return _EntitySubtypeFilter(
+             entity_type=entity_type,
+             entity_subtype=subtype,
+         )
+
+     @staticmethod
+     def platform(platform: Union[str, List[str]], /) -> _PlatformFilter:
+         return _PlatformFilter(
+             platform=[platform] if isinstance(platform, str) else platform
+         )
+
+     # TODO: Add a platform_instance filter
+
+     @staticmethod
+     def domain(domain: Union[str, List[str]], /) -> _DomainFilter:
+         return _DomainFilter(domain=[domain] if isinstance(domain, str) else domain)
+
+     @staticmethod
+     def env(env: Union[str, List[str]], /) -> _EnvFilter:
+         return _EnvFilter(env=[env] if isinstance(env, str) else env)
+
+     @staticmethod
+     def has_custom_property(key: str, value: str) -> _CustomCondition:
+         return _CustomCondition(
+             field="customProperties",
+             condition="EQUAL",
+             values=[f"{key}={value}"],
+         )
+
+     # TODO: Add a soft-deletion status filter
+     # TODO: add a container / browse path filter
+     # TODO add shortcut for custom filters
+
+     @staticmethod
+     def custom_filter(
+         field: str, condition: str, values: List[str]
+     ) -> _CustomCondition:
+         return _CustomCondition(
+             field=field,
+             condition=condition,
+             values=values,
+         )
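
For orientation, a minimal usage sketch based only on the code above; the entity type, platform, and env values are illustrative examples, not defaults shipped by the package.

# Illustrative sketch, not taken from the package's documentation.
from datahub.sdk.search_filters import FilterDsl as F, load_filters

# Compose a filter programmatically with the DSL...
flt = F.and_(
    F.entity_type("dataset"),
    F.platform("snowflake"),
    F.env("PROD"),
)

# ...or build the same thing from a parsed YAML/JSON structure.
flt_from_dict = load_filters(
    {
        "and": [
            {"entity_type": ["dataset"]},
            {"platform": ["snowflake"]},
            {"env": ["PROD"]},
        ]
    }
)

# compile() flattens the tree into the _OrFilters shape defined above:
# a list of {"and": [SearchFilterRule, ...]} clauses that are OR'd together.
or_clauses = flt.compile()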

datahub/specific/dataset.py (+3 -4)
@@ -15,6 +15,7 @@ from datahub.metadata.schema_classes import (
      UpstreamClass as Upstream,
      UpstreamLineageClass as UpstreamLineage,
  )
+ from datahub.metadata.urns import DatasetUrn, TagUrn, Urn
  from datahub.specific.aspect_helpers.custom_properties import HasCustomPropertiesPatch
  from datahub.specific.aspect_helpers.ownership import HasOwnershipPatch
  from datahub.specific.aspect_helpers.structured_properties import (
@@ -22,8 +23,6 @@ from datahub.specific.aspect_helpers.structured_properties import (
  )
  from datahub.specific.aspect_helpers.tags import HasTagsPatch
  from datahub.specific.aspect_helpers.terms import HasTermsPatch
- from datahub.utilities.urns.tag_urn import TagUrn
- from datahub.utilities.urns.urn import Urn

  _Parent = TypeVar("_Parent", bound=MetadataPatchProposal)

@@ -104,12 +103,12 @@ class DatasetPatchBuilder(
  ):
      def __init__(
          self,
-         urn: str,
+         urn: Union[str, DatasetUrn],
          system_metadata: Optional[SystemMetadataClass] = None,
          audit_header: Optional[KafkaAuditHeaderClass] = None,
      ) -> None:
          super().__init__(
-             urn, system_metadata=system_metadata, audit_header=audit_header
+             str(urn), system_metadata=system_metadata, audit_header=audit_header
          )

      @classmethod
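
DatasetPatchBuilder now accepts either a raw urn string or a DatasetUrn object and stringifies it before delegating to the parent constructor. A minimal sketch; the dataset urn below is a made-up example.

from datahub.metadata.urns import DatasetUrn
from datahub.specific.dataset import DatasetPatchBuilder

# Hypothetical example urn; any existing dataset urn works the same way.
urn = DatasetUrn.from_string(
    "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.table,PROD)"
)

# Both forms are accepted; the builder calls str(urn) internally.
patch_from_str = DatasetPatchBuilder(str(urn))
patch_from_urn = DatasetPatchBuilder(urn)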

datahub/sql_parsing/_sqlglot_patch.py (+2 -10; this module embeds a textual patch for sqlglot, so the changed lines below are themselves diff text)
@@ -172,17 +172,9 @@ def _patch_lineage() -> None:
  derived_tables = [
      source.expression.parent
      for source in scope.sources.values()
- @@ -254,6 +257,7 @@ def to_node(
- if dt.comments and dt.comments[0].startswith("source: ")
- }
-
- + c: exp.Column
- for c in source_columns:
- table = c.table
- source = scope.sources.get(table)
  @@ -281,8 +285,21 @@ def to_node(
- # it means this column's lineage is unknown. This can happen if the definition of a source used in a query
- # is not passed into the `sources` map.
+ # is unknown. This can happen if the definition of a source used in a query is not
+ # passed into the `sources` map.
  source = source or exp.Placeholder()
  +
  + subfields = []

datahub/sql_parsing/schema_resolver.py (+1 -1)
@@ -13,7 +13,7 @@ from datahub.ingestion.graph.client import DataHubGraph
  from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier
  from datahub.metadata.schema_classes import SchemaFieldClass, SchemaMetadataClass
  from datahub.metadata.urns import DataPlatformUrn
- from datahub.sql_parsing._models import _TableName as _TableName  # noqa: I250
+ from datahub.sql_parsing._models import _TableName as _TableName
  from datahub.sql_parsing.sql_parsing_common import PLATFORMS_WITH_CASE_SENSITIVE_TABLES
  from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedDict
  from datahub.utilities.urns.field_paths import get_simple_field_path_from_v2_field_path

datahub/sql_parsing/split_statements.py (+20 -13)
@@ -8,11 +8,11 @@ END_KEYWORD = "END"

  CONTROL_FLOW_KEYWORDS = [
      "GO",
-     r"BEGIN\w+TRY",
-     r"BEGIN\w+CATCH",
+     r"BEGIN\s+TRY",
+     r"BEGIN\s+CATCH",
      "BEGIN",
-     r"END\w+TRY",
-     r"END\w+CATCH",
+     r"END\s+TRY",
+     r"END\s+CATCH",
      # This isn't strictly correct, but we assume that IF | (condition) | (block) should all be split up
      # This mainly ensures that IF statements don't get tacked onto the previous statement incorrectly
      "IF",
@@ -73,25 +73,31 @@ class _StatementSplitter:
          # what a given END is closing.
          self.current_case_statements = 0

-     def _is_keyword_at_position(self, pos: int, keyword: str) -> bool:
+     def _is_keyword_at_position(self, pos: int, keyword: str) -> Tuple[bool, str]:
          """
          Check if a keyword exists at the given position using regex word boundaries.
          """
          sql = self.sql

-         if pos + len(keyword) > len(sql):
-             return False
+         keyword_length = len(keyword.replace(r"\s+", " "))
+
+         if pos + keyword_length > len(sql):
+             return False, ""

          # If we're not at a word boundary, we can't generate a keyword.
          if pos > 0 and not (
              bool(re.match(r"\w\W", sql[pos - 1 : pos + 1]))
              or bool(re.match(r"\W\w", sql[pos - 1 : pos + 1]))
          ):
-             return False
+             return False, ""

-         pattern = rf"^{re.escape(keyword)}\b"
+         pattern = rf"^{keyword}\b"
          match = re.match(pattern, sql[pos:], re.IGNORECASE)
-         return bool(match)
+         is_match = bool(match)
+         actual_match = (
+             sql[pos:][match.start() : match.end()] if match is not None else ""
+         )
+         return is_match, actual_match

      def _look_ahead_for_keywords(self, keywords: List[str]) -> Tuple[bool, str, int]:
          """
@@ -99,7 +105,8 @@ class _StatementSplitter:
          """

          for keyword in keywords:
-             if self._is_keyword_at_position(self.i, keyword):
+             is_match, keyword = self._is_keyword_at_position(self.i, keyword)
+             if is_match:
                  return True, keyword, len(keyword)
          return False, "", 0

@@ -118,7 +125,7 @@ class _StatementSplitter:

      def process(self) -> Iterator[str]:
          if not self.sql or not self.sql.strip():
-             return
+             yield from ()

          prev_real_char = "\0"  # the most recent non-whitespace, non-comment character
          while self.i < len(self.sql):
@@ -181,7 +188,7 @@ class _StatementSplitter:
      def _process_normal(self, most_recent_real_char: str) -> Iterator[str]:
          c = self.sql[self.i]

-         if self._is_keyword_at_position(self.i, CASE_KEYWORD):
+         if self._is_keyword_at_position(self.i, CASE_KEYWORD)[0]:
              self.current_case_statements += 1

          is_control_keyword, keyword, keyword_len = self._look_ahead_for_keywords(
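
The keyword patterns above are used as regular expressions, so the gap between BEGIN/END and TRY/CATCH has to be whitespace (\s+) rather than word characters (\w+); the matcher also stops wrapping the pattern in re.escape, so \s+ keeps its regex meaning. A standalone check of the old versus new pattern (not part of the package):

import re

sql = "BEGIN TRY  SELECT 1; END TRY"

# Old pattern: \w+ requires word characters between BEGIN and TRY, so it can
# never match the whitespace-separated keyword pair.
assert re.match(r"^BEGIN\w+TRY\b", sql, re.IGNORECASE) is None

# New pattern: \s+ matches the run of spaces between the two words.
match = re.match(r"^BEGIN\s+TRY\b", sql, re.IGNORECASE)
assert match is not None and match.group(0) == "BEGIN TRY"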

datahub/sql_parsing/sql_parsing_common.py (+7 -0)
@@ -24,12 +24,19 @@ DIALECTS_WITH_CASE_INSENSITIVE_COLS = {
      # For SQL server, the default collation rules mean that all identifiers (schema, table, column names)
      # are case preserving but case insensitive.
      "mssql",
+     # Oracle automatically converts unquoted identifiers to uppercase.
+     # https://docs.oracle.com/en/database/oracle/oracle-database/19/sqlrf/Database-Object-Names-and-Qualifiers.html#GUID-3C59E44A-5140-4BCA-B9E1-3039C8050C49
+     # In our Oracle connector, we then normalize column names to lowercase. This behavior
+     # actually comes from the underlying Oracle sqlalchemy dialect.
+     # https://github.com/sqlalchemy/sqlalchemy/blob/d9b4d8ff3aae504402d324f3ebf0b8faff78f5dc/lib/sqlalchemy/dialects/oracle/base.py#L2579
+     "oracle",
  }
  DIALECTS_WITH_DEFAULT_UPPERCASE_COLS = {
      # In some dialects, column identifiers are effectively case insensitive
      # because they are automatically converted to uppercase. Most other systems
      # automatically lowercase unquoted identifiers.
      "snowflake",
+     "oracle",
  }
  assert DIALECTS_WITH_DEFAULT_UPPERCASE_COLS.issubset(
      DIALECTS_WITH_CASE_INSENSITIVE_COLS
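
For context, adding "oracle" to both sets says that unquoted Oracle column identifiers are case insensitive and are folded to uppercase, like Snowflake's. A standalone illustration of that folding rule, assuming nothing about DataHub's internals:

def fold_unquoted_identifier(name: str, default_uppercase: bool) -> str:
    """Fold an identifier the way the dialect's parser would."""
    if name.startswith('"') and name.endswith('"'):
        return name[1:-1]  # quoted identifiers keep their exact case
    return name.upper() if default_uppercase else name.lower()

# Oracle / Snowflake behavior (default-uppercase dialects):
assert fold_unquoted_identifier("order_id", default_uppercase=True) == "ORDER_ID"
# Most other dialects fold unquoted identifiers to lowercase:
assert fold_unquoted_identifier("Order_Id", default_uppercase=False) == "order_id"
# Quoting always preserves case:
assert fold_unquoted_identifier('"Order_Id"', default_uppercase=True) == "Order_Id"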

datahub/sql_parsing/sqlglot_lineage.py (+1 -1)
@@ -473,7 +473,7 @@ def _create_table_ddl_cll(
      return column_lineage


- def _select_statement_cll(  # noqa: C901
+ def _select_statement_cll(
      statement: _SupportedColumnLineageTypes,
      dialect: sqlglot.Dialect,
      root_scope: sqlglot.optimizer.Scope,

datahub/sql_parsing/sqlglot_utils.py (+1 -4)
@@ -56,10 +56,7 @@ def get_dialect(platform: DialectOrStr) -> sqlglot.Dialect:
  def is_dialect_instance(
      dialect: sqlglot.Dialect, platforms: Union[str, Iterable[str]]
  ) -> bool:
-     if isinstance(platforms, str):
-         platforms = [platforms]
-     else:
-         platforms = list(platforms)
+     platforms = [platforms] if isinstance(platforms, str) else list(platforms)

      dialects = [get_dialect(platform) for platform in platforms]


datahub/testing/check_sql_parser_result.py (+5 -6)
@@ -1,5 +1,4 @@
  import logging
- import os
  import pathlib
  from typing import Any, Dict, Optional

@@ -8,11 +7,10 @@ import deepdiff
  from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier
  from datahub.sql_parsing.schema_resolver import SchemaInfo, SchemaResolver
  from datahub.sql_parsing.sqlglot_lineage import SqlParsingResult, sqlglot_lineage
+ from datahub.testing.pytest_hooks import get_golden_settings

  logger = logging.getLogger(__name__)

- UPDATE_FILES = os.environ.get("UPDATE_SQLPARSER_FILES", "false").lower() == "true"
-

  def assert_sql_result_with_resolver(
      sql: str,
@@ -22,6 +20,8 @@ def assert_sql_result_with_resolver(
      allow_table_error: bool = False,
      **kwargs: Any,
  ) -> None:
+     settings = get_golden_settings()
+
      # HACK: Our BigQuery source overwrites this value and doesn't undo it.
      # As such, we need to handle that here.
      BigqueryTableIdentifier._BQ_SHARDED_TABLE_SUFFIX = "_yyyymmdd"
@@ -47,15 +47,14 @@ def assert_sql_result_with_resolver(
      )

      txt = res.json(indent=4)
-     if UPDATE_FILES:
+     if settings.update_golden:
          expected_file.write_text(txt)
          return

      if not expected_file.exists():
          expected_file.write_text(txt)
          raise AssertionError(
-             f"Expected file {expected_file} does not exist. "
-             "Created it with the expected output. Please verify it."
+             f"Missing expected golden file; run with --update-golden-files to create it: {expected_file}"
          )

      expected = SqlParsingResult.parse_raw(expected_file.read_text())

datahub/testing/compare_metadata_json.py (+7 -6)
@@ -16,6 +16,7 @@ from deepdiff import DeepDiff
  from datahub.ingestion.sink.file import write_metadata_file
  from datahub.ingestion.source.file import read_metadata_file
  from datahub.testing.mcp_diff import CannotCompareMCPs, MCPDiff, get_aspects_by_urn
+ from datahub.testing.pytest_hooks import get_golden_settings

  logger = logging.getLogger(__name__)

@@ -40,26 +41,26 @@ def load_json_file(filename: Union[str, os.PathLike]) -> MetadataJson:
  def assert_metadata_files_equal(
      output_path: Union[str, os.PathLike],
      golden_path: Union[str, os.PathLike],
-     update_golden: bool,
-     copy_output: bool,
      ignore_paths: Sequence[str] = (),
      ignore_paths_v2: Sequence[str] = (),
      ignore_order: bool = True,
  ) -> None:
+     settings = get_golden_settings()
+
      golden_exists = os.path.isfile(golden_path)

-     if copy_output:
+     if settings.copy_output:
          shutil.copyfile(str(output_path), str(golden_path) + ".output")
          logger.info(f"Copied output file to {golden_path}.output")

-     if not update_golden and not golden_exists:
+     if not settings.update_golden and not golden_exists:
          raise FileNotFoundError(
              "Golden file does not exist. Please run with the --update-golden-files option to create."
          )

      output = load_json_file(output_path)

-     if update_golden and not golden_exists:
+     if settings.update_golden and not golden_exists:
          shutil.copyfile(str(output_path), str(golden_path))
          return
      else:
@@ -87,7 +88,7 @@ def assert_metadata_files_equal(
      ignore_paths = (*ignore_paths, *default_exclude_paths)

      diff = diff_metadata_json(output, golden, ignore_paths, ignore_order=ignore_order)
-     if diff and settings.update_golden:
          if isinstance(diff, MCPDiff) and diff.is_delta_valid:
              logger.info(f"Applying delta to golden file {golden_path}")
              diff.apply_delta(golden)
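
Both testing helpers above now read their flags from datahub.testing.pytest_hooks.get_golden_settings() instead of taking update_golden / copy_output parameters or an UPDATE_SQLPARSER_FILES environment variable. The new pytest_hooks module's contents are not shown in this diff; judging only from the call sites, the returned settings object needs to expose two booleans, roughly along these hypothetical lines:

# Hypothetical sketch of the interface implied by the call sites above; the
# real implementation lives in datahub/testing/pytest_hooks.py (new in this
# release, not shown here) and is driven by pytest options such as
# --update-golden-files.
from dataclasses import dataclass

@dataclass
class GoldenFileSettings:
    update_golden: bool  # rewrite golden files instead of asserting against them
    copy_output: bool    # also copy the produced output next to the golden file

def get_golden_settings() -> GoldenFileSettings:
    ...  # in the real module, derived from the active pytest session's options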