acryl-datahub 1.1.1rc4__py3-none-any.whl → 1.2.0.1rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic.

Files changed (223)
  1. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/METADATA +2617 -2590
  2. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/RECORD +223 -189
  3. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/entry_points.txt +2 -0
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/dataset/dataset.py +1 -1
  6. datahub/api/entities/external/__init__.py +0 -0
  7. datahub/api/entities/external/external_entities.py +239 -0
  8. datahub/api/entities/external/external_tag.py +145 -0
  9. datahub/api/entities/external/lake_formation_external_entites.py +161 -0
  10. datahub/api/entities/external/restricted_text.py +247 -0
  11. datahub/api/entities/external/unity_catalog_external_entites.py +173 -0
  12. datahub/cli/check_cli.py +88 -7
  13. datahub/cli/cli_utils.py +63 -0
  14. datahub/cli/container_cli.py +5 -0
  15. datahub/cli/delete_cli.py +124 -27
  16. datahub/cli/docker_check.py +107 -12
  17. datahub/cli/docker_cli.py +149 -227
  18. datahub/cli/exists_cli.py +0 -2
  19. datahub/cli/get_cli.py +0 -2
  20. datahub/cli/iceberg_cli.py +5 -0
  21. datahub/cli/ingest_cli.py +3 -15
  22. datahub/cli/migrate.py +2 -0
  23. datahub/cli/put_cli.py +1 -4
  24. datahub/cli/quickstart_versioning.py +50 -7
  25. datahub/cli/specific/assertions_cli.py +0 -4
  26. datahub/cli/specific/datacontract_cli.py +0 -3
  27. datahub/cli/specific/dataproduct_cli.py +0 -11
  28. datahub/cli/specific/dataset_cli.py +1 -8
  29. datahub/cli/specific/forms_cli.py +0 -4
  30. datahub/cli/specific/group_cli.py +0 -2
  31. datahub/cli/specific/structuredproperties_cli.py +1 -4
  32. datahub/cli/specific/user_cli.py +0 -2
  33. datahub/cli/state_cli.py +0 -2
  34. datahub/cli/timeline_cli.py +0 -2
  35. datahub/configuration/pydantic_migration_helpers.py +7 -5
  36. datahub/emitter/rest_emitter.py +70 -12
  37. datahub/entrypoints.py +4 -3
  38. datahub/ingestion/api/decorators.py +15 -3
  39. datahub/ingestion/api/report.py +332 -3
  40. datahub/ingestion/api/sink.py +3 -0
  41. datahub/ingestion/api/source.py +48 -44
  42. datahub/ingestion/autogenerated/__init__.py +0 -0
  43. datahub/ingestion/autogenerated/capability_summary.json +3449 -0
  44. datahub/ingestion/autogenerated/lineage.json +401 -0
  45. datahub/ingestion/autogenerated/lineage_helper.py +177 -0
  46. datahub/ingestion/extractor/schema_util.py +13 -4
  47. datahub/ingestion/glossary/classification_mixin.py +5 -0
  48. datahub/ingestion/graph/client.py +100 -15
  49. datahub/ingestion/graph/config.py +1 -0
  50. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +20 -10
  51. datahub/ingestion/run/pipeline.py +54 -2
  52. datahub/ingestion/sink/datahub_rest.py +13 -0
  53. datahub/ingestion/source/abs/source.py +1 -1
  54. datahub/ingestion/source/aws/aws_common.py +4 -0
  55. datahub/ingestion/source/aws/glue.py +489 -244
  56. datahub/ingestion/source/aws/tag_entities.py +292 -0
  57. datahub/ingestion/source/azure/azure_common.py +2 -2
  58. datahub/ingestion/source/bigquery_v2/bigquery.py +50 -23
  59. datahub/ingestion/source/bigquery_v2/bigquery_config.py +1 -1
  60. datahub/ingestion/source/bigquery_v2/bigquery_queries.py +1 -0
  61. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +2 -0
  62. datahub/ingestion/source/bigquery_v2/common.py +1 -1
  63. datahub/ingestion/source/bigquery_v2/profiler.py +4 -2
  64. datahub/ingestion/source/bigquery_v2/queries.py +3 -3
  65. datahub/ingestion/source/cassandra/cassandra.py +1 -1
  66. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  67. datahub/ingestion/source/common/subtypes.py +45 -0
  68. datahub/ingestion/source/data_lake_common/object_store.py +115 -27
  69. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  70. datahub/ingestion/source/datahub/datahub_database_reader.py +1 -2
  71. datahub/ingestion/source/dbt/dbt_cloud.py +10 -2
  72. datahub/ingestion/source/dbt/dbt_common.py +6 -2
  73. datahub/ingestion/source/dbt/dbt_core.py +3 -0
  74. datahub/ingestion/source/debug/__init__.py +0 -0
  75. datahub/ingestion/source/debug/datahub_debug.py +300 -0
  76. datahub/ingestion/source/dremio/dremio_api.py +114 -73
  77. datahub/ingestion/source/dremio/dremio_config.py +2 -0
  78. datahub/ingestion/source/dremio/dremio_reporting.py +23 -2
  79. datahub/ingestion/source/dremio/dremio_source.py +94 -81
  80. datahub/ingestion/source/dremio/dremio_sql_queries.py +82 -21
  81. datahub/ingestion/source/file.py +3 -0
  82. datahub/ingestion/source/fivetran/fivetran.py +34 -26
  83. datahub/ingestion/source/gcs/gcs_source.py +13 -2
  84. datahub/ingestion/source/ge_data_profiler.py +76 -28
  85. datahub/ingestion/source/ge_profiling_config.py +11 -0
  86. datahub/ingestion/source/hex/api.py +26 -1
  87. datahub/ingestion/source/iceberg/iceberg.py +3 -1
  88. datahub/ingestion/source/identity/azure_ad.py +1 -1
  89. datahub/ingestion/source/identity/okta.py +1 -14
  90. datahub/ingestion/source/kafka/kafka.py +16 -0
  91. datahub/ingestion/source/kafka_connect/sink_connectors.py +156 -47
  92. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  93. datahub/ingestion/source/looker/looker_source.py +1 -0
  94. datahub/ingestion/source/mlflow.py +11 -1
  95. datahub/ingestion/source/mock_data/__init__.py +0 -0
  96. datahub/ingestion/source/mock_data/datahub_mock_data.py +507 -0
  97. datahub/ingestion/source/mock_data/datahub_mock_data_report.py +12 -0
  98. datahub/ingestion/source/mock_data/table_naming_helper.py +97 -0
  99. datahub/ingestion/source/nifi.py +1 -1
  100. datahub/ingestion/source/powerbi/powerbi.py +1 -5
  101. datahub/ingestion/source/powerbi/rest_api_wrapper/powerbi_api.py +0 -1
  102. datahub/ingestion/source/powerbi_report_server/report_server.py +0 -23
  103. datahub/ingestion/source/preset.py +2 -2
  104. datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -0
  105. datahub/ingestion/source/redshift/redshift.py +21 -1
  106. datahub/ingestion/source/redshift/usage.py +4 -3
  107. datahub/ingestion/source/s3/report.py +4 -2
  108. datahub/ingestion/source/s3/source.py +367 -115
  109. datahub/ingestion/source/sac/sac.py +3 -1
  110. datahub/ingestion/source/salesforce.py +6 -3
  111. datahub/ingestion/source/sigma/sigma.py +7 -1
  112. datahub/ingestion/source/slack/slack.py +2 -1
  113. datahub/ingestion/source/snowflake/snowflake_config.py +30 -7
  114. datahub/ingestion/source/snowflake/snowflake_queries.py +348 -82
  115. datahub/ingestion/source/snowflake/snowflake_summary.py +5 -0
  116. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  117. datahub/ingestion/source/snowflake/snowflake_utils.py +2 -7
  118. datahub/ingestion/source/snowflake/snowflake_v2.py +16 -2
  119. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  120. datahub/ingestion/source/sql/athena.py +119 -11
  121. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  122. datahub/ingestion/source/sql/clickhouse.py +3 -1
  123. datahub/ingestion/source/sql/cockroachdb.py +0 -1
  124. datahub/ingestion/source/sql/hana.py +3 -1
  125. datahub/ingestion/source/sql/hive_metastore.py +3 -11
  126. datahub/ingestion/source/sql/mariadb.py +0 -1
  127. datahub/ingestion/source/sql/mssql/source.py +239 -34
  128. datahub/ingestion/source/sql/mysql.py +0 -1
  129. datahub/ingestion/source/sql/oracle.py +1 -1
  130. datahub/ingestion/source/sql/postgres.py +0 -1
  131. datahub/ingestion/source/sql/sql_common.py +121 -34
  132. datahub/ingestion/source/sql/sql_generic_profiler.py +2 -1
  133. datahub/ingestion/source/sql/teradata.py +997 -235
  134. datahub/ingestion/source/sql/vertica.py +10 -6
  135. datahub/ingestion/source/sql_queries.py +2 -2
  136. datahub/ingestion/source/state/stateful_ingestion_base.py +1 -1
  137. datahub/ingestion/source/superset.py +58 -3
  138. datahub/ingestion/source/tableau/tableau.py +58 -37
  139. datahub/ingestion/source/tableau/tableau_common.py +4 -2
  140. datahub/ingestion/source/tableau/tableau_constant.py +0 -4
  141. datahub/ingestion/source/unity/config.py +5 -0
  142. datahub/ingestion/source/unity/proxy.py +118 -0
  143. datahub/ingestion/source/unity/source.py +195 -17
  144. datahub/ingestion/source/unity/tag_entities.py +295 -0
  145. datahub/ingestion/source/usage/clickhouse_usage.py +4 -1
  146. datahub/ingestion/source/usage/starburst_trino_usage.py +3 -0
  147. datahub/ingestion/transformer/add_dataset_ownership.py +18 -2
  148. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  149. datahub/metadata/_internal_schema_classes.py +1522 -569
  150. datahub/metadata/_urns/urn_defs.py +1826 -1658
  151. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  152. datahub/metadata/com/linkedin/pegasus2avro/identity/__init__.py +2 -0
  153. datahub/metadata/com/linkedin/pegasus2avro/logical/__init__.py +15 -0
  154. datahub/metadata/com/linkedin/pegasus2avro/metadata/key/__init__.py +4 -0
  155. datahub/metadata/com/linkedin/pegasus2avro/module/__init__.py +29 -0
  156. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +4 -0
  157. datahub/metadata/com/linkedin/pegasus2avro/template/__init__.py +25 -0
  158. datahub/metadata/schema.avsc +17758 -17097
  159. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  160. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  161. datahub/metadata/schemas/Applications.avsc +38 -0
  162. datahub/metadata/schemas/ChartKey.avsc +1 -0
  163. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  164. datahub/metadata/schemas/ContainerProperties.avsc +8 -0
  165. datahub/metadata/schemas/CorpUserSettings.avsc +41 -0
  166. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  167. datahub/metadata/schemas/DataFlowInfo.avsc +8 -0
  168. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  169. datahub/metadata/schemas/DataHubPageModuleKey.avsc +21 -0
  170. datahub/metadata/schemas/DataHubPageModuleProperties.avsc +237 -0
  171. datahub/metadata/schemas/DataHubPageTemplateKey.avsc +21 -0
  172. datahub/metadata/schemas/DataHubPageTemplateProperties.avsc +175 -0
  173. datahub/metadata/schemas/DataHubPolicyInfo.avsc +12 -1
  174. datahub/metadata/schemas/DataJobInfo.avsc +8 -0
  175. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  176. datahub/metadata/schemas/DataProcessKey.avsc +8 -0
  177. datahub/metadata/schemas/DataProductKey.avsc +1 -0
  178. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  179. datahub/metadata/schemas/DatasetKey.avsc +11 -1
  180. datahub/metadata/schemas/GlobalSettingsInfo.avsc +62 -0
  181. datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
  182. datahub/metadata/schemas/IcebergWarehouseInfo.avsc +8 -0
  183. datahub/metadata/schemas/LogicalParent.avsc +140 -0
  184. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  185. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  186. datahub/metadata/schemas/MLModelDeploymentKey.avsc +8 -0
  187. datahub/metadata/schemas/MLModelGroupKey.avsc +9 -0
  188. datahub/metadata/schemas/MLModelKey.avsc +9 -0
  189. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  190. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -1
  191. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  192. datahub/metadata/schemas/QuerySubjects.avsc +1 -12
  193. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  194. datahub/metadata/schemas/__init__.py +3 -3
  195. datahub/sdk/__init__.py +2 -0
  196. datahub/sdk/_all_entities.py +7 -0
  197. datahub/sdk/_shared.py +116 -0
  198. datahub/sdk/chart.py +315 -0
  199. datahub/sdk/container.py +7 -0
  200. datahub/sdk/dashboard.py +432 -0
  201. datahub/sdk/dataflow.py +7 -0
  202. datahub/sdk/datajob.py +45 -13
  203. datahub/sdk/dataset.py +8 -2
  204. datahub/sdk/entity_client.py +82 -2
  205. datahub/sdk/lineage_client.py +683 -82
  206. datahub/sdk/main_client.py +46 -16
  207. datahub/sdk/mlmodel.py +101 -38
  208. datahub/sdk/mlmodelgroup.py +7 -0
  209. datahub/sdk/search_client.py +4 -3
  210. datahub/sdk/search_filters.py +95 -27
  211. datahub/specific/chart.py +1 -1
  212. datahub/specific/dataproduct.py +4 -0
  213. datahub/sql_parsing/sql_parsing_aggregator.py +29 -17
  214. datahub/sql_parsing/sqlglot_lineage.py +62 -13
  215. datahub/telemetry/telemetry.py +17 -11
  216. datahub/testing/sdk_v2_helpers.py +7 -1
  217. datahub/upgrade/upgrade.py +56 -14
  218. datahub/utilities/server_config_util.py +8 -0
  219. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  220. datahub/utilities/stats_collections.py +4 -0
  221. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/WHEEL +0 -0
  222. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/licenses/LICENSE +0 -0
  223. {acryl_datahub-1.1.1rc4.dist-info → acryl_datahub-1.2.0.1rc1.dist-info}/top_level.txt +0 -0

datahub/sql_parsing/sql_parsing_aggregator.py

@@ -58,6 +58,7 @@ from datahub.sql_parsing.tool_meta_extractor import (
     ToolMetaExtractorReport,
 )
 from datahub.utilities.cooperative_timeout import CooperativeTimeoutError
+from datahub.utilities.dedup_list import deduplicate_list
 from datahub.utilities.file_backed_collections import (
     ConnectionWrapper,
     FileBackedDict,
@@ -140,6 +141,7 @@ class QueryMetadata:
 
     used_temp_tables: bool = True
 
+    extra_info: Optional[dict] = None
     origin: Optional[Urn] = None
 
     def make_created_audit_stamp(self) -> models.AuditStampClass:
@@ -263,7 +265,7 @@ class PreparsedQuery:
     query_type_props: QueryTypeProps = dataclasses.field(
         default_factory=lambda: QueryTypeProps()
     )
-    # Use this to store addtitional key-value information about query for debugging
+    # Use this to store additional key-value information about the query for debugging.
     extra_info: Optional[dict] = None
     origin: Optional[Urn] = None
 
@@ -948,6 +950,7 @@ class SqlParsingAggregator(Closeable):
                 column_usage=parsed.column_usage or {},
                 confidence_score=parsed.confidence_score,
                 used_temp_tables=session_has_temp_tables,
+                extra_info=parsed.extra_info,
                 origin=parsed.origin,
             )
         )
@@ -1491,9 +1494,9 @@ class SqlParsingAggregator(Closeable):
            return
 
        # If a query doesn't involve any allowed tables, skip it.
-        if downstream_urn is None and not any(
-            self.is_allowed_table(urn) for urn in query.upstreams
-        ):
+        if (
+            downstream_urn is None or not self.is_allowed_table(downstream_urn)
+        ) and not any(self.is_allowed_table(urn) for urn in query.upstreams):
            self.report.num_queries_skipped_due_to_filters += 1
            return
 
@@ -1574,27 +1577,33 @@
 
        @dataclasses.dataclass
        class QueryLineageInfo:
-            upstreams: List[UrnStr]  # this is direct upstreams, with *no temp tables*
-            column_lineage: List[ColumnLineageInfo]
+            upstreams: OrderedSet[
+                UrnStr
+            ]  # this is direct upstreams, with *no temp tables*
+            column_lineage: OrderedSet[ColumnLineageInfo]
            confidence_score: float
 
            def _merge_lineage_from(self, other_query: "QueryLineageInfo") -> None:
-                self.upstreams += other_query.upstreams
-                self.column_lineage += other_query.column_lineage
+                self.upstreams.update(other_query.upstreams)
+                self.column_lineage.update(other_query.column_lineage)
                self.confidence_score = min(
                    self.confidence_score, other_query.confidence_score
                )
 
+        cache: Dict[str, QueryLineageInfo] = {}
+
        def _recurse_into_query(
            query: QueryMetadata, recursion_path: List[QueryId]
        ) -> QueryLineageInfo:
            if query.query_id in recursion_path:
                # This is a cycle, so we just return the query as-is.
                return QueryLineageInfo(
-                    upstreams=query.upstreams,
-                    column_lineage=query.column_lineage,
+                    upstreams=OrderedSet(query.upstreams),
+                    column_lineage=OrderedSet(query.column_lineage),
                    confidence_score=query.confidence_score,
                )
+            if query.query_id in cache:
+                return cache[query.query_id]
            recursion_path = [*recursion_path, query.query_id]
            composed_of_queries.add(query.query_id)
 
@@ -1609,7 +1618,7 @@ class SqlParsingAggregator(Closeable):
                upstream_query = self._query_map.get(upstream_query_id)
                if (
                    upstream_query
-                    and upstream_query.query_id not in composed_of_queries
+                    and upstream_query.query_id not in recursion_path
                ):
                    temp_query_lineage_info = _recurse_into_query(
                        upstream_query, recursion_path
@@ -1669,11 +1678,14 @@
                ]
            )
 
-            return QueryLineageInfo(
-                upstreams=list(new_upstreams),
-                column_lineage=new_cll,
+            ret = QueryLineageInfo(
+                upstreams=new_upstreams,
+                column_lineage=OrderedSet(new_cll),
                confidence_score=new_confidence_score,
            )
+            cache[query.query_id] = ret
+
+            return ret
 
        resolved_lineage_info = _recurse_into_query(base_query, [])
 
@@ -1706,15 +1718,15 @@
        )
 
        merged_query_text = ";\n\n".join(
-            [q.formatted_query_string for q in ordered_queries]
+            deduplicate_list([q.formatted_query_string for q in ordered_queries])
        )
 
        resolved_query = dataclasses.replace(
            base_query,
            query_id=composite_query_id,
            formatted_query_string=merged_query_text,
-            upstreams=resolved_lineage_info.upstreams,
-            column_lineage=resolved_lineage_info.column_lineage,
+            upstreams=list(resolved_lineage_info.upstreams),
+            column_lineage=list(resolved_lineage_info.column_lineage),
            confidence_score=resolved_lineage_info.confidence_score,
        )
 
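
The hunks above swap plain lists for ordered, deduplicated collections and add a per-call `cache` so that `_recurse_into_query` resolves each temp-table query only once. Below is a minimal, self-contained sketch of that memoization pattern, not the aggregator itself: `Query` and `resolve_upstreams` are hypothetical stand-ins, and a plain `set` is used where the real code keeps an ordered collection.

    from dataclasses import dataclass
    from typing import Dict, List, Set


    @dataclass
    class Query:
        query_id: str
        upstreams: List[str]              # upstream table urns
        temp_upstream_queries: List[str]  # ids of queries that produced temp tables


    def resolve_upstreams(queries: Dict[str, Query], start_id: str) -> Set[str]:
        cache: Dict[str, Set[str]] = {}

        def _recurse(query_id: str, path: List[str]) -> Set[str]:
            if query_id in path:
                # Cycle: return the query's own upstreams, mirroring the recursion_path guard.
                return set(queries[query_id].upstreams)
            if query_id in cache:
                # Memoized result, mirroring the new cache dict above.
                return cache[query_id]
            query = queries[query_id]
            result = set(query.upstreams)
            for upstream_id in query.temp_upstream_queries:
                result |= _recurse(upstream_id, [*path, query_id])
            cache[query_id] = result
            return result

        return _recurse(start_id, [])

Without the cache, queries that share the same temp-table upstreams are re-walked once per reference; with it, each query id is resolved at most once per top-level call.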

datahub/sql_parsing/sqlglot_lineage.py

@@ -56,6 +56,7 @@ from datahub.sql_parsing.sql_parsing_common import (
     QueryTypeProps,
 )
 from datahub.sql_parsing.sqlglot_utils import (
+    DialectOrStr,
     get_dialect,
     get_query_fingerprint_debug,
     is_dialect_instance,
@@ -124,6 +125,17 @@ class _DownstreamColumnRef(_ParserBaseModel):
 
 
 class DownstreamColumnRef(_ParserBaseModel):
+    """
+    TODO: Instead of implementing custom __hash__ function this class should simply inherit from _FrozenModel.
+    What stops us is that `column_type` field of type `SchemaFieldDataTypeClass` is not hashable - it's an
+    auto-generated class from .pdl model files. We need generic solution allowing us to either:
+    1. Implement hashing for .pdl model objects
+    2. Reliably provide pydantic (both v1 and v2) with information to skip particular fields from default
+       hash function - with a twist here that _FrozenModel implements its own `__lt__` function - it needs
+       to understand that instruction as well.
+    Instances of this class needs to be hashable as we store them in a set when processing lineage from queries.
+    """
+
     table: Optional[Urn] = None
     column: str
     column_type: Optional[SchemaFieldDataTypeClass] = None
@@ -139,8 +151,11 @@ class DownstreamColumnRef(_ParserBaseModel):
            return v
        return SchemaFieldDataTypeClass.from_obj(v)
 
+    def __hash__(self) -> int:
+        return hash((self.table, self.column, self.native_column_type))
+
 
-class ColumnTransformation(_ParserBaseModel):
+class ColumnTransformation(_FrozenModel):
     is_direct_copy: bool
     column_logic: str
 
@@ -153,11 +168,21 @@ class _ColumnLineageInfo(_ParserBaseModel):
 
 
 class ColumnLineageInfo(_ParserBaseModel):
+    """
+    TODO: Instead of implementing custom __hash__ function this class should simply inherit from _FrozenModel.
+    To achieve this, we need to change `upstreams` to `Tuple[ColumnRef, ...]` - along with many code lines
+    depending on it.
+    Instances of this class needs to be hashable as we store them in a set when processing lineage from queries.
+    """
+
     downstream: DownstreamColumnRef
     upstreams: List[ColumnRef]
 
     logic: Optional[ColumnTransformation] = pydantic.Field(default=None)
 
+    def __hash__(self) -> int:
+        return hash((self.downstream, tuple(self.upstreams), self.logic))
+
 
 class _JoinInfo(_ParserBaseModel):
     join_type: str
@@ -1231,12 +1256,12 @@ def _sqlglot_lineage_inner(
     schema_resolver: SchemaResolverInterface,
     default_db: Optional[str] = None,
     default_schema: Optional[str] = None,
-    default_dialect: Optional[str] = None,
+    override_dialect: Optional[DialectOrStr] = None,
 ) -> SqlParsingResult:
-    if not default_dialect:
-        dialect = get_dialect(schema_resolver.platform)
+    if override_dialect:
+        dialect = get_dialect(override_dialect)
     else:
-        dialect = get_dialect(default_dialect)
+        dialect = get_dialect(schema_resolver.platform)
 
     default_db = _normalize_db_or_schema(default_db, dialect)
     default_schema = _normalize_db_or_schema(default_schema, dialect)
@@ -1423,7 +1448,7 @@ def _sqlglot_lineage_nocache(
     schema_resolver: SchemaResolverInterface,
     default_db: Optional[str] = None,
     default_schema: Optional[str] = None,
-    default_dialect: Optional[str] = None,
+    override_dialect: Optional[DialectOrStr] = None,
 ) -> SqlParsingResult:
     """Parse a SQL statement and generate lineage information.
 
@@ -1441,8 +1466,8 @@
     can be brittle with respect to missing schema information and complex
     SQL logic like UNNESTs.
 
-    The SQL dialect can be given as an argument called default_dialect or it can
-    be inferred from the schema_resolver's platform.
+    The SQL dialect will be inferred from the schema_resolver's platform.
+    That inference can be overridden by passing an override_dialect argument.
     The set of supported dialects is the same as sqlglot's. See their
     `documentation <https://sqlglot.com/sqlglot/dialects/dialect.html#Dialects>`_
     for the full list.
@@ -1457,7 +1482,7 @@
        schema_resolver: The schema resolver to use for resolving table schemas.
        default_db: The default database to use for unqualified table names.
        default_schema: The default schema to use for unqualified table names.
-        default_dialect: A default dialect to override the dialect provided by 'schema_resolver'.
+        override_dialect: Override the dialect provided by 'schema_resolver'.
 
     Returns:
        A SqlParsingResult object containing the parsed lineage information.
@@ -1482,10 +1507,32 @@
            schema_resolver=schema_resolver,
            default_db=default_db,
            default_schema=default_schema,
-            default_dialect=default_dialect,
+            override_dialect=override_dialect,
        )
    except Exception as e:
        return SqlParsingResult.make_from_error(e)
+    except BaseException as e:
+        # Check if this is a PanicException from SQLGlot's Rust tokenizer
+        # We use runtime type checking instead of isinstance() because pyo3_runtime
+        # is only available when sqlglot[rs] is installed and may not be importable
+        # at module load time, but the exception can still be raised at runtime
+        if (
+            e.__class__.__name__ == "PanicException"
+            and e.__class__.__module__ == "pyo3_runtime"
+        ):
+            # Handle pyo3_runtime.PanicException from SQLGlot's Rust tokenizer.
+            # pyo3_runtime.PanicException inherits from BaseException (like SystemExit or
+            # KeyboardInterrupt) rather than Exception, so it bypasses normal exception handling.
+            # Avoid catching BaseException, as it includes KeyboardInterrupt
+            # and would prevent Ctrl+C from working.
+            wrapped_exception = Exception(
+                f"pyo3_runtime.PanicException during SQL parsing: {e}"
+            )
+            wrapped_exception.__cause__ = e
+            return SqlParsingResult.make_from_error(wrapped_exception)
+        else:
+            # Re-raise other BaseException types (SystemExit, KeyboardInterrupt, etc.)
+            raise
 
 
 _sqlglot_lineage_cached = functools.lru_cache(maxsize=SQL_PARSE_RESULT_CACHE_SIZE)(
@@ -1498,15 +1545,15 @@ def sqlglot_lineage(
     schema_resolver: SchemaResolverInterface,
     default_db: Optional[str] = None,
     default_schema: Optional[str] = None,
-    default_dialect: Optional[str] = None,
+    override_dialect: Optional[DialectOrStr] = None,
 ) -> SqlParsingResult:
     if schema_resolver.includes_temp_tables():
        return _sqlglot_lineage_nocache(
-            sql, schema_resolver, default_db, default_schema, default_dialect
+            sql, schema_resolver, default_db, default_schema, override_dialect
        )
     else:
        return _sqlglot_lineage_cached(
-            sql, schema_resolver, default_db, default_schema, default_dialect
+            sql, schema_resolver, default_db, default_schema, override_dialect
        )
 
 
@@ -1558,6 +1605,7 @@ def create_lineage_sql_parsed_result(
     default_schema: Optional[str] = None,
     graph: Optional[DataHubGraph] = None,
     schema_aware: bool = True,
+    override_dialect: Optional[DialectOrStr] = None,
 ) -> SqlParsingResult:
     schema_resolver = create_schema_resolver(
        platform=platform,
@@ -1577,6 +1625,7 @@
            schema_resolver=schema_resolver,
            default_db=default_db,
            default_schema=default_schema,
+            override_dialect=override_dialect,
        )
    except Exception as e:
        return SqlParsingResult.make_from_error(e)
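
The new `except BaseException` branch above identifies SQLGlot's Rust tokenizer panic by class name and module rather than `isinstance()`, because `pyo3_runtime` ships only with `sqlglot[rs]` and may not be importable. A small standalone sketch of that classification logic, with illustrative return strings only:

    def classify(e: BaseException) -> str:
        # Ordinary exceptions are handled by the earlier `except Exception` clause.
        if isinstance(e, Exception):
            return "ordinary error: wrap into SqlParsingResult"
        # Identify the Rust panic without importing pyo3_runtime.
        if (
            e.__class__.__name__ == "PanicException"
            and e.__class__.__module__ == "pyo3_runtime"
        ):
            return "rust panic: wrap, keep the original as __cause__"
        # Everything else (SystemExit, KeyboardInterrupt, ...) must propagate.
        return "re-raise"


    print(classify(ValueError("bad sql")))  # ordinary error: wrap into SqlParsingResult
    print(classify(KeyboardInterrupt()))    # re-raise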

datahub/telemetry/telemetry.py

@@ -104,7 +104,7 @@ SENTRY_DSN: Optional[str] = os.environ.get("SENTRY_DSN", None)
 SENTRY_ENVIRONMENT: str = os.environ.get("SENTRY_ENVIRONMENT", "dev")
 
 
-def _default_telemetry_properties() -> Dict[str, Any]:
+def _default_global_properties() -> Dict[str, Any]:
     return {
        "datahub_version": nice_version_name(),
        "python_version": platform.python_version(),
@@ -122,6 +122,7 @@ class Telemetry:
     context_properties: Dict[str, Any] = {}
 
     def __init__(self):
+        self.global_properties = _default_global_properties()
        self.context_properties = {}
 
        if SENTRY_DSN:
@@ -247,6 +248,10 @@ class Telemetry:
 
        return False
 
+    def add_global_property(self, key: str, value: Any) -> None:
+        self.global_properties[key] = value
+        self._update_sentry_properties()
+
     def set_context(
        self,
        server: Optional["DataHubGraph"] = None,
@@ -257,16 +262,17 @@ class Telemetry:
            **(properties or {}),
        }
 
-        if self.sentry_enabled:
-            from sentry_sdk import set_tag
+        self._update_sentry_properties()
 
-            properties = {
-                **_default_telemetry_properties(),
-                **self.context_properties,
-            }
+    def _update_sentry_properties(self) -> None:
+        properties = {
+            **self.global_properties,
+            **self.context_properties,
+        }
+        if self.sentry_enabled:
+            import sentry_sdk
 
-            for key in properties:
-                set_tag(key, properties[key])
+            sentry_sdk.set_tags(properties)
 
     def init_capture_exception(self) -> None:
        if self.sentry_enabled:
@@ -300,7 +306,7 @@ class Telemetry:
        try:
            self.mp.people_set(
                self.client_id,
-                _default_telemetry_properties(),
+                self.global_properties,
            )
        except Exception as e:
            logger.debug(f"Error initializing telemetry: {e}")
@@ -334,7 +340,7 @@ class Telemetry:
        logger.debug(f"Sending telemetry for {event_name}")
 
        properties = {
-            **_default_telemetry_properties(),
+            **self.global_properties,
            **self.context_properties,
            **properties,
        }
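
In the rewritten payload construction above, properties are merged as global, then context, then event-specific, so later expansions take precedence. A toy illustration of that precedence (all values below are made up):

    global_properties = {"datahub_version": "1.2.0.1rc1", "os": "linux"}
    context_properties = {"server_type": "quickstart", "os": "macos"}
    event_properties = {"status": "completed", "duration": 1.5}

    merged = {**global_properties, **context_properties, **event_properties}
    print(merged["os"])      # "macos": context properties override global defaults
    print(merged["status"])  # "completed": event properties override both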

datahub/testing/sdk_v2_helpers.py

@@ -1,12 +1,18 @@
 import pathlib
+from typing import Sequence
 
 from datahub.sdk.entity import Entity
 from datahub.testing import mce_helpers
 
 
-def assert_entity_golden(entity: Entity, golden_path: pathlib.Path) -> None:
+def assert_entity_golden(
+    entity: Entity,
+    golden_path: pathlib.Path,
+    ignore_paths: Sequence[str] = (),
+) -> None:
     mce_helpers.check_goldens_stream(
        outputs=entity.as_mcps(),
        golden_path=golden_path,
        ignore_order=False,
+        ignore_paths=ignore_paths,
     )

datahub/upgrade/upgrade.py

@@ -7,7 +7,7 @@ from typing import Any, Callable, Optional, Tuple, TypeVar
 
 import click
 import humanfriendly
-from packaging.version import Version
+from packaging.version import InvalidVersion, Version
 from pydantic import BaseModel
 
 from datahub._version import __version__
@@ -28,10 +28,23 @@ class VersionStats(BaseModel, arbitrary_types_allowed=True):
     release_date: Optional[datetime] = None
 
 
+def _safe_version_stats(version_string: str) -> Optional[VersionStats]:
+    """
+    Safely create a VersionStats object from a version string.
+    Returns None if the version string is invalid.
+    """
+    try:
+        return VersionStats(version=Version(version_string), release_date=None)
+    except InvalidVersion:
+        log.warning(f"Invalid version format received: {version_string!r}")
+        return None
+
+
 class ServerVersionStats(BaseModel):
     current: VersionStats
     latest: Optional[VersionStats] = None
     current_server_type: Optional[str] = None
+    current_server_default_cli_version: Optional[VersionStats] = None
 
 
 class ClientVersionStats(BaseModel):
@@ -44,7 +57,7 @@ class DataHubVersionStats(BaseModel):
     client: ClientVersionStats
 
 
-async def get_client_version_stats():
+async def get_client_version_stats() -> ClientVersionStats:
     import aiohttp
 
     current_version_string = __version__
@@ -52,6 +65,7 @@ async def get_client_version_stats():
     client_version_stats: ClientVersionStats = ClientVersionStats(
        current=VersionStats(version=current_version, release_date=None), latest=None
     )
+
     async with aiohttp.ClientSession() as session:
        pypi_url = "https://pypi.org/pypi/acryl_datahub/json"
        async with session.get(pypi_url) as resp:
@@ -131,7 +145,7 @@ async def get_server_config(gms_url: str, token: Optional[str]) -> RestServiceConfig:
 
 async def get_server_version_stats(
     server: Optional[DataHubGraph] = None,
-) -> Tuple[Optional[str], Optional[Version], Optional[datetime]]:
+) -> Tuple[Optional[str], Optional[Version], Optional[str], Optional[datetime]]:
     import aiohttp
 
     server_config: Optional[RestServiceConfig] = None
@@ -151,12 +165,13 @@
 
     server_type = None
     server_version: Optional[Version] = None
+    current_server_default_cli_version = None
     current_server_release_date = None
     if server_config:
        server_version_string = server_config.service_version
        commit_hash = server_config.commit_hash
        server_type = server_config.server_type
-
+        current_server_default_cli_version = server_config.default_cli_version
        if server_type == "quickstart" and commit_hash:
            async with aiohttp.ClientSession(
                headers={"Accept": "application/vnd.github.v3+json"}
@@ -171,7 +186,12 @@
     if server_version_string and server_version_string.startswith("v"):
        server_version = Version(server_version_string[1:])
 
-    return (server_type, server_version, current_server_release_date)
+    return (
+        server_type,
+        server_version,
+        current_server_default_cli_version,
+        current_server_release_date,
+    )
 
 
 def retrieve_version_stats(
@@ -214,6 +234,7 @@ async def _retrieve_version_stats(
     (
        current_server_type,
        current_server_version,
+        current_server_default_cli_version,
        current_server_release_date,
     ) = results[2]
 
@@ -223,6 +244,11 @@
        current=VersionStats(
            version=current_server_version, release_date=current_server_release_date
        ),
+        current_server_default_cli_version=(
+            _safe_version_stats(current_server_default_cli_version)
+            if current_server_default_cli_version
+            else None
+        ),
        latest=(
            VersionStats(version=last_server_version, release_date=last_server_date)
            if last_server_version
@@ -255,21 +281,14 @@ def valid_client_version(version: Version) -> bool:
     """Only version strings like 0.4.5 and 0.6.7.8 are valid. 0.8.6.7rc1 is not"""
     if version.is_prerelease or version.is_postrelease or version.is_devrelease:
        return False
-    if version.major == 0 and version.minor in [8, 9, 10, 11]:
-        return True
-
-    return False
+    return True
 
 
 def valid_server_version(version: Version) -> bool:
     """Only version strings like 0.8.x, 0.9.x or 0.10.x are valid. 0.1.x is not"""
     if version.is_prerelease or version.is_postrelease or version.is_devrelease:
        return False
-
-    if version.major == 0 and version.minor in [8, 9, 10]:
-        return True
-
-    return False
+    return True
 
 
 def is_client_server_compatible(client: VersionStats, server: VersionStats) -> int:
@@ -291,6 +310,27 @@ def is_client_server_compatible(client: VersionStats, server: VersionStats) -> int:
     return server.version.micro - client.version.micro
 
 
+def is_server_default_cli_ahead(version_stats: DataHubVersionStats) -> bool:
+    """
+    Check if the server default CLI version is ahead of the current CLI version.
+    Returns True if server default CLI is newer and both versions are valid.
+    """
+    if not version_stats.server.current_server_default_cli_version:
+        return False
+
+    current_cli = version_stats.client.current
+    server_default_cli = version_stats.server.current_server_default_cli_version
+
+    is_valid_client_version = valid_client_version(current_cli.version)
+    is_valid_server_version = valid_client_version(server_default_cli.version)
+
+    if not (is_valid_client_version and is_valid_server_version):
+        return False
+
+    compatibility_result = is_client_server_compatible(current_cli, server_default_cli)
+    return compatibility_result > 0
+
+
 def _maybe_print_upgrade_message(
     version_stats: Optional[DataHubVersionStats],
 ) -> None:
@@ -429,6 +469,8 @@ def check_upgrade_post(
 
 
 def check_upgrade(func: Callable[..., T]) -> Callable[..., T]:
+    log.debug(f"Checking upgrade for {func.__module__}.{func.__name__}")
+
     @wraps(func)
     def async_wrapper(*args: Any, **kwargs: Any) -> Any:
        with PerfTimer() as timer:
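
The new `_safe_version_stats` helper above exists because `packaging.version.Version` raises `InvalidVersion` on strings that are not PEP 440 compliant, which a server-supplied default CLI version could plausibly be. A quick standalone illustration of that behavior:

    from packaging.version import InvalidVersion, Version

    for raw in ["1.2.0.1rc1", "v1.2.0", "not-a-version"]:
        try:
            print(raw, "->", Version(raw))  # valid strings parse (a leading "v" is allowed)
        except InvalidVersion:
            print(raw, "-> invalid; the helper would log a warning and return None")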

datahub/utilities/server_config_util.py

@@ -183,6 +183,14 @@ class RestServiceConfig:
        managed_ingestion = self.raw_config.get("managedIngestion") or {}
        return managed_ingestion.get("enabled", False)
 
+    @property
+    def default_cli_version(self) -> Optional[str]:
+        """
+        Get the default CLI version.
+        """
+        managed_ingestion = self.raw_config.get("managedIngestion") or {}
+        return managed_ingestion.get("defaultCliVersion")
+
     @property
     def is_datahub_cloud(self) -> bool:
        """

datahub/utilities/sqlalchemy_query_combiner.py

@@ -272,8 +272,11 @@ class SQLAlchemyQueryCombiner:
            self.report.uncombined_queries_issued += 1
            return _sa_execute_underlying_method(conn, query, *args, **kwargs)
 
-        with _sa_execute_method_patching_lock, unittest.mock.patch(
-            "sqlalchemy.engine.Connection.execute", _sa_execute_fake
+        with (
+            _sa_execute_method_patching_lock,
+            unittest.mock.patch(
+                "sqlalchemy.engine.Connection.execute", _sa_execute_fake
+            ),
        ):
            yield self
 

datahub/utilities/stats_collections.py

@@ -56,3 +56,7 @@ class TopKDict(DefaultDict[_KT, _VT]):
 
 def int_top_k_dict() -> TopKDict[str, int]:
     return TopKDict(int)
+
+
+def float_top_k_dict() -> TopKDict[str, float]:
+    return TopKDict(float)