acryl-datahub 0.15.0.1rc16__py3-none-any.whl → 0.15.0.2rc1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.


Files changed (25)
  1. {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2rc1.dist-info}/METADATA +2324 -2324
  2. {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2rc1.dist-info}/RECORD +25 -25
  3. datahub/__init__.py +1 -1
  4. datahub/cli/cli_utils.py +12 -1
  5. datahub/emitter/rest_emitter.py +140 -92
  6. datahub/ingestion/api/auto_work_units/auto_ensure_aspect_size.py +5 -3
  7. datahub/ingestion/api/source.py +4 -0
  8. datahub/ingestion/glossary/classifier.py +2 -3
  9. datahub/ingestion/graph/client.py +14 -11
  10. datahub/ingestion/graph/config.py +1 -1
  11. datahub/ingestion/source/aws/glue.py +52 -35
  12. datahub/ingestion/source/bigquery_v2/bigquery.py +2 -0
  13. datahub/ingestion/source/bigquery_v2/bigquery_config.py +8 -0
  14. datahub/ingestion/source/datahub/config.py +10 -0
  15. datahub/ingestion/source/datahub/datahub_database_reader.py +3 -17
  16. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  17. datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py +11 -7
  18. datahub/ingestion/source/snowflake/snowflake_config.py +8 -0
  19. datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +4 -0
  20. datahub/ingestion/source/snowflake/snowflake_v2.py +2 -0
  21. datahub/ingestion/source/unity/source.py +0 -4
  22. datahub/sql_parsing/sql_parsing_aggregator.py +8 -5
  23. {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2rc1.dist-info}/WHEEL +0 -0
  24. {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2rc1.dist-info}/entry_points.txt +0 -0
  25. {acryl_datahub-0.15.0.1rc16.dist-info → acryl_datahub-0.15.0.2rc1.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/aws/glue.py
@@ -1054,49 +1054,66 @@ class GlueSource(StatefulIngestionSourceBase):
             yield from self.gen_database_containers(database)
 
         for table in tables:
-            database_name = table["DatabaseName"]
             table_name = table["Name"]
-            full_table_name = f"{database_name}.{table_name}"
-            self.report.report_table_scanned()
-            if not self.source_config.database_pattern.allowed(
-                database_name
-            ) or not self.source_config.table_pattern.allowed(full_table_name):
-                self.report.report_table_dropped(full_table_name)
-                continue
+            try:
+                yield from self._gen_table_wu(table=table)
+            except KeyError as e:
+                self.report.report_failure(
+                    message="Failed to extract workunit for table",
+                    context=f"Table: {table_name}",
+                    exc=e,
+                )
+        if self.extract_transforms:
+            yield from self._transform_extraction()
 
-            dataset_urn = make_dataset_urn_with_platform_instance(
-                platform=self.platform,
-                name=full_table_name,
-                env=self.env,
-                platform_instance=self.source_config.platform_instance,
-            )
+    def _gen_table_wu(self, table: Dict) -> Iterable[MetadataWorkUnit]:
+        database_name = table["DatabaseName"]
+        table_name = table["Name"]
+        full_table_name = f"{database_name}.{table_name}"
+        self.report.report_table_scanned()
+        if not self.source_config.database_pattern.allowed(
+            database_name
+        ) or not self.source_config.table_pattern.allowed(full_table_name):
+            self.report.report_table_dropped(full_table_name)
+            return
+
+        dataset_urn = make_dataset_urn_with_platform_instance(
+            platform=self.platform,
+            name=full_table_name,
+            env=self.env,
+            platform_instance=self.source_config.platform_instance,
+        )
 
-            mce = self._extract_record(dataset_urn, table, full_table_name)
-            yield MetadataWorkUnit(full_table_name, mce=mce)
+        mce = self._extract_record(dataset_urn, table, full_table_name)
+        yield MetadataWorkUnit(full_table_name, mce=mce)
 
-            # We also want to assign "table" subType to the dataset representing glue table - unfortunately it is not
-            # possible via Dataset snapshot embedded in a mce, so we have to generate a mcp.
-            yield MetadataChangeProposalWrapper(
-                entityUrn=dataset_urn,
-                aspect=SubTypes(typeNames=[DatasetSubTypes.TABLE]),
-            ).as_workunit()
+        # We also want to assign "table" subType to the dataset representing glue table - unfortunately it is not
+        # possible via Dataset snapshot embedded in a mce, so we have to generate a mcp.
+        yield MetadataChangeProposalWrapper(
+            entityUrn=dataset_urn,
+            aspect=SubTypes(typeNames=[DatasetSubTypes.TABLE]),
+        ).as_workunit()
 
-            yield from self._get_domain_wu(
-                dataset_name=full_table_name,
-                entity_urn=dataset_urn,
-            )
-            yield from self.add_table_to_database_container(
-                dataset_urn=dataset_urn, db_name=database_name
-            )
+        yield from self._get_domain_wu(
+            dataset_name=full_table_name,
+            entity_urn=dataset_urn,
+        )
+        yield from self.add_table_to_database_container(
+            dataset_urn=dataset_urn, db_name=database_name
+        )
 
-            wu = self.get_lineage_if_enabled(mce)
-            if wu:
-                yield wu
+        wu = self.get_lineage_if_enabled(mce)
+        if wu:
+            yield wu
 
+        try:
             yield from self.get_profile_if_enabled(mce, database_name, table_name)
-
-        if self.extract_transforms:
-            yield from self._transform_extraction()
+        except KeyError as e:
+            self.report.report_failure(
+                message="Failed to extract profile for table",
+                context=f"Table: {dataset_urn}",
+                exc=e,
+            )
 
     def _transform_extraction(self) -> Iterable[MetadataWorkUnit]:
         dags: Dict[str, Optional[Dict[str, Any]]] = {}
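
The net effect of this hunk is that per-table extraction moves into `_gen_table_wu`, and a `KeyError` raised while processing one table is reported and skipped instead of aborting the whole run. A minimal standalone sketch of that pattern (hypothetical names, not the actual GlueSource code):

```python
# Minimal sketch: one malformed table is reported and skipped, the rest still yield.
from typing import Any, Dict, Iterable, List


def gen_table_names(tables: List[Dict[str, Any]]) -> Iterable[str]:
    for table in tables:
        table_name = table.get("Name", "<unknown>")
        try:
            # A missing key (e.g. "DatabaseName") raises KeyError for this table only.
            yield f'{table["DatabaseName"]}.{table["Name"]}'
        except KeyError as e:
            print(f"Failed to extract workunit for table {table_name}: missing {e}")


print(list(gen_table_names([
    {"DatabaseName": "sales", "Name": "orders"},
    {"Name": "broken"},  # no DatabaseName -> reported, not fatal
])))  # ['sales.orders']
```

Catching only `KeyError` keeps genuinely unexpected failures visible while tolerating malformed catalog entries.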
datahub/ingestion/source/bigquery_v2/bigquery.py
@@ -281,6 +281,8 @@ class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource):
                 include_lineage=self.config.include_table_lineage,
                 include_usage_statistics=self.config.include_usage_statistics,
                 include_operations=self.config.usage.include_operational_stats,
+                include_queries=self.config.include_queries,
+                include_query_usage_statistics=self.config.include_query_usage_statistics,
                 top_n_queries=self.config.usage.top_n_queries,
                 region_qualifiers=self.config.region_qualifiers,
             ),
datahub/ingestion/source/bigquery_v2/bigquery_config.py
@@ -447,6 +447,14 @@ class BigQueryV2Config(
         default=False,
         description="If enabled, uses the new queries extractor to extract queries from bigquery.",
     )
+    include_queries: bool = Field(
+        default=True,
+        description="If enabled, generate query entities associated with lineage edges. Only applicable if `use_queries_v2` is enabled.",
+    )
+    include_query_usage_statistics: bool = Field(
+        default=True,
+        description="If enabled, generate query popularity statistics. Only applicable if `use_queries_v2` is enabled.",
+    )
 
     @property
     def have_table_data_read_permission(self) -> bool:
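
Both new flags default to on but only take effect when `use_queries_v2` is enabled. A standalone pydantic sketch (not the real `BigQueryV2Config`) showing how the fields behave:

```python
# Standalone sketch of the new flags: both default to True and are only
# consulted by the queries-v2 extractor.
from pydantic import BaseModel, Field


class QueriesV2Flags(BaseModel):
    use_queries_v2: bool = Field(default=False)
    include_queries: bool = Field(default=True)
    include_query_usage_statistics: bool = Field(default=True)


cfg = QueriesV2Flags(use_queries_v2=True, include_query_usage_statistics=False)
print(cfg.include_queries)                 # True  (query entities still emitted)
print(cfg.include_query_usage_statistics)  # False (popularity stats disabled)
```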
datahub/ingestion/source/datahub/config.py
@@ -1,6 +1,7 @@
 import os
 from typing import Optional, Set
 
+import pydantic
 from pydantic import Field, root_validator
 
 from datahub.configuration.common import AllowDenyPattern
@@ -119,3 +120,12 @@ class DataHubSourceConfig(StatefulIngestionConfigBase):
                 " Please specify at least one of `database_connection` or `kafka_connection`, ideally both."
             )
         return values
+
+    @pydantic.validator("database_connection")
+    def validate_mysql_scheme(
+        cls, v: SQLAlchemyConnectionConfig
+    ) -> SQLAlchemyConnectionConfig:
+        if "mysql" in v.scheme:
+            if v.scheme != "mysql+pymysql":
+                raise ValueError("For MySQL, the scheme must be mysql+pymysql.")
+        return v
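
The validator rejects any MySQL-family scheme that is not exactly `mysql+pymysql`, so the source always goes through the PyMySQL driver. A self-contained sketch of the same check on a hypothetical model, using a pydantic v1-style validator as the diff does:

```python
# Hypothetical ConnectionConfig (not the real SQLAlchemyConnectionConfig)
# demonstrating the scheme validation.
import pydantic


class ConnectionConfig(pydantic.BaseModel):
    scheme: str = "mysql+pymysql"

    @pydantic.validator("scheme")
    def validate_mysql_scheme(cls, v: str) -> str:
        if "mysql" in v and v != "mysql+pymysql":
            raise ValueError("For MySQL, the scheme must be mysql+pymysql.")
        return v


ConnectionConfig(scheme="mysql+pymysql")  # accepted
try:
    ConnectionConfig(scheme="mysql")      # rejected: wrong driver scheme
except pydantic.ValidationError as err:
    print(err)
```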
datahub/ingestion/source/datahub/datahub_database_reader.py
@@ -151,8 +151,10 @@ class DataHubDatabaseReader:
         self, query: str, params: Dict[str, Any]
     ) -> Iterable[Dict[str, Any]]:
         with self.engine.connect() as conn:
-            if self.engine.dialect.name == "postgresql":
+            if self.engine.dialect.name in ["postgresql", "mysql", "mariadb"]:
                 with conn.begin():  # Transaction required for PostgreSQL server-side cursor
+                    # Note that stream_results=True is mainly supported by PostgreSQL and MySQL-based dialects.
+                    # https://docs.sqlalchemy.org/en/14/core/connections.html#sqlalchemy.engine.Connection.execution_options.params.stream_results
                     conn = conn.execution_options(
                         stream_results=True,
                         yield_per=self.config.database_query_batch_size,
@@ -160,22 +162,6 @@ class DataHubDatabaseReader:
                     result = conn.execute(query, params)
                     for row in result:
                         yield dict(row)
-            elif self.engine.dialect.name == "mysql":  # MySQL
-                import MySQLdb
-
-                with contextlib.closing(
-                    conn.connection.cursor(MySQLdb.cursors.SSCursor)
-                ) as cursor:
-                    logger.debug(f"Using Cursor type: {cursor.__class__.__name__}")
-                    cursor.execute(query, params)
-
-                    columns = [desc[0] for desc in cursor.description]
-                    while True:
-                        rows = cursor.fetchmany(self.config.database_query_batch_size)
-                        if not rows:
-                            break  # Use break instead of return in generator
-                        for row in rows:
-                            yield dict(zip(columns, row))
             else:
                 raise ValueError(f"Unsupported dialect: {self.engine.dialect.name}")
 
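With the MySQLdb-specific server-side cursor removed, PostgreSQL, MySQL, and MariaDB all share SQLAlchemy's `stream_results` path. A rough sketch of that streaming pattern, assuming SQLAlchemy 1.4.40+ (where `yield_per` is accepted as an execution option) and using a placeholder connection URL and query:

```python
# Rough sketch of server-side streaming with SQLAlchemy, mirroring the
# stream_results/yield_per pattern above. DB_URL and the query are placeholders.
from sqlalchemy import create_engine, text

DB_URL = "postgresql+psycopg2://user:pass@localhost/datahub"  # placeholder

engine = create_engine(DB_URL)
with engine.connect() as conn:
    with conn.begin():  # PostgreSQL needs a transaction for a server-side cursor
        conn = conn.execution_options(stream_results=True, yield_per=2000)
        result = conn.execute(text("SELECT urn, version FROM metadata_aspect_v2"))
        for row in result:
            print(row._mapping["urn"])  # rows arrive in batches of 2000
```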
datahub/ingestion/source/datahub/datahub_source.py
@@ -130,7 +130,7 @@ class DataHubSource(StatefulIngestionSourceBase):
             self._commit_progress(i)
 
     def _get_kafka_workunits(
-        self, from_offsets: Dict[int, int], soft_deleted_urns: List[str] = []
+        self, from_offsets: Dict[int, int], soft_deleted_urns: List[str]
     ) -> Iterable[MetadataWorkUnit]:
         if self.config.kafka_connection is None:
             return
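
Dropping the `= []` default avoids Python's shared-mutable-default pitfall; callers must now pass the list explicitly. A two-call demonstration of why that default is risky:

```python
# A mutable default is created once at function definition time and shared
# across calls, so state leaks between invocations.
from typing import List


def collect(urns: List[str] = []) -> List[str]:  # anti-pattern
    urns.append("urn:li:dataset:example")
    return urns


print(collect())  # ['urn:li:dataset:example']
print(collect())  # ['urn:li:dataset:example', 'urn:li:dataset:example'] -- shared list
```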
datahub/ingestion/source/gc/soft_deleted_entity_cleanup.py
@@ -19,8 +19,8 @@ from datahub.utilities.urns._urn_base import Urn
 
 logger = logging.getLogger(__name__)
 
-QUERY_QUERY_ENTITY = """
-query listQueries($input: ScrollAcrossEntitiesInput!) {
+QUERY_ENTITIES = """
+query listEntities($input: ScrollAcrossEntitiesInput!) {
   scrollAcrossEntities(input: $input) {
     nextScrollId
     count
@@ -29,6 +29,9 @@ query listQueries($input: ScrollAcrossEntitiesInput!) {
         ... on QueryEntity {
           urn
         }
+        ... on DataProcessInstance {
+          urn
+        }
       }
     }
   }
@@ -225,16 +228,16 @@ class SoftDeletedEntitiesCleanup:
                 time.sleep(self.config.delay)
         return futures
 
-    def _get_soft_deleted_queries(self) -> Iterable[str]:
+    def _get_soft_deleted(self, graphql_query: str, entity_type: str) -> Iterable[str]:
         assert self.ctx.graph
         scroll_id: Optional[str] = None
         while True:
             try:
                 result = self.ctx.graph.execute_graphql(
-                    QUERY_QUERY_ENTITY,
+                    graphql_query,
                     {
                         "input": {
-                            "types": ["QUERY"],
+                            "types": [entity_type],
                             "query": "*",
                             "scrollId": scroll_id if scroll_id else None,
                             "count": self.config.batch_size,
@@ -254,7 +257,7 @@ class SoftDeletedEntitiesCleanup:
                 )
             except Exception as e:
                 self.report.failure(
-                    f"While trying to get queries with {scroll_id}", exc=e
+                    f"While trying to get {entity_type} with {scroll_id}", exc=e
                 )
                 break
             scroll_across_entities = result.get("scrollAcrossEntities")
@@ -275,7 +278,8 @@ class SoftDeletedEntitiesCleanup:
             status=RemovedStatusFilter.ONLY_SOFT_DELETED,
             batch_size=self.config.batch_size,
         )
-        yield from self._get_soft_deleted_queries()
+        yield from self._get_soft_deleted(QUERY_ENTITIES, "QUERY")
+        yield from self._get_soft_deleted(QUERY_ENTITIES, "DATA_PROCESS_INSTANCE")
 
     def _times_up(self) -> bool:
         if (
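
Together these hunks generalize the query-only cleanup into a scroll over an arbitrary entity type, which is now run for both `QUERY` and `DATA_PROCESS_INSTANCE`. A simplified sketch of the scroll loop; the `execute_graphql` callable and the `searchResults`/`entity`/`urn` field names are assumptions based on DataHub's `scrollAcrossEntities` response shape, and the real code additionally scopes the search to soft-deleted entities and reports failures:

```python
# Simplified pagination over scrollAcrossEntities: keep requesting pages until
# nextScrollId comes back empty.
from typing import Callable, Dict, Iterable, Optional


def iter_entity_urns(
    execute_graphql: Callable[[str, Dict], Dict],
    graphql_query: str,
    entity_type: str,
    batch_size: int = 1000,
) -> Iterable[str]:
    scroll_id: Optional[str] = None
    while True:
        result = execute_graphql(
            graphql_query,
            {
                "input": {
                    "types": [entity_type],
                    "query": "*",
                    "scrollId": scroll_id,
                    "count": batch_size,
                }
            },
        )
        page = result.get("scrollAcrossEntities") or {}
        for hit in page.get("searchResults", []):
            yield hit["entity"]["urn"]
        scroll_id = page.get("nextScrollId")
        if not scroll_id:  # no more pages
            break


# e.g. iter_entity_urns(graph.execute_graphql, QUERY_ENTITIES, "QUERY")
```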
datahub/ingestion/source/snowflake/snowflake_config.py
@@ -221,6 +221,14 @@ class SnowflakeV2Config(
         default=False,
         description="If enabled, uses the new queries extractor to extract queries from snowflake.",
     )
+    include_queries: bool = Field(
+        default=True,
+        description="If enabled, generate query entities associated with lineage edges. Only applicable if `use_queries_v2` is enabled.",
+    )
+    include_query_usage_statistics: bool = Field(
+        default=True,
+        description="If enabled, generate query popularity statistics. Only applicable if `use_queries_v2` is enabled.",
+    )
 
     lazy_schema_resolver: bool = Field(
         default=True,
datahub/ingestion/source/snowflake/snowflake_lineage_v2.py
@@ -40,6 +40,7 @@ from datahub.sql_parsing.sqlglot_lineage import (
     ColumnRef,
     DownstreamColumnRef,
 )
+from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint
 from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.time import ts_millis_to_datetime
 
@@ -239,6 +240,9 @@ class SnowflakeLineageExtractor(SnowflakeCommonMixin, Closeable):
         downstream_table_urn = self.identifiers.gen_dataset_urn(dataset_name)
 
         known_lineage = KnownQueryLineageInfo(
+            query_id=get_query_fingerprint(
+                query.query_text, self.identifiers.platform, fast=True
+            ),
             query_text=query.query_text,
             downstream=downstream_table_urn,
             upstreams=self.map_query_result_upstreams(
datahub/ingestion/source/snowflake/snowflake_v2.py
@@ -528,6 +528,8 @@ class SnowflakeV2Source(
                 include_lineage=self.config.include_table_lineage,
                 include_usage_statistics=self.config.include_usage_stats,
                 include_operations=self.config.include_operational_stats,
+                include_queries=self.config.include_queries,
+                include_query_usage_statistics=self.config.include_query_usage_statistics,
                 user_email_pattern=self.config.user_email_pattern,
             ),
             structured_report=self.report,
datahub/ingestion/source/unity/source.py
@@ -26,9 +26,6 @@ from datahub.emitter.mcp_builder import (
     gen_containers,
 )
 from datahub.emitter.sql_parsing_builder import SqlParsingBuilder
-from datahub.ingestion.api.auto_work_units.auto_ensure_aspect_size import (
-    EnsureAspectSizeProcessor,
-)
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
@@ -263,7 +260,6 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
             StaleEntityRemovalHandler.create(
                 self, self.config, self.ctx
            ).workunit_processor,
-            EnsureAspectSizeProcessor(self.get_report()).ensure_aspect_size,
         ]
 
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
datahub/sql_parsing/sql_parsing_aggregator.py
@@ -165,6 +165,7 @@ class KnownQueryLineageInfo:
     timestamp: Optional[datetime] = None
     session_id: Optional[str] = None
     query_type: QueryType = QueryType.UNKNOWN
+    query_id: Optional[str] = None
 
 
 @dataclasses.dataclass
@@ -618,11 +619,13 @@ class SqlParsingAggregator(Closeable):
         self.report.num_known_query_lineage += 1
 
         # Generate a fingerprint for the query.
-        with self.report.sql_fingerprinting_timer:
-            query_fingerprint = get_query_fingerprint(
-                known_query_lineage.query_text,
-                platform=self.platform.platform_name,
-            )
+        query_fingerprint = known_query_lineage.query_id
+        if not query_fingerprint:
+            with self.report.sql_fingerprinting_timer:
+                query_fingerprint = get_query_fingerprint(
+                    known_query_lineage.query_text,
+                    platform=self.platform.platform_name,
+                )
         formatted_query = self._maybe_format_query(known_query_lineage.query_text)
 
         # Register the query.
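
The aggregator now prefers a caller-supplied `query_id` (such as the `fast=True` fingerprint computed in the Snowflake lineage hunk above) and only fingerprints the SQL itself as a fallback. A small sketch of that precedence; the helper function is illustrative, while `get_query_fingerprint` is the function imported in the diff:

```python
# Precedence sketch: a precomputed query_id wins; otherwise compute a
# fingerprint from the query text.
from typing import Optional

from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint


def resolve_fingerprint(
    query_text: str, platform: str, query_id: Optional[str] = None
) -> str:
    if query_id:
        return query_id  # e.g. the fast=True fingerprint computed upstream
    return get_query_fingerprint(query_text, platform=platform)


print(resolve_fingerprint("SELECT 1", platform="snowflake"))
print(resolve_fingerprint("SELECT 1", platform="snowflake", query_id="abc123"))
```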