acryl-datahub 0.14.1.13rc4__py3-none-any.whl → 0.14.1.13rc6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of acryl-datahub might be problematic.
- {acryl_datahub-0.14.1.13rc4.dist-info → acryl_datahub-0.14.1.13rc6.dist-info}/METADATA +2557 -2557
- {acryl_datahub-0.14.1.13rc4.dist-info → acryl_datahub-0.14.1.13rc6.dist-info}/RECORD +34 -30
- datahub/__init__.py +1 -1
- datahub/configuration/kafka_consumer_config.py +4 -1
- datahub/ingestion/source/bigquery_v2/bigquery_report.py +2 -2
- datahub/ingestion/source/bigquery_v2/bigquery_schema.py +35 -12
- datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +12 -11
- datahub/ingestion/source/dremio/dremio_reporting.py +2 -2
- datahub/ingestion/source/ge_data_profiler.py +1 -1
- datahub/ingestion/source/ge_profiling_config.py +6 -2
- datahub/ingestion/source/redshift/report.py +2 -2
- datahub/ingestion/source/snowflake/snowflake_report.py +2 -2
- datahub/ingestion/source/sql/mssql/job_models.py +1 -0
- datahub/ingestion/source/sql/mssql/source.py +113 -38
- datahub/ingestion/source/sql/mssql/stored_procedure_lineage.py +84 -0
- datahub/ingestion/source/sql/oracle.py +50 -0
- datahub/ingestion/source/sql/sql_common.py +28 -54
- datahub/ingestion/source/sql/sql_generic_profiler.py +3 -32
- datahub/ingestion/source/sql/sql_report.py +75 -0
- datahub/ingestion/source/sql/teradata.py +2 -2
- datahub/ingestion/source/sql/vertica.py +2 -2
- datahub/ingestion/source/unity/report.py +2 -2
- datahub/metadata/schema.avsc +1 -1
- datahub/metadata/schemas/AssertionInfo.avsc +1 -1
- datahub/metadata/schemas/InputFields.avsc +1 -1
- datahub/metadata/schemas/MetadataChangeEvent.avsc +1 -1
- datahub/metadata/schemas/SchemaMetadata.avsc +1 -1
- datahub/sql_parsing/datajob.py +50 -0
- datahub/sql_parsing/query_types.py +10 -1
- datahub/sql_parsing/split_statements.py +163 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +0 -1
- {acryl_datahub-0.14.1.13rc4.dist-info → acryl_datahub-0.14.1.13rc6.dist-info}/WHEEL +0 -0
- {acryl_datahub-0.14.1.13rc4.dist-info → acryl_datahub-0.14.1.13rc6.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-0.14.1.13rc4.dist-info → acryl_datahub-0.14.1.13rc6.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/sql/mssql/source.py:

@@ -24,6 +24,8 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
+from datahub.ingestion.api.source import StructuredLogLevel
+from datahub.ingestion.api.source_helpers import auto_workunit
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.sql.mssql.job_models import (
     JobStep,
@@ -36,6 +38,9 @@ from datahub.ingestion.source.sql.mssql.job_models import (
     ProcedureParameter,
     StoredProcedure,
 )
+from datahub.ingestion.source.sql.mssql.stored_procedure_lineage import (
+    generate_procedure_lineage,
+)
 from datahub.ingestion.source.sql.sql_common import (
     SQLAlchemySource,
     SqlWorkUnit,
@@ -51,6 +56,7 @@ from datahub.metadata.schema_classes import (
     StringTypeClass,
     UnionTypeClass,
 )
+from datahub.utilities.file_backed_collections import FileBackedList

 logger: logging.Logger = logging.getLogger(__name__)

@@ -99,6 +105,10 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
         default=False,
         description="Enable to convert the SQL Server assets urns to lowercase",
     )
+    include_lineage: bool = Field(
+        default=True,
+        description="Enable lineage extraction for stored procedures",
+    )

     @pydantic.validator("uri_args")
     def passwords_match(cls, v, values, **kwargs):
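The new `include_lineage` flag defaults to `True`, so stored-procedure lineage is extracted unless it is switched off. Below is a minimal sketch of toggling it from a programmatic ingestion recipe; the connection values are placeholders, and the dict-based `Pipeline.create` API plus the `console` sink are assumed to be available as in other DataHub sources.

```python
# Hypothetical recipe: all connection values are placeholders.
from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "mssql",
            "config": {
                "host_port": "localhost:1433",
                "database": "DemoDB",
                "username": "datahub",
                "password": "example-password",
                # New in this release; set to False to skip stored-procedure
                # lineage extraction (it defaults to True).
                "include_lineage": False,
            },
        },
        "sink": {"type": "console"},
    }
)
pipeline.run()
```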
datahub/ingestion/source/sql/mssql/source.py (continued):

@@ -161,6 +171,7 @@ class SQLServerSource(SQLAlchemySource):
         self.current_database = None
         self.table_descriptions: Dict[str, str] = {}
         self.column_descriptions: Dict[str, str] = {}
+        self.stored_procedures: FileBackedList[StoredProcedure] = FileBackedList()
         if self.config.include_descriptions:
             for inspector in self.get_inspectors():
                 db_name: str = self.get_db_name(inspector)
@@ -374,7 +385,7 @@ class SQLServerSource(SQLAlchemySource):
     def loop_job_steps(
         self, job: MSSQLJob, job_steps: Dict[str, Any]
     ) -> Iterable[MetadataWorkUnit]:
-        for
+        for _step_id, step_data in job_steps.items():
             step = JobStep(
                 job_name=job.formatted_name,
                 step_name=step_data["step_name"],
@@ -412,37 +423,44 @@ class SQLServerSource(SQLAlchemySource):
             if procedures:
                 yield from self.construct_flow_workunits(data_flow=data_flow)
             for procedure in procedures:
-                … (31 removed lines; their content was not captured in this diff view)
+                yield from self._process_stored_procedure(conn, procedure)
+
+    def _process_stored_procedure(
+        self, conn: Connection, procedure: StoredProcedure
+    ) -> Iterable[MetadataWorkUnit]:
+        upstream = self._get_procedure_upstream(conn, procedure)
+        downstream = self._get_procedure_downstream(conn, procedure)
+        data_job = MSSQLDataJob(
+            entity=procedure,
+        )
+        # TODO: because of this upstream and downstream are more dependencies,
+        # can't be used as DataJobInputOutput.
+        # Should be reorganized into lineage.
+        data_job.add_property("procedure_depends_on", str(upstream.as_property))
+        data_job.add_property("depending_on_procedure", str(downstream.as_property))
+        procedure_definition, procedure_code = self._get_procedure_code(conn, procedure)
+        procedure.code = procedure_code
+        if procedure_definition:
+            data_job.add_property("definition", procedure_definition)
+        if procedure_code and self.config.include_stored_procedures_code:
+            data_job.add_property("code", procedure_code)
+        procedure_inputs = self._get_procedure_inputs(conn, procedure)
+        properties = self._get_procedure_properties(conn, procedure)
+        data_job.add_property(
+            "input parameters", str([param.name for param in procedure_inputs])
+        )
+        for param in procedure_inputs:
+            data_job.add_property(f"parameter {param.name}", str(param.properties))
+        for property_name, property_value in properties.items():
+            data_job.add_property(property_name, str(property_value))
+        if self.config.include_lineage:
+            # These will be used to construct lineage
+            self.stored_procedures.append(procedure)
+        yield from self.construct_job_workunits(
+            data_job,
+            # For stored procedure lineage is ingested later
+            include_lineage=False,
+        )

     @staticmethod
     def _get_procedure_downstream(
@@ -546,8 +564,8 @@ class SQLServerSource(SQLAlchemySource):
                 code_list.append(row["Text"])
                 if code_slice_text in re.sub(" +", " ", row["Text"].lower()).strip():
                     code_slice_index = index
-            definition = "
-            code = "
+            definition = "".join(code_list[:code_slice_index])
+            code = "".join(code_list[code_slice_index:])
         except ResourceClosedError:
             logger.warning(
                 "Connection was closed from procedure '%s'",
@@ -602,16 +620,18 @@ class SQLServerSource(SQLAlchemySource):
     def construct_job_workunits(
         self,
         data_job: MSSQLDataJob,
+        include_lineage: bool = True,
     ) -> Iterable[MetadataWorkUnit]:
         yield MetadataChangeProposalWrapper(
             entityUrn=data_job.urn,
             aspect=data_job.as_datajob_info_aspect,
         ).as_workunit()

-        … (4 removed lines; their content was not captured in this diff view)
+        if include_lineage:
+            yield MetadataChangeProposalWrapper(
+                entityUrn=data_job.urn,
+                aspect=data_job.as_datajob_input_output_aspect,
+            ).as_workunit()
         # TODO: Add SubType when it appear

     def construct_flow_workunits(
@@ -664,3 +684,58 @@ class SQLServerSource(SQLAlchemySource):
             if self.config.convert_urns_to_lowercase
             else qualified_table_name
         )
+
+    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        yield from super().get_workunits_internal()
+
+        # This is done at the end so that we will have access to tables
+        # from all databases in schema_resolver and discovered_tables
+        for procedure in self.stored_procedures:
+            with self.report.report_exc(
+                message="Failed to parse stored procedure lineage",
+                context=procedure.full_name,
+                level=StructuredLogLevel.WARN,
+            ):
+                yield from auto_workunit(
+                    generate_procedure_lineage(
+                        schema_resolver=self.schema_resolver,
+                        procedure=procedure,
+                        procedure_job_urn=MSSQLDataJob(entity=procedure).urn,
+                        is_temp_table=self.is_temp_table,
+                    )
+                )
+
+    def is_temp_table(self, name: str) -> bool:
+        try:
+            parts = name.split(".")
+            table_name = parts[-1]
+            schema_name = parts[-2]
+            db_name = parts[-3]
+
+            if table_name.startswith("#"):
+                return True
+
+            # This is also a temp table if
+            # 1. this name would be allowed by the dataset patterns, and
+            # 2. we have a list of discovered tables, and
+            # 3. it's not in the discovered tables list
+            if (
+                self.config.database_pattern.allowed(db_name)
+                and self.config.schema_pattern.allowed(schema_name)
+                and self.config.table_pattern.allowed(name)
+                and self.standardize_identifier_case(name)
+                not in self.discovered_datasets
+            ):
+                logger.debug(f"inferred as temp table {name}")
+                return True
+
+        except Exception:
+            logger.warning(f"Error parsing table name {name} ")
+        return False
+
+    def standardize_identifier_case(self, table_ref_str: str) -> str:
+        return (
+            table_ref_str.lower()
+            if self.config.convert_urns_to_lowercase
+            else table_ref_str
+        )
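The `is_temp_table` heuristic above treats `#`-prefixed names, and names that would pass the configured patterns but were never discovered, as temporary. A rough standalone sketch of the same idea, with made-up names and a hard-coded discovered set standing in for the source's config patterns:

```python
# Standalone illustration only; the real implementation consults the source's
# database/schema/table patterns and its discovered_datasets set.
discovered = {"demodb.dbo.sales", "demodb.dbo.sales_summary"}

def looks_like_temp_table(name: str) -> bool:
    table = name.split(".")[-1]
    if table.startswith("#"):               # classic local temp table
        return True
    return name.lower() not in discovered   # allowed name, but never ingested

print(looks_like_temp_table("DemoDB.dbo.#staging"))       # True
print(looks_like_temp_table("DemoDB.dbo.sales"))          # False
print(looks_like_temp_table("DemoDB.dbo.tmp_load_step"))  # True
```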
datahub/ingestion/source/sql/mssql/stored_procedure_lineage.py (new file):

@@ -0,0 +1,84 @@
+import logging
+from typing import Callable, Iterable, Optional
+
+from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.ingestion.source.sql.mssql.job_models import StoredProcedure
+from datahub.metadata.schema_classes import DataJobInputOutputClass
+from datahub.sql_parsing.datajob import to_datajob_input_output
+from datahub.sql_parsing.schema_resolver import SchemaResolver
+from datahub.sql_parsing.split_statements import split_statements
+from datahub.sql_parsing.sql_parsing_aggregator import (
+    ObservedQuery,
+    SqlParsingAggregator,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def parse_procedure_code(
+    *,
+    schema_resolver: SchemaResolver,
+    default_db: Optional[str],
+    default_schema: Optional[str],
+    code: str,
+    is_temp_table: Callable[[str], bool],
+    raise_: bool = False,
+) -> Optional[DataJobInputOutputClass]:
+    aggregator = SqlParsingAggregator(
+        platform=schema_resolver.platform,
+        env=schema_resolver.env,
+        schema_resolver=schema_resolver,
+        generate_lineage=True,
+        generate_queries=False,
+        generate_usage_statistics=False,
+        generate_operations=False,
+        generate_query_subject_fields=False,
+        generate_query_usage_statistics=False,
+        is_temp_table=is_temp_table,
+    )
+    for query in split_statements(code):
+        # TODO: We should take into account `USE x` statements.
+        aggregator.add_observed_query(
+            observed=ObservedQuery(
+                default_db=default_db,
+                default_schema=default_schema,
+                query=query,
+            )
+        )
+    if aggregator.report.num_observed_queries_failed and raise_:
+        logger.info(aggregator.report.as_string())
+        raise ValueError(
+            f"Failed to parse {aggregator.report.num_observed_queries_failed} queries."
+        )
+
+    mcps = list(aggregator.gen_metadata())
+    return to_datajob_input_output(
+        mcps=mcps,
+        ignore_extra_mcps=True,
+    )
+
+
+# Is procedure handling generic enough to be added to SqlParsingAggregator?
+def generate_procedure_lineage(
+    *,
+    schema_resolver: SchemaResolver,
+    procedure: StoredProcedure,
+    procedure_job_urn: str,
+    is_temp_table: Callable[[str], bool] = lambda _: False,
+    raise_: bool = False,
+) -> Iterable[MetadataChangeProposalWrapper]:
+    if procedure.code:
+        datajob_input_output = parse_procedure_code(
+            schema_resolver=schema_resolver,
+            default_db=procedure.db,
+            default_schema=procedure.schema,
+            code=procedure.code,
+            is_temp_table=is_temp_table,
+            raise_=raise_,
+        )
+
+        if datajob_input_output:
+            yield MetadataChangeProposalWrapper(
+                entityUrn=procedure_job_urn,
+                aspect=datajob_input_output,
+            )
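For a sense of how this module is meant to be driven, here is a minimal, hypothetical call to `parse_procedure_code` with an offline `SchemaResolver` and a one-statement procedure body; the table names are invented, and the resulting lineage may be empty if the parser cannot resolve them.

```python
# Illustrative only: a single INSERT ... SELECT statement as the procedure body.
from datahub.ingestion.source.sql.mssql.stored_procedure_lineage import (
    parse_procedure_code,
)
from datahub.sql_parsing.schema_resolver import SchemaResolver

resolver = SchemaResolver(platform="mssql", env="PROD")  # no live schema lookups

io_aspect = parse_procedure_code(
    schema_resolver=resolver,
    default_db="DemoDB",
    default_schema="dbo",
    code=(
        "INSERT INTO dbo.sales_summary "
        "SELECT region, SUM(amount) AS amount FROM dbo.sales GROUP BY region"
    ),
    is_temp_table=lambda name: name.split(".")[-1].startswith("#"),
)
if io_aspect:
    print(io_aspect.inputDatasets)   # upstream dataset URNs
    print(io_aspect.outputDatasets)  # downstream dataset URNs
```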
datahub/ingestion/source/sql/oracle.py:

@@ -1,3 +1,4 @@
+import datetime
 import logging
 import re

@@ -631,3 +632,52 @@ class OracleSource(SQLAlchemySource):
             clear=False,
         ):
             return super().get_workunits()
+
+    def generate_profile_candidates(
+        self,
+        inspector: Inspector,
+        threshold_time: Optional[datetime.datetime],
+        schema: str,
+    ) -> Optional[List[str]]:
+        tables_table_name = (
+            "ALL_TABLES" if self.config.data_dictionary_mode == "ALL" else "DBA_TABLES"
+        )
+
+        # If stats are available , they are used even if they are stale.
+        # Assuming that the table would typically grow over time, this will ensure to filter
+        # large tables known at stats collection time from profiling candidates.
+        # If stats are not available (NULL), such tables are not filtered and are considered
+        # as profiling candidates.
+        cursor = inspector.bind.execute(
+            sql.text(
+                f"""SELECT
+                    t.OWNER,
+                    t.TABLE_NAME,
+                    t.NUM_ROWS,
+                    t.LAST_ANALYZED,
+                    COALESCE(t.NUM_ROWS * t.AVG_ROW_LEN, 0) / (1024 * 1024 * 1024) AS SIZE_GB
+                FROM {tables_table_name} t
+                WHERE t.OWNER = :owner
+                AND (t.NUM_ROWS < :table_row_limit OR t.NUM_ROWS IS NULL)
+                AND COALESCE(t.NUM_ROWS * t.AVG_ROW_LEN, 0) / (1024 * 1024 * 1024) < :table_size_limit
+                """
+            ),
+            dict(
+                owner=inspector.dialect.denormalize_name(schema),
+                table_row_limit=self.config.profiling.profile_table_row_limit,
+                table_size_limit=self.config.profiling.profile_table_size_limit,
+            ),
+        )
+
+        TABLE_NAME_COL_LOC = 1
+        return [
+            self.get_identifier(
+                schema=schema,
+                entity=inspector.dialect.normalize_name(row[TABLE_NAME_COL_LOC])
+                or _raise_err(
+                    ValueError(f"Invalid table name: {row[TABLE_NAME_COL_LOC]}")
+                ),
+                inspector=inspector,
+            )
+            for row in cursor
+        ]
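The size filter in the query relies purely on optimizer statistics. A quick worked example of the estimate it computes (the row count and average row length below are made-up numbers):

```python
# Same arithmetic as the SIZE_GB expression above, with illustrative statistics.
num_rows = 20_000_000   # t.NUM_ROWS from ALL_TABLES / DBA_TABLES
avg_row_len = 512       # t.AVG_ROW_LEN, in bytes

size_gb = (num_rows * avg_row_len) / (1024 * 1024 * 1024)
print(f"estimated size: {size_gb:.2f} GB")  # ~9.54 GB

# A table whose statistics are NULL contributes COALESCE(..., 0) = 0 GB,
# so it always passes the size filter and stays a profiling candidate.
```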
datahub/ingestion/source/sql/sql_common.py:

@@ -51,7 +51,6 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.glossary.classification_mixin import (
     SAMPLE_SIZE_MULTIPLIER,
     ClassificationHandler,
-    ClassificationReportMixin,
 )
 from datahub.ingestion.source.common.data_reader import DataReader
 from datahub.ingestion.source.common.subtypes import (
@@ -59,6 +58,7 @@ from datahub.ingestion.source.common.subtypes import (
     DatasetSubTypes,
 )
 from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
+from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.sql.sql_utils import (
     add_table_to_schema_container,
     downgrade_schema_from_v2,
@@ -74,7 +74,6 @@ from datahub.ingestion.source.sql.sqlalchemy_data_reader import (
 )
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
-    StaleEntityRemovalSourceReport,
 )
 from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
@@ -118,9 +117,7 @@ from datahub.sql_parsing.sqlglot_lineage import (
 )
 from datahub.telemetry import telemetry
 from datahub.utilities.file_backed_collections import FileBackedDict
-from datahub.utilities.lossy_collections import LossyList
 from datahub.utilities.registries.domain_registry import DomainRegistry
-from datahub.utilities.sqlalchemy_query_combiner import SQLAlchemyQueryCombinerReport
 from datahub.utilities.sqlalchemy_type_converter import (
     get_native_data_type_for_sqlalchemy_type,
 )
@@ -134,43 +131,6 @@ if TYPE_CHECKING:
 logger: logging.Logger = logging.getLogger(__name__)


-@dataclass
-class SQLSourceReport(StaleEntityRemovalSourceReport, ClassificationReportMixin):
-    tables_scanned: int = 0
-    views_scanned: int = 0
-    entities_profiled: int = 0
-    filtered: LossyList[str] = field(default_factory=LossyList)
-
-    query_combiner: Optional[SQLAlchemyQueryCombinerReport] = None
-
-    num_view_definitions_parsed: int = 0
-    num_view_definitions_failed_parsing: int = 0
-    num_view_definitions_failed_column_parsing: int = 0
-    view_definitions_parsing_failures: LossyList[str] = field(default_factory=LossyList)
-
-    def report_entity_scanned(self, name: str, ent_type: str = "table") -> None:
-        """
-        Entity could be a view or a table
-        """
-        if ent_type == "table":
-            self.tables_scanned += 1
-        elif ent_type == "view":
-            self.views_scanned += 1
-        else:
-            raise KeyError(f"Unknown entity {ent_type}.")
-
-    def report_entity_profiled(self, name: str) -> None:
-        self.entities_profiled += 1
-
-    def report_dropped(self, ent_name: str) -> None:
-        self.filtered.append(ent_name)
-
-    def report_from_query_combiner(
-        self, query_combiner_report: SQLAlchemyQueryCombinerReport
-    ) -> None:
-        self.query_combiner = query_combiner_report
-
-
 class SqlWorkUnit(MetadataWorkUnit):
     pass

@@ -352,7 +312,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):

     def __init__(self, config: SQLCommonConfig, ctx: PipelineContext, platform: str):
         super().__init__(config, ctx)
-        self.config = config
+        self.config: SQLCommonConfig = config
         self.platform = platform
         self.report: SQLSourceReport = SQLSourceReport()
         self.profile_metadata_info: ProfileMetadata = ProfileMetadata()
@@ -392,6 +352,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             platform_instance=self.config.platform_instance,
             env=self.config.env,
         )
+        self.discovered_datasets: Set[str] = set()
         self._view_definition_cache: MutableMapping[str, str]
         if self.config.use_file_backed_cache:
             self._view_definition_cache = FileBackedDict[str]()
@@ -831,8 +792,9 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
         self._classify(dataset_name, schema, table, data_reader, schema_metadata)

         dataset_snapshot.aspects.append(schema_metadata)
-        if self.
+        if self._save_schema_to_resolver():
             self.schema_resolver.add_schema_metadata(dataset_urn, schema_metadata)
+            self.discovered_datasets.add(dataset_name)
         db_name = self.get_db_name(inspector)

         yield from self.add_table_to_schema_container(
@@ -1126,8 +1088,9 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             columns,
             canonical_schema=schema_fields,
         )
-        if self.
+        if self._save_schema_to_resolver():
             self.schema_resolver.add_schema_metadata(dataset_urn, schema_metadata)
+            self.discovered_datasets.add(dataset_name)
         description, properties, _ = self.get_table_properties(inspector, schema, view)
         try:
             view_definition = inspector.get_view_definition(view, schema)
@@ -1190,6 +1153,11 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
             domain_registry=self.domain_registry,
         )

+    def _save_schema_to_resolver(self):
+        return self.config.include_view_lineage or (
+            hasattr(self.config, "include_lineage") and self.config.include_lineage
+        )
+
     def _run_sql_parser(
         self, view_identifier: str, query: str, schema_resolver: SchemaResolver
     ) -> Optional[SqlParsingResult]:
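The `hasattr` guard keeps `_save_schema_to_resolver` safe for source configs that do not define `include_lineage` (in this diff it is only added to the SQL Server config). A behaviourally equivalent spelling, shown purely for illustration, would use `getattr` with a default:

```python
# Not part of the diff - an equivalent formulation of the same check.
def _save_schema_to_resolver(self) -> bool:
    return self.config.include_view_lineage or getattr(
        self.config, "include_lineage", False
    )
```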
datahub/ingestion/source/sql/sql_common.py (continued):

@@ -1274,17 +1242,22 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
     def is_dataset_eligible_for_profiling(
         self,
         dataset_name: str,
-        … (1 removed line; content not captured in this diff view)
+        schema: str,
         inspector: Inspector,
         profile_candidates: Optional[List[str]],
     ) -> bool:
-        … (7 removed lines; only fragments were captured in this diff view)
+        if not (
+            self.config.table_pattern.allowed(dataset_name)
+            and self.config.profile_pattern.allowed(dataset_name)
+        ):
+            self.report.profiling_skipped_table_profile_pattern[schema] += 1
+            return False
+
+        if profile_candidates is not None and dataset_name not in profile_candidates:
+            self.report.profiling_skipped_other[schema] += 1
+            return False
+
+        return True

     def loop_profiler_requests(
         self,
@@ -1299,7 +1272,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
         if (
             sql_config.profiling.profile_if_updated_since_days is not None
             or sql_config.profiling.profile_table_size_limit is not None
-            or sql_config.profiling.profile_table_row_limit is None
+            or sql_config.profiling.profile_table_row_limit is not None
         ):
             try:
                 threshold_time: Optional[datetime.datetime] = None
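The one-word fix above (`is None` becomes `is not None`) changes when profile candidates are generated. A standalone restatement of the corrected gating logic, with illustrative limit values rather than DataHub defaults:

```python
# Stand-alone sketch of the corrected condition; not code from the package.
def should_generate_profile_candidates(
    profile_if_updated_since_days, profile_table_size_limit, profile_table_row_limit
) -> bool:
    return (
        profile_if_updated_since_days is not None
        or profile_table_size_limit is not None
        or profile_table_row_limit is not None  # was `is None` before this release
    )

# Only a row limit configured: candidates are now generated.
print(should_generate_profile_candidates(None, None, 5_000_000))  # True
# Nothing configured: candidate generation is skipped.
print(should_generate_profile_candidates(None, None, None))       # False
```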
datahub/ingestion/source/sql/sql_common.py (continued):

@@ -1320,8 +1293,9 @@ class SQLAlchemySource(StatefulIngestionSourceBase, TestableSource):
                     schema=schema, entity=table, inspector=inspector
                 )
                 if not self.is_dataset_eligible_for_profiling(
-                    dataset_name,
+                    dataset_name, schema, inspector, profile_candidates
                 ):
+                    self.report.num_tables_not_eligible_profiling[schema] += 1
                     if self.config.profiling.report_dropped_profiles:
                         self.report.report_dropped(f"profile of {dataset_name}")
                     continue
datahub/ingestion/source/sql/sql_generic_profiler.py:

@@ -1,6 +1,6 @@
 import logging
 from abc import abstractmethod
-from dataclasses import dataclass
+from dataclasses import dataclass
 from datetime import datetime, timedelta, timezone
 from typing import Dict, Iterable, List, Optional, Union, cast

@@ -14,42 +14,13 @@ from datahub.ingestion.source.ge_data_profiler import (
     DatahubGEProfiler,
     GEProfilerRequest,
 )
-from datahub.ingestion.source.sql.sql_common import SQLSourceReport
 from datahub.ingestion.source.sql.sql_config import SQLCommonConfig
 from datahub.ingestion.source.sql.sql_generic import BaseTable, BaseView
+from datahub.ingestion.source.sql.sql_report import SQLSourceReport
 from datahub.ingestion.source.sql.sql_utils import check_table_with_profile_pattern
 from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import DatasetProfile
 from datahub.metadata.com.linkedin.pegasus2avro.timeseries import PartitionType
-from datahub.utilities.stats_collections import TopKDict, int_top_k_dict
-
-
-@dataclass
-class DetailedProfilerReportMixin:
-    profiling_skipped_not_updated: TopKDict[str, int] = field(
-        default_factory=int_top_k_dict
-    )
-    profiling_skipped_size_limit: TopKDict[str, int] = field(
-        default_factory=int_top_k_dict
-    )
-
-    profiling_skipped_row_limit: TopKDict[str, int] = field(
-        default_factory=int_top_k_dict
-    )
-
-    profiling_skipped_table_profile_pattern: TopKDict[str, int] = field(
-        default_factory=int_top_k_dict
-    )
-
-    profiling_skipped_other: TopKDict[str, int] = field(default_factory=int_top_k_dict)
-
-    num_tables_not_eligible_profiling: Dict[str, int] = field(
-        default_factory=int_top_k_dict
-    )
-
-
-class ProfilingSqlReport(DetailedProfilerReportMixin, SQLSourceReport):
-    pass


 @dataclass
@@ -65,7 +36,7 @@ class GenericProfiler:
     def __init__(
         self,
         config: SQLCommonConfig,
-        report:
+        report: SQLSourceReport,
         platform: str,
         state_handler: Optional[ProfilingHandler] = None,
     ) -> None:
datahub/ingestion/source/sql/sql_report.py (new file):

@@ -0,0 +1,75 @@
+from dataclasses import dataclass, field
+from typing import Dict, Optional
+
+from datahub.ingestion.glossary.classification_mixin import ClassificationReportMixin
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityRemovalSourceReport,
+)
+from datahub.utilities.lossy_collections import LossyList
+from datahub.utilities.sqlalchemy_query_combiner import SQLAlchemyQueryCombinerReport
+from datahub.utilities.stats_collections import TopKDict, int_top_k_dict
+
+
+@dataclass
+class DetailedProfilerReportMixin:
+    profiling_skipped_not_updated: TopKDict[str, int] = field(
+        default_factory=int_top_k_dict
+    )
+    profiling_skipped_size_limit: TopKDict[str, int] = field(
+        default_factory=int_top_k_dict
+    )
+
+    profiling_skipped_row_limit: TopKDict[str, int] = field(
+        default_factory=int_top_k_dict
+    )
+
+    profiling_skipped_table_profile_pattern: TopKDict[str, int] = field(
+        default_factory=int_top_k_dict
+    )
+
+    profiling_skipped_other: TopKDict[str, int] = field(default_factory=int_top_k_dict)
+
+    num_tables_not_eligible_profiling: Dict[str, int] = field(
+        default_factory=int_top_k_dict
+    )
+
+
+@dataclass
+class SQLSourceReport(
+    StaleEntityRemovalSourceReport,
+    ClassificationReportMixin,
+    DetailedProfilerReportMixin,
+):
+    tables_scanned: int = 0
+    views_scanned: int = 0
+    entities_profiled: int = 0
+    filtered: LossyList[str] = field(default_factory=LossyList)
+
+    query_combiner: Optional[SQLAlchemyQueryCombinerReport] = None
+
+    num_view_definitions_parsed: int = 0
+    num_view_definitions_failed_parsing: int = 0
+    num_view_definitions_failed_column_parsing: int = 0
+    view_definitions_parsing_failures: LossyList[str] = field(default_factory=LossyList)
+
+    def report_entity_scanned(self, name: str, ent_type: str = "table") -> None:
+        """
+        Entity could be a view or a table
+        """
+        if ent_type == "table":
+            self.tables_scanned += 1
+        elif ent_type == "view":
+            self.views_scanned += 1
+        else:
+            raise KeyError(f"Unknown entity {ent_type}.")
+
+    def report_entity_profiled(self, name: str) -> None:
+        self.entities_profiled += 1
+
+    def report_dropped(self, ent_name: str) -> None:
+        self.filtered.append(ent_name)
+
+    def report_from_query_combiner(
+        self, query_combiner_report: SQLAlchemyQueryCombinerReport
+    ) -> None:
+        self.query_combiner = query_combiner_report