acryl-datahub 1.2.0.6__py3-none-any.whl → 1.2.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/METADATA +2629 -2543
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/RECORD +83 -75
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/entry_points.txt +1 -0
- datahub/_version.py +1 -1
- datahub/api/graphql/operation.py +1 -1
- datahub/ingestion/autogenerated/capability_summary.json +46 -6
- datahub/ingestion/autogenerated/lineage.json +3 -2
- datahub/ingestion/run/pipeline.py +1 -0
- datahub/ingestion/source/aws/s3_boto_utils.py +97 -5
- datahub/ingestion/source/bigquery_v2/bigquery_connection.py +12 -1
- datahub/ingestion/source/common/subtypes.py +3 -0
- datahub/ingestion/source/data_lake_common/path_spec.py +1 -1
- datahub/ingestion/source/datahub/datahub_database_reader.py +19 -8
- datahub/ingestion/source/dbt/dbt_common.py +74 -0
- datahub/ingestion/source/dremio/dremio_aspects.py +3 -2
- datahub/ingestion/source/dremio/dremio_source.py +4 -0
- datahub/ingestion/source/dynamodb/dynamodb.py +10 -7
- datahub/ingestion/source/excel/__init__.py +0 -0
- datahub/ingestion/source/excel/config.py +92 -0
- datahub/ingestion/source/excel/excel_file.py +539 -0
- datahub/ingestion/source/excel/profiling.py +308 -0
- datahub/ingestion/source/excel/report.py +49 -0
- datahub/ingestion/source/excel/source.py +662 -0
- datahub/ingestion/source/excel/util.py +18 -0
- datahub/ingestion/source/fivetran/fivetran_query.py +8 -1
- datahub/ingestion/source/openapi.py +1 -1
- datahub/ingestion/source/powerbi/config.py +33 -0
- datahub/ingestion/source/powerbi/m_query/data_classes.py +1 -0
- datahub/ingestion/source/powerbi/m_query/pattern_handler.py +100 -10
- datahub/ingestion/source/powerbi/powerbi.py +5 -0
- datahub/ingestion/source/qlik_sense/qlik_sense.py +1 -1
- datahub/ingestion/source/redshift/config.py +9 -6
- datahub/ingestion/source/redshift/lineage.py +386 -687
- datahub/ingestion/source/redshift/redshift.py +19 -106
- datahub/ingestion/source/s3/source.py +65 -59
- datahub/ingestion/source/snowflake/constants.py +2 -0
- datahub/ingestion/source/snowflake/snowflake_config.py +10 -0
- datahub/ingestion/source/snowflake/snowflake_connection.py +16 -5
- datahub/ingestion/source/snowflake/snowflake_query.py +27 -0
- datahub/ingestion/source/snowflake/snowflake_report.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_schema.py +179 -7
- datahub/ingestion/source/snowflake/snowflake_schema_gen.py +25 -7
- datahub/ingestion/source/snowflake/snowflake_summary.py +1 -0
- datahub/ingestion/source/snowflake/snowflake_utils.py +18 -5
- datahub/ingestion/source/snowflake/snowflake_v2.py +6 -1
- datahub/ingestion/source/sql/hive_metastore.py +1 -0
- datahub/ingestion/source/sql/mssql/job_models.py +3 -1
- datahub/ingestion/source/sql/mssql/source.py +62 -3
- datahub/ingestion/source/sql_queries.py +24 -2
- datahub/ingestion/source/state/checkpoint.py +3 -28
- datahub/ingestion/source/unity/config.py +74 -9
- datahub/ingestion/source/unity/proxy.py +167 -5
- datahub/ingestion/source/unity/proxy_patch.py +321 -0
- datahub/ingestion/source/unity/proxy_types.py +24 -0
- datahub/ingestion/source/unity/report.py +5 -0
- datahub/ingestion/source/unity/source.py +111 -1
- datahub/ingestion/source/usage/usage_common.py +1 -0
- datahub/metadata/_internal_schema_classes.py +573 -517
- datahub/metadata/_urns/urn_defs.py +1748 -1748
- datahub/metadata/schema.avsc +18564 -18484
- datahub/metadata/schemas/ChartInfo.avsc +2 -1
- datahub/metadata/schemas/DataHubPageModuleProperties.avsc +9 -0
- datahub/metadata/schemas/InstitutionalMemory.avsc +9 -0
- datahub/metadata/schemas/LogicalParent.avsc +104 -100
- datahub/metadata/schemas/MetadataChangeEvent.avsc +81 -45
- datahub/metadata/schemas/Ownership.avsc +69 -0
- datahub/metadata/schemas/SchemaFieldKey.avsc +3 -1
- datahub/metadata/schemas/StructuredProperties.avsc +69 -0
- datahub/metadata/schemas/StructuredPropertyDefinition.avsc +3 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/chart.py +36 -22
- datahub/sdk/dashboard.py +38 -62
- datahub/sdk/lineage_client.py +6 -26
- datahub/sdk/main_client.py +7 -3
- datahub/sdk/search_filters.py +16 -0
- datahub/specific/aspect_helpers/siblings.py +73 -0
- datahub/specific/dataset.py +2 -0
- datahub/sql_parsing/sql_parsing_aggregator.py +3 -0
- datahub/sql_parsing/tool_meta_extractor.py +1 -3
- datahub/upgrade/upgrade.py +14 -2
- datahub/ingestion/source/redshift/lineage_v2.py +0 -466
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/WHEEL +0 -0
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.2.0.6.dist-info → acryl_datahub-1.2.0.7.dist-info}/top_level.txt +0 -0
datahub/ingestion/source/redshift/lineage.py
@@ -1,21 +1,20 @@
 import logging
-import traceback
 from collections import defaultdict
 from dataclasses import dataclass, field
 from datetime import datetime
 from enum import Enum
-from typing import Dict, Iterable, List, Optional, Set, Tuple, Union
+from typing import Callable, Dict, Iterable, List, Optional, Set, Tuple, Union
 from urllib.parse import urlparse
 
-import humanfriendly
 import redshift_connector
 import sqlglot
 
-import datahub.emitter.mce_builder as builder
 import datahub.sql_parsing.sqlglot_lineage as sqlglot_l
 from datahub.emitter import mce_builder
 from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance
+from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.api.common import PipelineContext
+from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws.s3_util import strip_s3_prefix
 from datahub.ingestion.source.redshift.config import LineageMode, RedshiftConfig
 from datahub.ingestion.source.redshift.query import (
@@ -35,30 +34,20 @@ from datahub.ingestion.source.redshift.report import RedshiftReport
 from datahub.ingestion.source.state.redundant_run_skip_handler import (
     RedundantLineageRunSkipHandler,
 )
-from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
-    FineGrainedLineage,
-    FineGrainedLineageDownstreamType,
-    FineGrainedLineageUpstreamType,
-    UpstreamLineage,
-)
-from datahub.metadata.com.linkedin.pegasus2avro.schema import (
-    OtherSchema,
-    SchemaField,
-    SchemaMetadata,
-)
 from datahub.metadata.schema_classes import (
     DatasetLineageTypeClass,
-    UpstreamClass,
-    UpstreamLineageClass,
 )
 from datahub.metadata.urns import DatasetUrn
-from datahub.sql_parsing.
-
+from datahub.sql_parsing.sql_parsing_aggregator import (
+    KnownQueryLineageInfo,
+    ObservedQuery,
+    SqlParsingAggregator,
+    TableRename,
+)
 from datahub.sql_parsing.sqlglot_utils import get_dialect, parse_statement
-from datahub.utilities import
-from datahub.utilities.dedup_list import deduplicate_list
+from datahub.utilities.perf_timer import PerfTimer
 
-logger
+logger = logging.getLogger(__name__)
 
 
 class LineageDatasetPlatform(Enum):
@@ -100,30 +89,6 @@ class LineageItem:
         else:
             self.dataset_lineage_type = DatasetLineageTypeClass.TRANSFORMED
 
-    def merge_lineage(
-        self,
-        upstreams: Set[LineageDataset],
-        cll: Optional[List[sqlglot_l.ColumnLineageInfo]],
-    ) -> None:
-        self.upstreams = self.upstreams.union(upstreams)
-
-        # Merge CLL using the output column name as the merge key.
-        self.cll = self.cll or []
-        existing_cll: Dict[str, sqlglot_l.ColumnLineageInfo] = {
-            c.downstream.column: c for c in self.cll
-        }
-        for c in cll or []:
-            if c.downstream.column in existing_cll:
-                # Merge using upstream + column name as the merge key.
-                existing_cll[c.downstream.column].upstreams = deduplicate_list(
-                    [*existing_cll[c.downstream.column].upstreams, *c.upstreams]
-                )
-            else:
-                # New output column, just add it as is.
-                self.cll.append(c)
-
-        self.cll = self.cll or None
-
 
 def parse_alter_table_rename(default_schema: str, query: str) -> Tuple[str, str, str]:
     """
@@ -142,117 +107,48 @@ def parse_alter_table_rename(default_schema: str, query: str) -> Tuple[str, str,
     return schema, prev_name, new_name
 
 
-
-
-
-    # -3 because platform instance is optional and that can cause the split to have more than 3 elements
-    db, schema, table = qualified_table_name.split(".")[-3:]
+class RedshiftSqlLineage(Closeable):
+    # does lineage and usage based on SQL parsing.
 
-    return db, schema, table
-
-
-class RedshiftLineageExtractor:
     def __init__(
         self,
         config: RedshiftConfig,
         report: RedshiftReport,
         context: PipelineContext,
+        database: str,
        redundant_run_skip_handler: Optional[RedundantLineageRunSkipHandler] = None,
     ):
+        self.platform = "redshift"
         self.config = config
         self.report = report
         self.context = context
-        self.
+        self.database = database
+        self.known_urns: Set[str] = set()  # will be set later
+        self.redundant_run_skip_handler = redundant_run_skip_handler
+
+        self.aggregator = SqlParsingAggregator(
+            platform=self.platform,
+            platform_instance=self.config.platform_instance,
+            env=self.config.env,
+            generate_lineage=True,
+            generate_queries=self.config.lineage_generate_queries,
+            generate_usage_statistics=False,
+            generate_operations=False,
+            usage_config=self.config,
+            graph=self.context.graph,
+            is_temp_table=self._is_temp_table,
+        )
+        self.report.sql_aggregator = self.aggregator.report
 
         self.queries: RedshiftCommonQuery = RedshiftProvisionedQuery()
         if self.config.is_serverless:
             self.queries = RedshiftServerlessQuery()
 
-        self.redundant_run_skip_handler = redundant_run_skip_handler
         self.start_time, self.end_time = (
             self.report.lineage_start_time,
             self.report.lineage_end_time,
         ) = self.get_time_window()
 
-        self.temp_tables: Dict[str, TempTableRow] = {}
-
-    def _init_temp_table_schema(
-        self, database: str, temp_tables: List[TempTableRow]
-    ) -> None:
-        if self.context.graph is None:  # to silent lint
-            return
-
-        schema_resolver: SchemaResolver = self.context.graph._make_schema_resolver(
-            platform=LineageDatasetPlatform.REDSHIFT.value,
-            platform_instance=self.config.platform_instance,
-            env=self.config.env,
-        )
-
-        dataset_vs_columns: Dict[str, List[SchemaField]] = {}
-        # prepare dataset_urn vs List of schema fields
-        for table in temp_tables:
-            logger.debug(
-                f"Processing temp table: {table.create_command} with query text {table.query_text}"
-            )
-            result = sqlglot_l.create_lineage_sql_parsed_result(
-                platform=LineageDatasetPlatform.REDSHIFT.value,
-                platform_instance=self.config.platform_instance,
-                env=self.config.env,
-                default_db=database,
-                default_schema=self.config.default_schema,
-                query=table.query_text,
-                graph=self.context.graph,
-            )
-
-            if (
-                result is None
-                or result.column_lineage is None
-                or not result.query_type.is_create()
-                or not result.out_tables
-            ):
-                logger.debug(f"Unsupported temp table query found: {table.query_text}")
-                continue
-
-            table.parsed_result = result
-            if result.column_lineage[0].downstream.table:
-                table.urn = result.column_lineage[0].downstream.table
-
-            self.temp_tables[result.out_tables[0]] = table
-
-        for table in self.temp_tables.values():
-            if (
-                table.parsed_result is None
-                or table.urn is None
-                or table.parsed_result.column_lineage is None
-            ):
-                continue
-
-            # Initialise the temp table urn, we later need this to merge CLL
-            downstream_urn = table.urn
-            if downstream_urn not in dataset_vs_columns:
-                dataset_vs_columns[downstream_urn] = []
-            dataset_vs_columns[downstream_urn].extend(
-                sqlglot_l.infer_output_schema(table.parsed_result) or []
-            )
-
-        # Add datasets, and it's respective fields in schema_resolver, so that later schema_resolver would be able
-        # correctly generates the upstreams for temporary tables
-        for urn in dataset_vs_columns:
-            db, schema, table_name = split_qualified_table_name(urn)
-            schema_resolver.add_schema_metadata(
-                urn=urn,
-                schema_metadata=SchemaMetadata(
-                    schemaName=table_name,
-                    platform=builder.make_data_platform_urn(
-                        LineageDatasetPlatform.REDSHIFT.value
-                    ),
-                    version=0,
-                    hash="",
-                    platformSchema=OtherSchema(rawSchema=""),
-                    fields=dataset_vs_columns[urn],
-                ),
-            )
-
     def get_time_window(self) -> Tuple[datetime, datetime]:
         if self.redundant_run_skip_handler:
             self.report.stateful_lineage_ingestion_enabled = True
@@ -262,9 +158,20 @@ class RedshiftLineageExtractor:
         else:
             return self.config.start_time, self.config.end_time
 
-    def
-
-
+    def report_status(self, step: str, status: bool) -> None:
+        if self.redundant_run_skip_handler:
+            self.redundant_run_skip_handler.report_current_run_status(step, status)
+
+    def _is_temp_table(self, name: str) -> bool:
+        return (
+            DatasetUrn.create_from_ids(
+                self.platform,
+                name,
+                env=self.config.env,
+                platform_instance=self.config.platform_instance,
+            ).urn()
+            not in self.known_urns
+        )
 
     def _get_s3_path(self, path: str) -> Optional[str]:
         if self.config.s3_lineage_config:
@@ -289,6 +196,15 @@ class RedshiftLineageExtractor:
 
         return path
 
+    def _build_s3_path_from_row(self, filename: str) -> Optional[str]:
+        path = filename.strip()
+        if urlparse(path).scheme != "s3":
+            raise ValueError(
+                f"Only s3 source supported with copy/unload. The source was: {path}"
+            )
+        s3_path = self._get_s3_path(path)
+        return strip_s3_prefix(s3_path) if s3_path else None
+
     def _get_sources_from_query(
         self,
         db_name: str,
@@ -335,15 +251,6 @@ class RedshiftLineageExtractor:
             ),
         )
 
-    def _build_s3_path_from_row(self, filename: str) -> Optional[str]:
-        path = filename.strip()
-        if urlparse(path).scheme != "s3":
-            raise ValueError(
-                f"Only s3 source supported with copy/unload. The source was: {path}"
-            )
-        s3_path = self._get_s3_path(path)
-        return strip_s3_prefix(s3_path) if s3_path else None
-
     def _get_sources(
         self,
         lineage_type: LineageCollectorType,
@@ -418,112 +325,6 @@ class RedshiftLineageExtractor:
 
         return sources, cll
 
-    def _populate_lineage_map(
-        self,
-        query: str,
-        database: str,
-        lineage_type: LineageCollectorType,
-        connection: redshift_connector.Connection,
-        all_tables_set: Dict[str, Dict[str, Set[str]]],
-    ) -> None:
-        """
-        This method generate table level lineage based with the given query.
-        The query should return the following columns: target_schema, target_table, source_table, source_schema
-        source_table and source_schema can be omitted if the sql_field is set because then it assumes the source_table
-        and source_schema will be extracted from the sql_field by sql parsing.
-
-        :param query: The query to run to extract lineage.
-        :type query: str
-        :param lineage_type: The way the lineage should be processed
-        :type lineage_type: LineageType
-        return: The method does not return with anything as it directly modify the self._lineage_map property.
-        :rtype: None
-        """
-
-        logger.info(f"Extracting {lineage_type.name} lineage for db {database}")
-        try:
-            logger.debug(f"Processing lineage query: {query}")
-            cll: Optional[List[sqlglot_l.ColumnLineageInfo]] = None
-            raw_db_name = database
-            alias_db_name = self.config.database
-
-            for lineage_row in RedshiftDataDictionary.get_lineage_rows(
-                conn=connection, query=query
-            ):
-                target = self._get_target_lineage(
-                    alias_db_name,
-                    lineage_row,
-                    lineage_type,
-                    all_tables_set=all_tables_set,
-                )
-                if not target:
-                    continue
-
-                logger.debug(
-                    f"Processing {lineage_type.name} lineage row: {lineage_row}"
-                )
-
-                sources, cll = self._get_sources(
-                    lineage_type,
-                    alias_db_name,
-                    source_schema=lineage_row.source_schema,
-                    source_table=lineage_row.source_table,
-                    ddl=lineage_row.ddl,
-                    filename=lineage_row.filename,
-                )
-
-                target.upstreams.update(
-                    self._get_upstream_lineages(
-                        sources=sources,
-                        target_table=target.dataset.urn,
-                        target_dataset_cll=cll,
-                        all_tables_set=all_tables_set,
-                        alias_db_name=alias_db_name,
-                        raw_db_name=raw_db_name,
-                        connection=connection,
-                    )
-                )
-                target.cll = cll
-
-                # Merging upstreams if dataset already exists and has upstreams
-                if target.dataset.urn in self._lineage_map:
-                    self._lineage_map[target.dataset.urn].merge_lineage(
-                        upstreams=target.upstreams, cll=target.cll
-                    )
-                else:
-                    self._lineage_map[target.dataset.urn] = target
-
-                logger.debug(
-                    f"Lineage[{target}]:{self._lineage_map[target.dataset.urn]}"
-                )
-        except Exception as e:
-            self.warn(
-                logger,
-                f"extract-{lineage_type.name}",
-                f"Error was {e}, {traceback.format_exc()}",
-            )
-            self.report_status(f"extract-{lineage_type.name}", False)
-
-    def _update_lineage_map_for_table_renames(
-        self, table_renames: Dict[str, TableRename]
-    ) -> None:
-        if not table_renames:
-            return
-
-        logger.info(f"Updating lineage map for {len(table_renames)} table renames")
-        for entry in table_renames.values():
-            # This table was renamed from some other name, copy in the lineage
-            # for the previous name as well.
-            prev_table_lineage = self._lineage_map.get(entry.original_urn)
-            if prev_table_lineage:
-                logger.debug(
-                    f"including lineage for {entry.original_urn} in {entry.new_urn} due to table rename"
-                )
-                self._lineage_map[entry.new_urn].merge_lineage(
-                    upstreams=prev_table_lineage.upstreams,
-                    cll=prev_table_lineage.cll,
-                )
-
     def _get_target_lineage(
         self,
         alias_db_name: str,
@@ -569,7 +370,7 @@ class RedshiftLineageExtractor:
                 ),
             )
         except ValueError as e:
-            self.
+            self.report.warning("non-s3-lineage", str(e))
             return None
         else:
             target_platform = LineageDatasetPlatform.REDSHIFT
@@ -588,269 +389,6 @@ class RedshiftLineageExtractor:
             cll=None,
         )
 
-    def _get_upstream_lineages(
-        self,
-        sources: List[LineageDataset],
-        target_table: str,
-        all_tables_set: Dict[str, Dict[str, Set[str]]],
-        alias_db_name: str,
-        raw_db_name: str,
-        connection: redshift_connector.Connection,
-        target_dataset_cll: Optional[List[sqlglot_l.ColumnLineageInfo]],
-    ) -> List[LineageDataset]:
-        target_source = []
-        probable_temp_tables: List[str] = []
-
-        for source in sources:
-            if source.platform == LineageDatasetPlatform.REDSHIFT:
-                db, schema, table = split_qualified_table_name(source.urn)
-                if db == raw_db_name:
-                    db = alias_db_name
-                path = f"{db}.{schema}.{table}"
-                source = LineageDataset(
-                    platform=source.platform,
-                    urn=make_dataset_urn_with_platform_instance(
-                        platform=LineageDatasetPlatform.REDSHIFT.value,
-                        platform_instance=self.config.platform_instance,
-                        name=path,
-                        env=self.config.env,
-                    ),
-                )
-
-                # Filtering out tables which does not exist in Redshift
-                # It was deleted in the meantime or query parser did not capture well the table name
-                # Or it might be a temp table
-                if (
-                    db not in all_tables_set
-                    or schema not in all_tables_set[db]
-                    or table not in all_tables_set[db][schema]
-                ):
-                    logger.debug(
-                        f"{source.urn} missing table. Adding it to temp table list for target table {target_table}.",
-                    )
-                    probable_temp_tables.append(f"{schema}.{table}")
-                    self.report.num_lineage_tables_dropped += 1
-                    continue
-
-            target_source.append(source)
-
-        if probable_temp_tables and self.config.resolve_temp_table_in_lineage:
-            self.report.num_lineage_processed_temp_tables += len(probable_temp_tables)
-            # Generate lineage dataset from temporary tables
-            number_of_permanent_dataset_found: int = (
-                self.update_table_and_column_lineage(
-                    db_name=raw_db_name,
-                    connection=connection,
-                    temp_table_names=probable_temp_tables,
-                    target_source_dataset=target_source,
-                    target_dataset_cll=target_dataset_cll,
-                )
-            )
-
-            logger.debug(
-                f"Number of permanent datasets found for {target_table} = {number_of_permanent_dataset_found} in "
-                f"temp tables {probable_temp_tables}"
-            )
-
-        return target_source
-
-    def populate_lineage(
-        self,
-        database: str,
-        connection: redshift_connector.Connection,
-        all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]],
-    ) -> None:
-        if self.config.resolve_temp_table_in_lineage:
-            self._init_temp_table_schema(
-                database=database,
-                temp_tables=list(self.get_temp_tables(connection=connection)),
-            )
-
-        populate_calls: List[Tuple[str, LineageCollectorType]] = []
-
-        all_tables_set: Dict[str, Dict[str, Set[str]]] = {
-            db: {schema: {t.name for t in tables} for schema, tables in schemas.items()}
-            for db, schemas in all_tables.items()
-        }
-
-        table_renames: Dict[str, TableRename] = {}
-        if self.config.include_table_rename_lineage:
-            table_renames, all_tables_set = self._process_table_renames(
-                database=database,
-                connection=connection,
-                all_tables=all_tables_set,
-            )
-
-        if self.config.table_lineage_mode in {
-            LineageMode.STL_SCAN_BASED,
-            LineageMode.MIXED,
-        }:
-            # Populate table level lineage by getting upstream tables from stl_scan redshift table
-            query = self.queries.stl_scan_based_lineage_query(
-                self.config.database,
-                self.start_time,
-                self.end_time,
-            )
-            populate_calls.append((query, LineageCollectorType.QUERY_SCAN))
-        if self.config.table_lineage_mode in {
-            LineageMode.SQL_BASED,
-            LineageMode.MIXED,
-        }:
-            # Populate table level lineage by parsing table creating sqls
-            query = self.queries.list_insert_create_queries_sql(
-                db_name=database,
-                start_time=self.start_time,
-                end_time=self.end_time,
-            )
-            populate_calls.append((query, LineageCollectorType.QUERY_SQL_PARSER))
-
-        if self.config.include_views and self.config.include_view_lineage:
-            # Populate table level lineage for views
-            query = self.queries.view_lineage_query()
-            populate_calls.append((query, LineageCollectorType.VIEW))
-
-            # Populate table level lineage for late binding views
-            query = self.queries.list_late_view_ddls_query()
-            populate_calls.append((query, LineageCollectorType.VIEW_DDL_SQL_PARSING))
-
-        if self.config.include_copy_lineage:
-            query = self.queries.list_copy_commands_sql(
-                db_name=database,
-                start_time=self.start_time,
-                end_time=self.end_time,
-            )
-            populate_calls.append((query, LineageCollectorType.COPY))
-
-        if self.config.include_unload_lineage:
-            query = self.queries.list_unload_commands_sql(
-                db_name=database,
-                start_time=self.start_time,
-                end_time=self.end_time,
-            )
-
-            populate_calls.append((query, LineageCollectorType.UNLOAD))
-
-        for query, lineage_type in populate_calls:
-            self._populate_lineage_map(
-                query=query,
-                database=database,
-                lineage_type=lineage_type,
-                connection=connection,
-                all_tables_set=all_tables_set,
-            )
-
-        # Handling for alter table statements.
-        self._update_lineage_map_for_table_renames(table_renames=table_renames)
-
-        self.report.lineage_mem_size[self.config.database] = humanfriendly.format_size(
-            memory_footprint.total_size(self._lineage_map)
-        )
-
-    def make_fine_grained_lineage_class(
-        self, lineage_item: LineageItem, dataset_urn: str
-    ) -> List[FineGrainedLineage]:
-        fine_grained_lineages: List[FineGrainedLineage] = []
-
-        if (
-            self.config.extract_column_level_lineage is False
-            or lineage_item.cll is None
-        ):
-            logger.debug("CLL extraction is disabled")
-            return fine_grained_lineages
-
-        logger.debug("Extracting column level lineage")
-
-        cll: List[sqlglot_l.ColumnLineageInfo] = lineage_item.cll
-
-        for cll_info in cll:
-            downstream = (
-                [builder.make_schema_field_urn(dataset_urn, cll_info.downstream.column)]
-                if cll_info.downstream is not None
-                and cll_info.downstream.column is not None
-                else []
-            )
-
-            upstreams = [
-                builder.make_schema_field_urn(column_ref.table, column_ref.column)
-                for column_ref in cll_info.upstreams
-            ]
-
-            fine_grained_lineages.append(
-                FineGrainedLineage(
-                    downstreamType=FineGrainedLineageDownstreamType.FIELD,
-                    downstreams=downstream,
-                    upstreamType=FineGrainedLineageUpstreamType.FIELD_SET,
-                    upstreams=upstreams,
-                )
-            )
-
-        logger.debug(f"Created fine_grained_lineage for {dataset_urn}")
-
-        return fine_grained_lineages
-
-    def get_lineage(
-        self,
-        table: Union[RedshiftTable, RedshiftView],
-        dataset_urn: str,
-        schema: RedshiftSchema,
-    ) -> Optional[UpstreamLineageClass]:
-        upstream_lineage: List[UpstreamClass] = []
-
-        cll_lineage: List[FineGrainedLineage] = []
-
-        if dataset_urn in self._lineage_map:
-            item = self._lineage_map[dataset_urn]
-            for upstream in item.upstreams:
-                upstream_table = UpstreamClass(
-                    dataset=upstream.urn,
-                    type=item.dataset_lineage_type,
-                )
-                upstream_lineage.append(upstream_table)
-
-            cll_lineage = self.make_fine_grained_lineage_class(
-                lineage_item=item,
-                dataset_urn=dataset_urn,
-            )
-
-        tablename = table.name
-        if (
-            table.is_external_table()
-            and schema.is_external_schema()
-            and schema.external_platform
-        ):
-            # external_db_params = schema.option
-            upstream_platform = schema.external_platform.lower()
-            catalog_upstream = UpstreamClass(
-                mce_builder.make_dataset_urn_with_platform_instance(
-                    upstream_platform,
-                    f"{schema.external_database}.{tablename}",
-                    platform_instance=(
-                        self.config.platform_instance_map.get(upstream_platform)
-                        if self.config.platform_instance_map
-                        else None
-                    ),
-                    env=self.config.env,
-                ),
-                DatasetLineageTypeClass.COPY,
-            )
-            upstream_lineage.append(catalog_upstream)
-
-        if upstream_lineage:
-            self.report.upstream_lineage[dataset_urn] = [
-                u.dataset for u in upstream_lineage
-            ]
-        else:
-            return None
-
-        return UpstreamLineage(
-            upstreams=upstream_lineage,
-            fineGrainedLineages=cll_lineage or None,
-        )
-
-    def report_status(self, step: str, status: bool) -> None:
-        if self.redundant_run_skip_handler:
-            self.redundant_run_skip_handler.report_current_run_status(step, status)
-
     def _process_table_renames(
         self,
         database: str,
@@ -924,204 +462,365 @@ class RedshiftLineageExtractor:
         ):
             yield row
 
-    def
-        self,
-
-
+    def build(
+        self,
+        connection: redshift_connector.Connection,
+        all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]],
+        db_schemas: Dict[str, Dict[str, RedshiftSchema]],
+    ) -> None:
+        # Assume things not in `all_tables` as temp tables.
+        self.known_urns = {
+            DatasetUrn.create_from_ids(
+                self.platform,
+                f"{db}.{schema}.{table.name}",
+                env=self.config.env,
+                platform_instance=self.config.platform_instance,
+            ).urn()
+            for db, schemas in all_tables.items()
+            for schema, tables in schemas.items()
+            for table in tables
+        }
+
+        # Handle all the temp tables up front.
+        if self.config.resolve_temp_table_in_lineage:
+            for temp_row in self.get_temp_tables(connection=connection):
+                self.aggregator.add_observed_query(
+                    ObservedQuery(
+                        query=temp_row.query_text,
+                        default_db=self.database,
+                        default_schema=self.config.default_schema,
+                        session_id=temp_row.session_id,
+                        timestamp=temp_row.start_time,
+                    ),
+                    # The "temp table" query actually returns all CREATE TABLE statements, even if they
+                    # aren't explicitly a temp table. As such, setting is_known_temp_table=True
+                    # would not be correct. We already have mechanisms to autodetect temp tables,
+                    # so we won't lose anything by not setting it.
+                    is_known_temp_table=False,
+                )
+
+        populate_calls: List[Tuple[LineageCollectorType, str, Callable]] = []
 
-
-
-
-        self.
+        if self.config.include_table_rename_lineage:
+            # Process all the ALTER TABLE RENAME statements
+            table_renames, _ = self._process_table_renames(
+                database=self.database,
+                connection=connection,
+                all_tables=defaultdict(lambda: defaultdict(set)),
             )
+            for entry in table_renames.values():
+                self.aggregator.add_table_rename(entry)
 
-
-
-
-
-
+        if self.config.table_lineage_mode in {
+            LineageMode.SQL_BASED,
+            LineageMode.MIXED,
+        }:
+            # Populate lineage by parsing table creating sqls
+            query = self.queries.list_insert_create_queries_sql(
+                db_name=self.database,
+                start_time=self.start_time,
+                end_time=self.end_time,
+            )
+            populate_calls.append(
+                (
+                    LineageCollectorType.QUERY_SQL_PARSER,
+                    query,
+                    self._process_sql_parser_lineage,
+                )
+            )
+        if self.config.table_lineage_mode in {
+            LineageMode.STL_SCAN_BASED,
+            LineageMode.MIXED,
+        }:
+            # Populate lineage by getting upstream tables from stl_scan redshift table
+            query = self.queries.stl_scan_based_lineage_query(
+                self.database,
+                self.start_time,
+                self.end_time,
+            )
+            populate_calls.append(
+                (LineageCollectorType.QUERY_SCAN, query, self._process_stl_scan_lineage)
+            )
 
-
+        if self.config.include_views and self.config.include_view_lineage:
+            # Populate lineage for views
+            query = self.queries.view_lineage_query()
+            populate_calls.append(
+                (LineageCollectorType.VIEW, query, self._process_view_lineage)
+            )
 
-
-
-
-
-
-
-
-
-
+            # Populate lineage for late binding views
+            query = self.queries.list_late_view_ddls_query()
+            populate_calls.append(
+                (
+                    LineageCollectorType.VIEW_DDL_SQL_PARSING,
+                    query,
+                    self._process_view_lineage,
+                )
+            )
+
+        if self.config.include_copy_lineage:
+            # Populate lineage for copy commands.
+            query = self.queries.list_copy_commands_sql(
+                db_name=self.database,
+                start_time=self.start_time,
+                end_time=self.end_time,
+            )
+            populate_calls.append(
+                (LineageCollectorType.COPY, query, self._process_copy_command)
+            )
 
-
-
-
+        if self.config.include_unload_lineage:
+            # Populate lineage for unload commands.
+            query = self.queries.list_unload_commands_sql(
+                db_name=self.database,
+                start_time=self.start_time,
+                end_time=self.end_time,
+            )
+            populate_calls.append(
+                (LineageCollectorType.UNLOAD, query, self._process_unload_command)
+            )
 
-
-
-
+        for lineage_type, query, processor in populate_calls:
+            self._populate_lineage_agg(
+                query=query,
+                lineage_type=lineage_type,
+                processor=processor,
+                connection=connection,
             )
-            self.report.num_unresolved_temp_columns += 1
-            return column_refs
-
-        for ref in column_refs:
-            resolved = False
-            if ref.table in self.temp_tables:
-                table = self.temp_tables[ref.table]
-                if table.parsed_result and table.parsed_result.column_lineage:
-                    for column_lineage in table.parsed_result.column_lineage:
-                        if (
-                            column_lineage.downstream.table == ref.table
-                            and column_lineage.downstream.column == ref.column
-                        ):
-                            resolved_column_refs.extend(
-                                self.resolve_column_refs(
-                                    column_lineage.upstreams, depth=depth + 1
-                                )
-                            )
-                            resolved = True
-                            break
-                # If we reach here, it means that we were not able to resolve the column reference.
-                if resolved is False:
-                    logger.warning(
-                        f"Unable to resolve column reference {ref} to a permanent table"
-                    )
-            else:
-                logger.debug(
-                    f"Resolved column reference {ref} is not resolved because referenced table {ref.table} is not a temp table or not found. Adding reference as non-temp table. This is normal."
-                )
-                resolved_column_refs.append(ref)
-        return resolved_column_refs
 
-
-        self
-
-
-
-    ) -> None:
-        for target_column_lineage in target_dataset_cll:
-            upstreams: List[sqlglot_l.ColumnRef] = []
-            # Look for temp_table_urn in upstream of column_lineage, if found then we need to replace it with
-            # column of permanent table
-            for target_column_ref in target_column_lineage.upstreams:
-                if target_column_ref.table == temp_table_urn:
-                    # Look for column_ref.table and column_ref.column in downstream of source_dataset_cll.
-                    # The source_dataset_cll contains CLL generated from create statement of temp table (temp_table_urn)
-                    for source_column_lineage in source_dataset_cll:
-                        if (
-                            source_column_lineage.downstream.table
-                            == target_column_ref.table
-                            and source_column_lineage.downstream.column
-                            == target_column_ref.column
-                        ):
-                            resolved_columns = self.resolve_column_refs(
-                                source_column_lineage.upstreams
-                            )
-                            # Add all upstream of above temporary column into upstream of target column
-                            upstreams.extend(resolved_columns)
-                    continue
-
-                upstreams.append(target_column_ref)
-
-            if upstreams:
-                # update the upstreams
-                target_column_lineage.upstreams = upstreams
-
-    def _add_permanent_datasets_recursively(
+        # Populate lineage for external tables.
+        if not self.config.skip_external_tables:
+            self._process_external_tables(all_tables=all_tables, db_schemas=db_schemas)
+
+    def _populate_lineage_agg(
         self,
-
-
-
+        query: str,
+        lineage_type: LineageCollectorType,
+        processor: Callable[[LineageRow], None],
         connection: redshift_connector.Connection,
-        permanent_lineage_datasets: List[LineageDataset],
-        target_dataset_cll: Optional[List[sqlglot_l.ColumnLineageInfo]],
     ) -> None:
-
+        logger.info(f"Extracting {lineage_type.name} lineage for db {self.database}")
+        try:
+            logger.debug(f"Processing {lineage_type.name} lineage query: {query}")
 
-
-
-
+            timer = self.report.lineage_phases_timer.setdefault(
+                lineage_type.name, PerfTimer()
+            )
+            with timer:
+                for lineage_row in RedshiftDataDictionary.get_lineage_rows(
+                    conn=connection, query=query
+                ):
+                    processor(lineage_row)
+        except Exception as e:
+            self.report.warning(
+                title="Failed to extract some lineage",
+                message=f"Failed to extract lineage of type {lineage_type.name}",
+                context=f"Query: '{query}'",
+                exc=e,
            )
+            self.report_status(f"extract-{lineage_type.name}", False)
+
+    def _process_sql_parser_lineage(self, lineage_row: LineageRow) -> None:
+        ddl = lineage_row.ddl
+        if ddl is None:
+            return
+
+        # TODO actor
 
-
-
-            query=
-
+        self.aggregator.add_observed_query(
+            ObservedQuery(
+                query=ddl,
+                default_db=self.database,
+                default_schema=self.config.default_schema,
+                timestamp=lineage_row.timestamp,
+                session_id=lineage_row.session_id,
            )
+        )
 
-
-
-
-
-
-
-
-
-
-        )
+    def _make_filtered_target(self, lineage_row: LineageRow) -> Optional[DatasetUrn]:
+        target = DatasetUrn.create_from_ids(
+            self.platform,
+            f"{self.database}.{lineage_row.target_schema}.{lineage_row.target_table}",
+            env=self.config.env,
+            platform_instance=self.config.platform_instance,
+        )
+        if target.urn() not in self.known_urns:
+            logger.debug(
+                f"Skipping lineage for {target.urn()} as it is not in known_urns"
+            )
+            return None
 
-
-        # if such dataset is present then add it to transitive_temp_tables to resolve it to original permanent table
-        for lineage_dataset in intermediate_l_datasets:
-            db, schema, table = split_qualified_table_name(lineage_dataset.urn)
+        return target
 
-
-
-
+    def _process_stl_scan_lineage(self, lineage_row: LineageRow) -> None:
+        target = self._make_filtered_target(lineage_row)
+        if not target:
+            return
 
-
-
-
-
-
+        source = DatasetUrn.create_from_ids(
+            self.platform,
+            f"{self.database}.{lineage_row.source_schema}.{lineage_row.source_table}",
+            env=self.config.env,
+            platform_instance=self.config.platform_instance,
+        )
 
-
-
+        if lineage_row.ddl is None:
+            logger.warning(
+                f"stl scan entry is missing query text for {lineage_row.source_schema}.{lineage_row.source_table}"
+            )
+            return
+        self.aggregator.add_known_query_lineage(
+            KnownQueryLineageInfo(
+                query_text=lineage_row.ddl,
+                downstream=target.urn(),
+                upstreams=[source.urn()],
+                timestamp=lineage_row.timestamp,
+            ),
+            merge_lineage=True,
+        )
+
+    def _process_view_lineage(self, lineage_row: LineageRow) -> None:
+        ddl = lineage_row.ddl
+        if ddl is None:
+            return
 
-
-
-
-            continue
+        target = self._make_filtered_target(lineage_row)
+        if not target:
+            return
 
-
+        self.aggregator.add_view_definition(
+            view_urn=target,
+            view_definition=ddl,
+            default_db=self.database,
+            default_schema=self.config.default_schema,
+        )
 
-
-
-
-
-
-
-
-
-
+    def _process_copy_command(self, lineage_row: LineageRow) -> None:
+        logger.debug(f"Processing COPY command for lineage row: {lineage_row}")
+        sources = self._get_sources(
+            lineage_type=LineageCollectorType.COPY,
+            db_name=self.database,
+            source_schema=None,
+            source_table=None,
+            ddl=None,
+            filename=lineage_row.filename,
+        )
+        logger.debug(f"Recognized sources: {sources}")
+        source = sources[0]
+        if not source:
+            logger.debug("Ignoring command since couldn't recognize proper source")
+            return
+        s3_urn = source[0].urn
+        logger.debug(f"Recognized s3 dataset urn: {s3_urn}")
+        if not lineage_row.target_schema or not lineage_row.target_table:
+            logger.debug(
+                f"Didn't find target schema (found: {lineage_row.target_schema}) or target table (found: {lineage_row.target_table})"
            )
+            return
+        target = self._make_filtered_target(lineage_row)
+        if not target:
+            return
 
-
-
-        db_name: str,
-        temp_table_names: List[str],
-        connection: redshift_connector.Connection,
-        target_source_dataset: List[LineageDataset],
-        target_dataset_cll: Optional[List[sqlglot_l.ColumnLineageInfo]],
-    ) -> int:
-        permanent_lineage_datasets: List[LineageDataset] = []
-
-        temp_table_rows: List[TempTableRow] = self.find_temp_tables(
-            temp_table_rows=list(self.temp_tables.values()),
-            temp_table_names=temp_table_names,
+        self.aggregator.add_known_lineage_mapping(
+            upstream_urn=s3_urn, downstream_urn=target.urn()
        )
 
-
+    def _process_unload_command(self, lineage_row: LineageRow) -> None:
+        lineage_entry = self._get_target_lineage(
+            alias_db_name=self.database,
+            lineage_row=lineage_row,
+            lineage_type=LineageCollectorType.UNLOAD,
+            all_tables_set={},
+        )
+        if not lineage_entry:
+            return
+        output_urn = lineage_entry.dataset.urn
 
-
-
-
-
-
-
-
+        if not lineage_row.source_schema or not lineage_row.source_table:
+            return
+        source = DatasetUrn.create_from_ids(
+            self.platform,
+            f"{self.database}.{lineage_row.source_schema}.{lineage_row.source_table}",
+            env=self.config.env,
+            platform_instance=self.config.platform_instance,
        )
+        if source.urn() not in self.known_urns:
+            logger.debug(
+                f"Skipping unload lineage for {source.urn()} as it is not in known_urns"
+            )
+            return
+
+        self.aggregator.add_known_lineage_mapping(
+            upstream_urn=source.urn(), downstream_urn=output_urn
+        )
+
+    def _process_external_tables(
+        self,
+        all_tables: Dict[str, Dict[str, List[Union[RedshiftView, RedshiftTable]]]],
+        db_schemas: Dict[str, Dict[str, RedshiftSchema]],
+    ) -> None:
+        for schema_name, tables in all_tables[self.database].items():
+            logger.info(f"External table lineage: checking schema {schema_name}")
+            if not db_schemas[self.database].get(schema_name):
+                logger.warning(f"Schema {schema_name} not found")
+                continue
+            for table in tables:
+                schema = db_schemas[self.database][schema_name]
+                if (
+                    table.is_external_table()
+                    and schema.is_external_schema()
+                    and schema.external_platform
+                ):
+                    logger.info(
+                        f"External table lineage: processing table {schema_name}.{table.name}"
+                    )
+                    # external_db_params = schema.option
+                    upstream_platform = schema.external_platform.lower()
+
+                    table_urn = mce_builder.make_dataset_urn_with_platform_instance(
+                        self.platform,
+                        f"{self.database}.{schema_name}.{table.name}",
+                        platform_instance=self.config.platform_instance,
+                        env=self.config.env,
+                    )
+                    if upstream_platform == self.platform:
+                        upstream_schema = schema.get_upstream_schema_name() or "public"
+                        upstream_dataset_name = (
+                            f"{schema.external_database}.{upstream_schema}.{table.name}"
+                        )
+                        upstream_platform_instance = self.config.platform_instance
+                    else:
+                        upstream_dataset_name = (
+                            f"{schema.external_database}.{table.name}"
+                        )
+                        upstream_platform_instance = (
+                            self.config.platform_instance_map.get(upstream_platform)
+                            if self.config.platform_instance_map
+                            else None
+                        )
 
-
+                    upstream_urn = mce_builder.make_dataset_urn_with_platform_instance(
+                        upstream_platform,
+                        upstream_dataset_name,
+                        platform_instance=upstream_platform_instance,
+                        env=self.config.env,
+                    )
+
+                    self.aggregator.add_known_lineage_mapping(
+                        upstream_urn=upstream_urn,
+                        downstream_urn=table_urn,
+                    )
+
+    def generate(self) -> Iterable[MetadataWorkUnit]:
+        for mcp in self.aggregator.gen_metadata():
+            yield mcp.as_workunit()
+        if len(self.aggregator.report.observed_query_parse_failures) > 0:
+            self.report.report_warning(
+                title="Failed to extract some SQL lineage",
+                message="Unexpected error(s) while attempting to extract lineage from SQL queries. See the full logs for more details.",
+                context=f"Query Parsing Failures: {self.aggregator.report.observed_query_parse_failures}",
+            )
 
-
+    def close(self) -> None:
+        self.aggregator.close()