acryl-datahub 0.15.0.6rc2__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.
Files changed (205)
  1. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/METADATA +2522 -2493
  2. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/RECORD +205 -192
  3. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/WHEEL +1 -1
  4. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/entry_points.txt +1 -0
  5. datahub/_version.py +1 -1
  6. datahub/api/entities/common/serialized_value.py +4 -3
  7. datahub/api/entities/dataset/dataset.py +731 -42
  8. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  9. datahub/cli/check_cli.py +72 -19
  10. datahub/cli/docker_cli.py +3 -3
  11. datahub/cli/iceberg_cli.py +31 -7
  12. datahub/cli/ingest_cli.py +30 -93
  13. datahub/cli/lite_cli.py +4 -2
  14. datahub/cli/specific/dataproduct_cli.py +1 -1
  15. datahub/cli/specific/dataset_cli.py +128 -14
  16. datahub/configuration/common.py +10 -2
  17. datahub/configuration/git.py +1 -3
  18. datahub/configuration/kafka.py +1 -1
  19. datahub/emitter/mce_builder.py +28 -13
  20. datahub/emitter/mcp_builder.py +4 -1
  21. datahub/emitter/response_helper.py +145 -0
  22. datahub/emitter/rest_emitter.py +323 -10
  23. datahub/ingestion/api/decorators.py +1 -1
  24. datahub/ingestion/api/source_helpers.py +4 -0
  25. datahub/ingestion/fs/s3_fs.py +2 -2
  26. datahub/ingestion/glossary/classification_mixin.py +1 -5
  27. datahub/ingestion/graph/client.py +41 -22
  28. datahub/ingestion/graph/entity_versioning.py +3 -3
  29. datahub/ingestion/graph/filters.py +64 -37
  30. datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +1 -6
  31. datahub/ingestion/run/pipeline.py +112 -148
  32. datahub/ingestion/run/sink_callback.py +77 -0
  33. datahub/ingestion/sink/datahub_rest.py +8 -0
  34. datahub/ingestion/source/abs/config.py +2 -4
  35. datahub/ingestion/source/bigquery_v2/bigquery_audit.py +1 -1
  36. datahub/ingestion/source/bigquery_v2/bigquery_config.py +2 -46
  37. datahub/ingestion/source/bigquery_v2/bigquery_schema.py +6 -1
  38. datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +7 -4
  39. datahub/ingestion/source/cassandra/cassandra.py +152 -233
  40. datahub/ingestion/source/cassandra/cassandra_api.py +13 -5
  41. datahub/ingestion/source/common/gcp_credentials_config.py +53 -0
  42. datahub/ingestion/source/common/subtypes.py +12 -0
  43. datahub/ingestion/source/csv_enricher.py +3 -3
  44. datahub/ingestion/source/data_lake_common/path_spec.py +1 -3
  45. datahub/ingestion/source/dbt/dbt_common.py +8 -5
  46. datahub/ingestion/source/dbt/dbt_core.py +11 -9
  47. datahub/ingestion/source/dbt/dbt_tests.py +4 -8
  48. datahub/ingestion/source/delta_lake/config.py +8 -1
  49. datahub/ingestion/source/delta_lake/report.py +4 -2
  50. datahub/ingestion/source/delta_lake/source.py +20 -5
  51. datahub/ingestion/source/dremio/dremio_api.py +4 -8
  52. datahub/ingestion/source/dremio/dremio_aspects.py +3 -5
  53. datahub/ingestion/source/dynamodb/dynamodb.py +6 -0
  54. datahub/ingestion/source/elastic_search.py +26 -6
  55. datahub/ingestion/source/feast.py +27 -8
  56. datahub/ingestion/source/file.py +6 -3
  57. datahub/ingestion/source/gc/dataprocess_cleanup.py +1 -1
  58. datahub/ingestion/source/gc/execution_request_cleanup.py +2 -1
  59. datahub/ingestion/source/ge_data_profiler.py +12 -15
  60. datahub/ingestion/source/iceberg/iceberg.py +46 -12
  61. datahub/ingestion/source/iceberg/iceberg_common.py +71 -21
  62. datahub/ingestion/source/identity/okta.py +37 -7
  63. datahub/ingestion/source/kafka/kafka.py +1 -1
  64. datahub/ingestion/source/kafka_connect/common.py +2 -7
  65. datahub/ingestion/source/kafka_connect/kafka_connect.py +97 -4
  66. datahub/ingestion/source/kafka_connect/sink_connectors.py +2 -2
  67. datahub/ingestion/source/kafka_connect/source_connectors.py +6 -9
  68. datahub/ingestion/source/looker/looker_common.py +6 -5
  69. datahub/ingestion/source/looker/looker_file_loader.py +2 -2
  70. datahub/ingestion/source/looker/looker_lib_wrapper.py +2 -1
  71. datahub/ingestion/source/looker/looker_source.py +1 -1
  72. datahub/ingestion/source/looker/looker_template_language.py +4 -2
  73. datahub/ingestion/source/looker/lookml_source.py +3 -2
  74. datahub/ingestion/source/metabase.py +57 -35
  75. datahub/ingestion/source/metadata/business_glossary.py +45 -3
  76. datahub/ingestion/source/metadata/lineage.py +2 -2
  77. datahub/ingestion/source/mlflow.py +365 -35
  78. datahub/ingestion/source/mode.py +18 -8
  79. datahub/ingestion/source/neo4j/neo4j_source.py +27 -7
  80. datahub/ingestion/source/nifi.py +37 -11
  81. datahub/ingestion/source/openapi.py +1 -1
  82. datahub/ingestion/source/openapi_parser.py +49 -17
  83. datahub/ingestion/source/powerbi/m_query/parser.py +3 -2
  84. datahub/ingestion/source/powerbi/m_query/tree_function.py +2 -1
  85. datahub/ingestion/source/powerbi/powerbi.py +1 -3
  86. datahub/ingestion/source/powerbi/rest_api_wrapper/data_resolver.py +2 -1
  87. datahub/ingestion/source/powerbi_report_server/report_server.py +26 -7
  88. datahub/ingestion/source/powerbi_report_server/report_server_domain.py +1 -1
  89. datahub/ingestion/source/preset.py +7 -4
  90. datahub/ingestion/source/pulsar.py +3 -2
  91. datahub/ingestion/source/qlik_sense/websocket_connection.py +4 -2
  92. datahub/ingestion/source/redash.py +31 -7
  93. datahub/ingestion/source/redshift/config.py +4 -0
  94. datahub/ingestion/source/redshift/datashares.py +236 -0
  95. datahub/ingestion/source/redshift/lineage.py +6 -2
  96. datahub/ingestion/source/redshift/lineage_v2.py +24 -9
  97. datahub/ingestion/source/redshift/profile.py +1 -1
  98. datahub/ingestion/source/redshift/query.py +133 -33
  99. datahub/ingestion/source/redshift/redshift.py +46 -73
  100. datahub/ingestion/source/redshift/redshift_schema.py +186 -6
  101. datahub/ingestion/source/redshift/report.py +3 -0
  102. datahub/ingestion/source/s3/config.py +5 -5
  103. datahub/ingestion/source/s3/source.py +20 -41
  104. datahub/ingestion/source/salesforce.py +550 -275
  105. datahub/ingestion/source/schema_inference/object.py +1 -1
  106. datahub/ingestion/source/sigma/sigma.py +1 -1
  107. datahub/ingestion/source/slack/slack.py +31 -10
  108. datahub/ingestion/source/snowflake/snowflake_connection.py +2 -2
  109. datahub/ingestion/source/snowflake/snowflake_queries.py +19 -13
  110. datahub/ingestion/source/snowflake/snowflake_query.py +6 -4
  111. datahub/ingestion/source/snowflake/snowflake_schema.py +3 -4
  112. datahub/ingestion/source/snowflake/snowflake_v2.py +1 -1
  113. datahub/ingestion/source/sql/athena.py +10 -16
  114. datahub/ingestion/source/sql/druid.py +1 -5
  115. datahub/ingestion/source/sql/hive.py +15 -6
  116. datahub/ingestion/source/sql/hive_metastore.py +3 -2
  117. datahub/ingestion/source/sql/mssql/job_models.py +29 -0
  118. datahub/ingestion/source/sql/mssql/source.py +11 -5
  119. datahub/ingestion/source/sql/oracle.py +127 -63
  120. datahub/ingestion/source/sql/sql_common.py +16 -18
  121. datahub/ingestion/source/sql/sql_types.py +2 -2
  122. datahub/ingestion/source/sql/teradata.py +19 -5
  123. datahub/ingestion/source/sql/trino.py +2 -2
  124. datahub/ingestion/source/state/stale_entity_removal_handler.py +4 -8
  125. datahub/ingestion/source/superset.py +222 -62
  126. datahub/ingestion/source/tableau/tableau.py +22 -6
  127. datahub/ingestion/source/tableau/tableau_common.py +3 -2
  128. datahub/ingestion/source/unity/ge_profiler.py +2 -1
  129. datahub/ingestion/source/unity/source.py +11 -1
  130. datahub/ingestion/source/vertexai.py +697 -0
  131. datahub/ingestion/source_config/pulsar.py +3 -1
  132. datahub/ingestion/transformer/pattern_cleanup_ownership.py +25 -7
  133. datahub/lite/duckdb_lite.py +3 -10
  134. datahub/lite/lite_local.py +1 -1
  135. datahub/lite/lite_util.py +4 -3
  136. datahub/metadata/_schema_classes.py +714 -417
  137. datahub/metadata/_urns/urn_defs.py +1673 -1649
  138. datahub/metadata/com/linkedin/pegasus2avro/incident/__init__.py +4 -0
  139. datahub/metadata/schema.avsc +16438 -16603
  140. datahub/metadata/schemas/AssertionInfo.avsc +3 -1
  141. datahub/metadata/schemas/BusinessAttributeInfo.avsc +6 -2
  142. datahub/metadata/schemas/BusinessAttributes.avsc +6 -0
  143. datahub/metadata/schemas/ChartInfo.avsc +1 -0
  144. datahub/metadata/schemas/CorpGroupKey.avsc +2 -1
  145. datahub/metadata/schemas/CorpUserInfo.avsc +13 -0
  146. datahub/metadata/schemas/CorpUserKey.avsc +2 -1
  147. datahub/metadata/schemas/DataHubIngestionSourceInfo.avsc +8 -3
  148. datahub/metadata/schemas/DataProcessInstanceInput.avsc +129 -1
  149. datahub/metadata/schemas/DataProcessInstanceOutput.avsc +131 -3
  150. datahub/metadata/schemas/DataProcessKey.avsc +2 -1
  151. datahub/metadata/schemas/DataProductKey.avsc +2 -1
  152. datahub/metadata/schemas/DomainKey.avsc +2 -1
  153. datahub/metadata/schemas/EditableSchemaMetadata.avsc +6 -2
  154. datahub/metadata/schemas/GlossaryNodeKey.avsc +3 -1
  155. datahub/metadata/schemas/GlossaryTermKey.avsc +2 -1
  156. datahub/metadata/schemas/GlossaryTerms.avsc +3 -1
  157. datahub/metadata/schemas/IncidentInfo.avsc +130 -46
  158. datahub/metadata/schemas/InputFields.avsc +3 -1
  159. datahub/metadata/schemas/MLFeatureKey.avsc +2 -1
  160. datahub/metadata/schemas/MLFeatureTableKey.avsc +2 -1
  161. datahub/metadata/schemas/MLModelDeploymentKey.avsc +2 -1
  162. datahub/metadata/schemas/MLModelGroupKey.avsc +3 -1
  163. datahub/metadata/schemas/MLModelKey.avsc +3 -1
  164. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +2 -1
  165. datahub/metadata/schemas/MetadataChangeEvent.avsc +20 -2
  166. datahub/metadata/schemas/PostKey.avsc +2 -1
  167. datahub/metadata/schemas/SchemaFieldKey.avsc +2 -1
  168. datahub/metadata/schemas/SchemaMetadata.avsc +3 -1
  169. datahub/metadata/schemas/StructuredPropertyDefinition.avsc +14 -0
  170. datahub/metadata/schemas/VersionProperties.avsc +18 -0
  171. datahub/metadata/schemas/VersionSetProperties.avsc +5 -0
  172. datahub/pydantic/__init__.py +0 -0
  173. datahub/pydantic/compat.py +58 -0
  174. datahub/sdk/__init__.py +30 -12
  175. datahub/sdk/_all_entities.py +1 -1
  176. datahub/sdk/_attribution.py +4 -0
  177. datahub/sdk/_shared.py +258 -16
  178. datahub/sdk/_utils.py +35 -0
  179. datahub/sdk/container.py +30 -6
  180. datahub/sdk/dataset.py +118 -20
  181. datahub/sdk/{_entity.py → entity.py} +24 -1
  182. datahub/sdk/entity_client.py +1 -1
  183. datahub/sdk/main_client.py +23 -0
  184. datahub/sdk/resolver_client.py +17 -29
  185. datahub/sdk/search_client.py +50 -0
  186. datahub/sdk/search_filters.py +374 -0
  187. datahub/specific/dataset.py +3 -4
  188. datahub/sql_parsing/_sqlglot_patch.py +2 -10
  189. datahub/sql_parsing/schema_resolver.py +1 -1
  190. datahub/sql_parsing/split_statements.py +220 -126
  191. datahub/sql_parsing/sql_parsing_common.py +7 -0
  192. datahub/sql_parsing/sqlglot_lineage.py +1 -1
  193. datahub/sql_parsing/sqlglot_utils.py +1 -4
  194. datahub/testing/check_sql_parser_result.py +5 -6
  195. datahub/testing/compare_metadata_json.py +7 -6
  196. datahub/testing/pytest_hooks.py +56 -0
  197. datahub/upgrade/upgrade.py +2 -2
  198. datahub/utilities/file_backed_collections.py +3 -14
  199. datahub/utilities/ingest_utils.py +106 -0
  200. datahub/utilities/mapping.py +1 -1
  201. datahub/utilities/memory_footprint.py +3 -2
  202. datahub/utilities/sentinels.py +22 -0
  203. datahub/utilities/unified_diff.py +5 -1
  204. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/LICENSE +0 -0
  205. {acryl_datahub-0.15.0.6rc2.dist-info → acryl_datahub-1.0.0.dist-info}/top_level.txt +0 -0
--- a/datahub/ingestion/source/redshift/redshift.py
+++ b/datahub/ingestion/source/redshift/redshift.py
@@ -33,7 +33,10 @@ from datahub.ingestion.api.source import (
     TestableSource,
     TestConnectionReport,
 )
-from datahub.ingestion.api.source_helpers import create_dataset_props_patch_builder
+from datahub.ingestion.api.source_helpers import (
+    auto_workunit,
+    create_dataset_props_patch_builder,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.glossary.classification_mixin import (
     ClassificationHandler,
@@ -45,6 +48,7 @@ from datahub.ingestion.source.common.subtypes import (
     DatasetSubTypes,
 )
 from datahub.ingestion.source.redshift.config import RedshiftConfig
+from datahub.ingestion.source.redshift.datashares import RedshiftDatasharesHelper
 from datahub.ingestion.source.redshift.exception import handle_redshift_exceptions_yield
 from datahub.ingestion.source.redshift.lineage import RedshiftLineageExtractor
 from datahub.ingestion.source.redshift.lineage_v2 import RedshiftSqlLineageV2
@@ -52,6 +56,7 @@ from datahub.ingestion.source.redshift.profile import RedshiftProfiler
 from datahub.ingestion.source.redshift.redshift_data_reader import RedshiftDataReader
 from datahub.ingestion.source.redshift.redshift_schema import (
     RedshiftColumn,
+    RedshiftDatabase,
     RedshiftDataDictionary,
     RedshiftSchema,
     RedshiftTable,
@@ -150,76 +155,6 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
     - Table, row, and column statistics via optional SQL profiling
     - Table lineage
     - Usage statistics
-
-    ### Prerequisites
-
-    This source needs to access system tables that require extra permissions.
-    To grant these permissions, alter your DataHub Redshift user as follows:
-    ```sql
-    ALTER USER datahub_user WITH SYSLOG ACCESS UNRESTRICTED;
-    GRANT SELECT ON pg_catalog.svv_table_info to datahub_user;
-    GRANT SELECT ON pg_catalog.svl_user_info to datahub_user;
-    ```
-
-    :::note
-
-    Giving a user unrestricted access to system tables gives the user visibility to data generated by other users. For example, STL_QUERY and STL_QUERYTEXT contain the full text of INSERT, UPDATE, and DELETE statements.
-
-    :::
-
-    ### Lineage
-
-    There are multiple lineage collector implementations, as Redshift does not support table lineage out of the box.
-
-    #### stl_scan_based
-    The stl_scan based collector uses Redshift's [stl_insert](https://docs.aws.amazon.com/redshift/latest/dg/r_STL_INSERT.html) and [stl_scan](https://docs.aws.amazon.com/redshift/latest/dg/r_STL_SCAN.html) system tables to
-    discover lineage between tables.
-    Pros:
-    - Fast
-    - Reliable
-
-    Cons:
-    - Does not work with Spectrum/external tables because those scans do not show up in the stl_scan table.
-    - If a table depends on a view, the view won't be listed as a dependency; instead, the table will be connected to the view's dependencies.
-
-    #### sql_based
-    The sql_based collector uses Redshift's [stl_insert](https://docs.aws.amazon.com/redshift/latest/dg/r_STL_INSERT.html) to discover all the insert queries
-    and uses SQL parsing to discover the dependencies.
-
-    Pros:
-    - Works with Spectrum tables
-    - Views are connected properly if a table depends on them
-
-    Cons:
-    - Slow
-    - Less reliable, as the query parser can fail on certain queries
-
-    #### mixed
-    Uses both collectors above, first applying the sql_based one and then the stl_scan_based one.
-
-    Pros:
-    - Works with Spectrum tables
-    - Views are connected properly if a table depends on them
-    - A bit more reliable than the sql_based one alone
-
-    Cons:
-    - Slow
-    - May be incorrect at times, as the query parser can fail on certain queries
-
-    :::note
-
-    The Redshift stl tables used for getting data lineage retain at most seven days of log history, and sometimes closer to 2-5 days. This means you cannot extract lineage from queries issued outside that window.
-
-    :::
-
-    ### Profiling
-    Profiling runs SQL queries on the Redshift cluster to get statistics about the tables. To be able to do that, the user needs to have read access to the tables that should be profiled.
-
-    If you don't want to grant read access to the tables, you can enable table-level profiling, which collects table statistics without reading the data.
-    ```yaml
-    profiling:
-        profile_table_level_only: true
-    ```
     """
 
     # TODO: Replace with standardized types in sql_types.py
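For reference, the options described by the removed docstring can be combined in a single ingestion recipe. A minimal sketch follows, expressed as a Python dict: `profile_table_level_only` comes straight from the docstring above, while `table_lineage_mode`, the connection values, and the sink are illustrative assumptions rather than anything taken from this release.

```python
# Hedged sketch of a Redshift ingestion recipe; fields marked as assumptions
# above may differ from the connector's actual config schema.
recipe = {
    "source": {
        "type": "redshift",
        "config": {
            "host_port": "my-cluster.example.us-east-1.redshift.amazonaws.com:5439",
            "database": "dev",
            "username": "datahub_user",
            "password": "${REDSHIFT_PASSWORD}",
            # Pick one of the lineage collectors described in the docstring.
            "table_lineage_mode": "mixed",
            # Table-level statistics only, without reading table data.
            "profiling": {"profile_table_level_only": True},
        },
    },
    "sink": {"type": "datahub-rest", "config": {"server": "http://localhost:8080"}},
}
```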
@@ -330,6 +265,9 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
         self.config: RedshiftConfig = config
         self.report: RedshiftReport = RedshiftReport()
         self.classification_handler = ClassificationHandler(self.config, self.report)
+        self.datashares_helper = RedshiftDatasharesHelper(
+            self.config, self.report, self.ctx.graph
+        )
         self.platform = "redshift"
         self.domain_registry = None
         if self.config.domain:
@@ -361,6 +299,7 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
             is_serverless=self.config.is_serverless
         )
 
+        self.db: Optional[RedshiftDatabase] = None
         self.db_tables: Dict[str, Dict[str, List[RedshiftTable]]] = {}
         self.db_views: Dict[str, Dict[str, List[RedshiftView]]] = {}
         self.db_schemas: Dict[str, Dict[str, RedshiftSchema]] = {}
@@ -424,6 +363,11 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
 
         database = self.config.database
         logger.info(f"Processing db {database}")
+
+        self.db = self.data_dictionary.get_database_details(connection, database)
+        self.report.is_shared_database = (
+            self.db is not None and self.db.is_shared_database()
+        )
         with self.report.new_stage(METADATA_EXTRACTION):
             self.db_tables[database] = defaultdict()
             self.db_views[database] = defaultdict()
@@ -563,7 +507,10 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
 
         schema_columns: Dict[str, Dict[str, List[RedshiftColumn]]] = {}
         schema_columns[schema.name] = self.data_dictionary.get_columns_for_schema(
-            conn=connection, schema=schema
+            conn=connection,
+            database=database,
+            schema=schema,
+            is_shared_database=self.report.is_shared_database,
         )
 
         if self.config.include_tables:
@@ -883,10 +830,14 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
             domain_config=self.config.domain,
         )
 
-    def cache_tables_and_views(self, connection, database):
+    def cache_tables_and_views(
+        self, connection: redshift_connector.Connection, database: str
+    ) -> None:
         tables, views = self.data_dictionary.get_tables_and_views(
             conn=connection,
+            database=database,
             skip_external_tables=self.config.skip_external_tables,
+            is_shared_database=self.report.is_shared_database,
         )
         for schema in tables:
             if not is_schema_allowed(
@@ -1029,6 +980,28 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource):
         database: str,
         lineage_extractor: RedshiftSqlLineageV2,
     ) -> Iterable[MetadataWorkUnit]:
+        if self.config.include_share_lineage:
+            outbound_shares = self.data_dictionary.get_outbound_datashares(connection)
+            yield from auto_workunit(
+                self.datashares_helper.to_platform_resource(list(outbound_shares))
+            )
+
+            if self.db and self.db.is_shared_database():
+                inbound_share = self.db.get_inbound_share()
+                if inbound_share is None:
+                    self.report.warning(
+                        title="Upstream lineage of inbound datashare will be missing",
+                        message="Database options do not contain sufficient information",
+                        context=f"Database: {database}, Options {self.db.options}",
+                    )
+                else:
+                    for known_lineage in self.datashares_helper.generate_lineage(
+                        inbound_share, self.get_all_tables()[database]
+                    ):
+                        lineage_extractor.aggregator.add(known_lineage)
+
+        # TODO: distinguish between definition level lineage and audit log based lineage.
+        # Definition level lineage should never be skipped
         if not self._should_ingest_lineage():
             return
 
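The new block above hands the datashare platform resources to `auto_workunit`, which wraps emitted metadata change proposals into `MetadataWorkUnit`s. Below is a minimal sketch of that helper in isolation, assuming a working DataHub Python environment; the URN and aspect are placeholders, not values this connector emits.

```python
# Hedged sketch: wrap a hand-built MCP into work units via auto_workunit.
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.api.source_helpers import auto_workunit
from datahub.metadata.schema_classes import StatusClass

mcps = [
    MetadataChangeProposalWrapper(
        entityUrn=(
            "urn:li:dataset:(urn:li:dataPlatform:redshift,"
            "consumer_db.public.orders,PROD)"
        ),
        aspect=StatusClass(removed=False),  # placeholder aspect
    )
]
for wu in auto_workunit(mcps):
    print(wu.id)  # each MCP becomes one MetadataWorkUnit
```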
--- a/datahub/ingestion/source/redshift/redshift_schema.py
+++ b/datahub/ingestion/source/redshift/redshift_schema.py
@@ -1,7 +1,8 @@
 import logging
+import re
 from dataclasses import dataclass, field
 from datetime import datetime, timezone
-from typing import Dict, Iterable, List, Optional, Tuple
+from typing import Dict, Iterable, List, Optional, Tuple, Union
 
 import redshift_connector
 
@@ -41,6 +42,9 @@ class RedshiftTable(BaseTable):
     serde_parameters: Optional[str] = None
     last_altered: Optional[datetime] = None
 
+    def is_external_table(self) -> bool:
+        return self.type == "EXTERNAL_TABLE"
+
 
 @dataclass
 class RedshiftView(BaseTable):
@@ -51,6 +55,9 @@ class RedshiftView(BaseTable):
     size_in_bytes: Optional[int] = None
     rows_count: Optional[int] = None
 
+    def is_external_table(self) -> bool:
+        return self.type == "EXTERNAL_TABLE"
+
 
 @dataclass
 class RedshiftSchema:
@@ -59,8 +66,119 @@ class RedshiftSchema:
     type: str
     owner: Optional[str] = None
     option: Optional[str] = None
+    external_platform: Optional[str] = None
     external_database: Optional[str] = None
 
+    def is_external_schema(self) -> bool:
+        return self.type == "external"
+
+    def get_upstream_schema_name(self) -> Optional[str]:
+        """Gets the schema name from the external schema option.
+
+        Returns:
+            Optional[str]: The schema name from the external schema option
+            if this is an external schema and has a valid option format, None otherwise.
+        """
+
+        if not self.is_external_schema() or not self.option:
+            return None
+
+        # For external schema on redshift, option is in form
+        # {"SCHEMA":"tickit"}
+        schema_match = re.search(r'"SCHEMA"\s*:\s*"([^"]*)"', self.option)
+        if not schema_match:
+            return None
+        else:
+            return schema_match.group(1)
+
+
+@dataclass
+class PartialInboundDatashare:
+    share_name: str
+    producer_namespace_prefix: str
+    consumer_database: str
+
+    def get_description(self) -> str:
+        return (
+            f"Namespace Prefix {self.producer_namespace_prefix} Share {self.share_name}"
+        )
+
+
+@dataclass
+class OutboundDatashare:
+    share_name: str
+    producer_namespace: str
+    source_database: str
+
+    def get_key(self) -> str:
+        return f"{self.producer_namespace}.{self.share_name}"
+
+
+@dataclass
+class InboundDatashare:
+    share_name: str
+    producer_namespace: str
+    consumer_database: str
+
+    def get_key(self) -> str:
+        return f"{self.producer_namespace}.{self.share_name}"
+
+    def get_description(self) -> str:
+        return f"Namespace {self.producer_namespace} Share {self.share_name}"
+
+
+@dataclass
+class RedshiftDatabase:
+    name: str
+    type: str
+    options: Optional[str] = None
+
+    def is_shared_database(self) -> bool:
+        return self.type == "shared"
+
+    # NOTE: ideally options are in form
+    # {"datashare_name":"xxx","datashare_producer_account":"1234","datashare_producer_namespace":"yyy"}
+    # however due to varchar(128) type of database table that captures options
+    # we may receive only partial information about inbound share
+    def get_inbound_share(
+        self,
+    ) -> Optional[Union[InboundDatashare, PartialInboundDatashare]]:
+        if not self.is_shared_database() or not self.options:
+            return None
+
+        # Convert into single regex ??
+        share_name_match = re.search(r'"datashare_name"\s*:\s*"([^"]*)"', self.options)
+        namespace_match = re.search(
+            r'"datashare_producer_namespace"\s*:\s*"([^"]*)"', self.options
+        )
+        partial_namespace_match = re.search(
+            r'"datashare_producer_namespace"\s*:\s*"([^"]*)$', self.options
+        )
+
+        if not share_name_match:
+            # We will always at least get share name
+            return None
+
+        share_name = share_name_match.group(1)
+        if namespace_match:
+            return InboundDatashare(
+                share_name=share_name,
+                producer_namespace=namespace_match.group(1),
+                consumer_database=self.name,
+            )
+        elif partial_namespace_match:
+            return PartialInboundDatashare(
+                share_name=share_name,
+                producer_namespace_prefix=partial_namespace_match.group(1),
+                consumer_database=self.name,
+            )
+        else:
+            return PartialInboundDatashare(
+                share_name=share_name,
+                producer_namespace_prefix="",
+                consumer_database=self.name,
+            )
+
 
 @dataclass
 class RedshiftExtraTableMeta:
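To make the new parsing helpers concrete, here is a small illustration (not part of the release) that feeds made-up option strings through them; it assumes acryl-datahub 1.0.0 is installed so the classes added in this hunk are importable.

```python
# Illustration only: exercises the helpers added above with fabricated options.
from datahub.ingestion.source.redshift.redshift_schema import (
    InboundDatashare,
    PartialInboundDatashare,
    RedshiftDatabase,
    RedshiftSchema,
)

# External schema: the upstream schema name is pulled out of the option JSON.
schema = RedshiftSchema(
    database="dev",
    name="spectrum_schema",
    type="external",
    option='{"SCHEMA":"tickit"}',
)
assert schema.get_upstream_schema_name() == "tickit"

# Complete options: both the share name and producer namespace are recoverable.
full = RedshiftDatabase(
    name="consumer_db",
    type="shared",
    options='{"datashare_name":"sales_share","datashare_producer_account":"1234",'
    '"datashare_producer_namespace":"a1b2c3d4"}',
)
assert isinstance(full.get_inbound_share(), InboundDatashare)

# Options truncated by the varchar(128) column: only a namespace prefix survives.
truncated = RedshiftDatabase(
    name="consumer_db",
    type="shared",
    options='{"datashare_name":"sales_share","datashare_producer_namespace":"a1b2',
)
assert isinstance(truncated.get_inbound_share(), PartialInboundDatashare)
```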
@@ -141,13 +259,31 @@ class RedshiftDataDictionary:
 
         return [db[0] for db in dbs]
 
+    @staticmethod
+    def get_database_details(
+        conn: redshift_connector.Connection, database: str
+    ) -> Optional[RedshiftDatabase]:
+        cursor = RedshiftDataDictionary.get_query_result(
+            conn,
+            RedshiftCommonQuery.get_database_details(database),
+        )
+
+        row = cursor.fetchone()
+        if row is None:
+            return None
+        return RedshiftDatabase(
+            name=database,
+            type=row[1],
+            options=row[2],
+        )
+
     @staticmethod
     def get_schemas(
         conn: redshift_connector.Connection, database: str
     ) -> List[RedshiftSchema]:
         cursor = RedshiftDataDictionary.get_query_result(
             conn,
-            RedshiftCommonQuery.list_schemas.format(database_name=database),
+            RedshiftCommonQuery.list_schemas(database),
         )
 
         schemas = cursor.fetchall()
@@ -158,8 +294,8 @@ class RedshiftDataDictionary:
                 database=database,
                 name=schema[field_names.index("schema_name")],
                 type=schema[field_names.index("schema_type")],
-                owner=schema[field_names.index("schema_owner_name")],
                 option=schema[field_names.index("schema_option")],
+                external_platform=schema[field_names.index("external_platform")],
                 external_database=schema[field_names.index("external_database")],
             )
             for schema in schemas
@@ -202,7 +338,9 @@ class RedshiftDataDictionary:
     def get_tables_and_views(
         self,
         conn: redshift_connector.Connection,
+        database: str,
         skip_external_tables: bool = False,
+        is_shared_database: bool = False,
     ) -> Tuple[Dict[str, List[RedshiftTable]], Dict[str, List[RedshiftView]]]:
         tables: Dict[str, List[RedshiftTable]] = {}
         views: Dict[str, List[RedshiftView]] = {}
@@ -213,7 +351,11 @@ class RedshiftDataDictionary:
 
         cur = RedshiftDataDictionary.get_query_result(
             conn,
-            RedshiftCommonQuery.list_tables(skip_external_tables=skip_external_tables),
+            RedshiftCommonQuery.list_tables(
+                database=database,
+                skip_external_tables=skip_external_tables,
+                is_shared_database=is_shared_database,
+            ),
         )
         field_names = [i[0] for i in cur.description]
         db_tables = cur.fetchall()
@@ -358,11 +500,18 @@ class RedshiftDataDictionary:
 
     @staticmethod
     def get_columns_for_schema(
-        conn: redshift_connector.Connection, schema: RedshiftSchema
+        conn: redshift_connector.Connection,
+        database: str,
+        schema: RedshiftSchema,
+        is_shared_database: bool = False,
     ) -> Dict[str, List[RedshiftColumn]]:
         cursor = RedshiftDataDictionary.get_query_result(
             conn,
-            RedshiftCommonQuery.list_columns.format(schema_name=schema.name),
+            RedshiftCommonQuery.list_columns(
+                database_name=database,
+                schema_name=schema.name,
+                is_shared_database=is_shared_database,
+            ),
         )
 
         table_columns: Dict[str, List[RedshiftColumn]] = {}
@@ -508,3 +657,34 @@ class RedshiftDataDictionary:
                     start_time=row[field_names.index("start_time")],
                 )
             rows = cursor.fetchmany()
+
+    @staticmethod
+    def get_outbound_datashares(
+        conn: redshift_connector.Connection,
+    ) -> Iterable[OutboundDatashare]:
+        cursor = conn.cursor()
+        cursor.execute(RedshiftCommonQuery.list_outbound_datashares())
+        for item in cursor.fetchall():
+            yield OutboundDatashare(
+                share_name=item[1],
+                producer_namespace=item[2],
+                source_database=item[3],
+            )
+
+    # NOTE: this is not used right now as it requires superuser privilege
+    # We can use this in future if the permissions are lowered.
+    @staticmethod
+    def get_inbound_datashare(
+        conn: redshift_connector.Connection,
+        database: str,
+    ) -> Optional[InboundDatashare]:
+        cursor = conn.cursor()
+        cursor.execute(RedshiftCommonQuery.get_inbound_datashare(database))
+        item = cursor.fetchone()
+        if item:
+            return InboundDatashare(
+                share_name=item[1],
+                producer_namespace=item[2],
+                consumer_database=item[3],
+            )
+        return None
--- a/datahub/ingestion/source/redshift/report.py
+++ b/datahub/ingestion/source/redshift/report.py
@@ -60,5 +60,8 @@ class RedshiftReport(
     sql_aggregator: Optional[SqlAggregatorReport] = None
     lineage_phases_timer: Dict[str, PerfTimer] = field(default_factory=dict)
 
+    is_shared_database: bool = False
+    outbound_shares_count: Optional[int] = None
+
     def report_dropped(self, key: str) -> None:
         self.filtered.append(key)
--- a/datahub/ingestion/source/s3/config.py
+++ b/datahub/ingestion/source/s3/config.py
@@ -5,7 +5,9 @@ import pydantic
 from pydantic.fields import Field
 
 from datahub.configuration.common import AllowDenyPattern
-from datahub.configuration.source_common import DatasetSourceConfigMixin
+from datahub.configuration.source_common import (
+    DatasetSourceConfigMixin,
+)
 from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
 from datahub.configuration.validate_field_rename import pydantic_renamed_field
 from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
@@ -152,10 +154,8 @@ class DataLakeSourceConfig(
         return path_specs
 
     @pydantic.validator("platform", always=True)
-    def platform_valid(cls, platform: str, values: dict) -> str:
-        inferred_platform = values.get(
-            "platform", None
-        )  # we may have inferred it above
+    def platform_valid(cls, platform: Any, values: dict) -> str:
+        inferred_platform = values.get("platform")  # we may have inferred it above
         platform = platform or inferred_platform
         if not platform:
             raise ValueError("platform must not be empty")
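The validator change above loosens the annotation to `Any` and drops the redundant `None` default in `values.get`. For readers unfamiliar with the pattern, here is a hedged, standalone sketch (not the project's code) of how pydantic v1-style validators fall back to a previously validated field.

```python
# Hedged sketch: `values` holds fields validated earlier in declaration order,
# so a validator can infer a value from another field, as platform_valid does.
import pydantic


class ExampleConfig(pydantic.BaseModel):
    path: str
    platform: str = ""

    @pydantic.validator("platform", always=True)
    def infer_platform(cls, v, values):
        # Prefer an explicit value, otherwise infer from the path scheme.
        inferred = "s3" if str(values.get("path", "")).startswith("s3://") else "file"
        return v or inferred


print(ExampleConfig(path="s3://bucket/prefix").platform)  # -> s3
```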
--- a/datahub/ingestion/source/s3/source.py
+++ b/datahub/ingestion/source/s3/source.py
@@ -834,7 +834,7 @@ class S3Source(StatefulIngestionSourceBase):
                 min=min,
             )
             folders.extend(folders_list)
-            if not path_spec.traversal_method == FolderTraversalMethod.ALL:
+            if path_spec.traversal_method != FolderTraversalMethod.ALL:
                 return folders
         if folders:
             return folders
@@ -847,7 +847,7 @@ class S3Source(StatefulIngestionSourceBase):
         path_spec: PathSpec,
         bucket: "Bucket",
         prefix: str,
-    ) -> List[Folder]:
+    ) -> Iterable[Folder]:
         """
         Retrieves all the folders in a path by listing all the files in the prefix.
         If the prefix is a full path then only that folder will be extracted.
@@ -877,51 +877,30 @@ class S3Source(StatefulIngestionSourceBase):
         s3_objects = (
             obj
             for obj in bucket.objects.filter(Prefix=prefix).page_size(PAGE_SIZE)
-            if _is_allowed_path(path_spec, f"s3://{obj.bucket_name}/{obj.key}")
+            if _is_allowed_path(
+                path_spec, self.create_s3_path(obj.bucket_name, obj.key)
+            )
         )
-
-        partitions: List[Folder] = []
         grouped_s3_objects_by_dirname = groupby_unsorted(
             s3_objects,
             key=lambda obj: obj.key.rsplit("/", 1)[0],
         )
-        for key, group in grouped_s3_objects_by_dirname:
-            file_size = 0
-            creation_time = None
-            modification_time = None
-
-            for item in group:
-                file_size += item.size
-                if creation_time is None or item.last_modified < creation_time:
-                    creation_time = item.last_modified
-                if modification_time is None or item.last_modified > modification_time:
-                    modification_time = item.last_modified
-                    max_file = item
-
-            if modification_time is None:
-                logger.warning(
-                    f"Unable to find any files in the folder {key}. Skipping..."
-                )
-                continue
-
-            id = path_spec.get_partition_from_path(
-                self.create_s3_path(max_file.bucket_name, max_file.key)
+        for _, group in grouped_s3_objects_by_dirname:
+            max_file = max(group, key=lambda x: x.last_modified)
+            max_file_s3_path = self.create_s3_path(max_file.bucket_name, max_file.key)
+
+            # If partition_id is None, it means the folder is not a partition
+            partition_id = path_spec.get_partition_from_path(max_file_s3_path)
+
+            yield Folder(
+                partition_id=partition_id,
+                is_partition=bool(partition_id),
+                creation_time=min(obj.last_modified for obj in group),
+                modification_time=max_file.last_modified,
+                sample_file=max_file_s3_path,
+                size=sum(obj.size for obj in group),
             )
 
-            # If id is None, it means the folder is not a partition
-            partitions.append(
-                Folder(
-                    partition_id=id,
-                    is_partition=bool(id),
-                    creation_time=creation_time if creation_time else None,  # type: ignore[arg-type]
-                    modification_time=modification_time,
-                    sample_file=self.create_s3_path(max_file.bucket_name, max_file.key),
-                    size=file_size,
-                )
-            )
-
-        return partitions
-
     def s3_browser(self, path_spec: PathSpec, sample_size: int) -> Iterable[BrowsePath]:
         if self.source_config.aws_config is None:
             raise ValueError("aws_config not set. Cannot browse s3")
@@ -1000,7 +979,7 @@ class S3Source(StatefulIngestionSourceBase):
                 min=True,
             )
             dirs_to_process.append(dirs_to_process_min[0])
-        folders = []
+        folders: List[Folder] = []
         for dir in dirs_to_process:
             logger.info(f"Getting files from folder: {dir}")
             prefix_to_process = urlparse(dir).path.lstrip("/")