acryl-datahub 1.1.0.5rc5__py3-none-any.whl → 1.1.0.5rc7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of acryl-datahub might be problematic.

Files changed (43)
  1. {acryl_datahub-1.1.0.5rc5.dist-info → acryl_datahub-1.1.0.5rc7.dist-info}/METADATA +2603 -2603
  2. {acryl_datahub-1.1.0.5rc5.dist-info → acryl_datahub-1.1.0.5rc7.dist-info}/RECORD +43 -39
  3. datahub/_version.py +1 -1
  4. datahub/ingestion/api/report.py +183 -35
  5. datahub/ingestion/autogenerated/capability_summary.json +3366 -0
  6. datahub/ingestion/autogenerated/lineage.json +401 -0
  7. datahub/ingestion/autogenerated/lineage_helper.py +30 -128
  8. datahub/ingestion/run/pipeline.py +4 -1
  9. datahub/ingestion/source/bigquery_v2/bigquery.py +23 -22
  10. datahub/ingestion/source/bigquery_v2/queries.py +2 -2
  11. datahub/ingestion/source/cassandra/cassandra_profiling.py +6 -5
  12. datahub/ingestion/source/common/subtypes.py +1 -1
  13. datahub/ingestion/source/data_lake_common/object_store.py +40 -0
  14. datahub/ingestion/source/data_lake_common/path_spec.py +10 -21
  15. datahub/ingestion/source/dremio/dremio_source.py +6 -3
  16. datahub/ingestion/source/gcs/gcs_source.py +4 -1
  17. datahub/ingestion/source/ge_data_profiler.py +28 -20
  18. datahub/ingestion/source/kafka_connect/source_connectors.py +59 -4
  19. datahub/ingestion/source/mock_data/datahub_mock_data.py +45 -0
  20. datahub/ingestion/source/redshift/usage.py +4 -3
  21. datahub/ingestion/source/s3/report.py +4 -2
  22. datahub/ingestion/source/s3/source.py +367 -115
  23. datahub/ingestion/source/snowflake/snowflake_queries.py +47 -3
  24. datahub/ingestion/source/snowflake/snowflake_usage_v2.py +8 -2
  25. datahub/ingestion/source/snowflake/stored_proc_lineage.py +143 -0
  26. datahub/ingestion/source/sql/athena.py +95 -10
  27. datahub/ingestion/source/sql/athena_properties_extractor.py +777 -0
  28. datahub/ingestion/source/unity/proxy.py +4 -3
  29. datahub/ingestion/source/unity/source.py +10 -8
  30. datahub/integrations/assertion/snowflake/compiler.py +4 -3
  31. datahub/metadata/_internal_schema_classes.py +85 -4
  32. datahub/metadata/com/linkedin/pegasus2avro/settings/global/__init__.py +2 -0
  33. datahub/metadata/schema.avsc +54 -1
  34. datahub/metadata/schemas/CorpUserSettings.avsc +17 -1
  35. datahub/metadata/schemas/GlobalSettingsInfo.avsc +37 -0
  36. datahub/sdk/lineage_client.py +2 -0
  37. datahub/sql_parsing/sql_parsing_aggregator.py +3 -3
  38. datahub/sql_parsing/sqlglot_lineage.py +2 -0
  39. datahub/utilities/sqlalchemy_query_combiner.py +5 -2
  40. {acryl_datahub-1.1.0.5rc5.dist-info → acryl_datahub-1.1.0.5rc7.dist-info}/WHEEL +0 -0
  41. {acryl_datahub-1.1.0.5rc5.dist-info → acryl_datahub-1.1.0.5rc7.dist-info}/entry_points.txt +0 -0
  42. {acryl_datahub-1.1.0.5rc5.dist-info → acryl_datahub-1.1.0.5rc7.dist-info}/licenses/LICENSE +0 -0
  43. {acryl_datahub-1.1.0.5rc5.dist-info → acryl_datahub-1.1.0.5rc7.dist-info}/top_level.txt +0 -0
@@ -70,11 +70,12 @@ class CassandraProfiler:
      ) -> Iterable[MetadataWorkUnit]:
          for keyspace_name in cassandra_data.keyspaces:
              tables = cassandra_data.tables.get(keyspace_name, [])
-             with self.report.new_stage(
-                 f"{keyspace_name}: {PROFILING}"
-             ), ThreadPoolExecutor(
-                 max_workers=self.config.profiling.max_workers
-             ) as executor:
+             with (
+                 self.report.new_stage(f"{keyspace_name}: {PROFILING}"),
+                 ThreadPoolExecutor(
+                     max_workers=self.config.profiling.max_workers
+                 ) as executor,
+             ):
                  future_to_dataset = {
                      executor.submit(
                          self.generate_profile,
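
Several hunks in this release (CassandraProfiler above, plus DremioSource, DatahubGEProfiler, and RedshiftUsageExtractor below) rewrite multi-manager with statements into the parenthesized form. A minimal standalone sketch of that form, using nullcontext as a stand-in for report.new_stage() (everything here is illustrative, not package code; the syntax is documented for Python 3.10+):

    from concurrent.futures import ThreadPoolExecutor
    from contextlib import nullcontext

    with (
        nullcontext() as stage,  # stand-in for self.report.new_stage(...)
        ThreadPoolExecutor(max_workers=4) as executor,
    ):
        print(executor.submit(sum, [1, 2, 3]).result())  # prints 6
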
@@ -143,7 +143,7 @@ def create_source_capability_modifier_enum():
      for enum_class in source_enums:
          for member in enum_class:  # type: ignore[var-annotated]
              if member.name in all_values:
-                 logger.error(
+                 logger.debug(
                      f"Warning: {member.name} already exists with value {all_values[member.name]}, skipping {member.value}"
                  )
                  continue
@@ -519,6 +519,13 @@ class ObjectStoreSourceAdapter:
519
519
  "get_external_url",
520
520
  lambda table_data: self.get_gcs_external_url(table_data),
521
521
  )
522
+ # Fix URI mismatch issue in pattern matching
523
+ self.register_customization(
524
+ "_normalize_uri_for_pattern_matching",
525
+ self._normalize_gcs_uri_for_pattern_matching,
526
+ )
527
+ # Fix URI handling in schema extraction - override strip_s3_prefix for GCS
528
+ self.register_customization("strip_s3_prefix", self._strip_gcs_prefix)
522
529
  elif platform == "s3":
523
530
  self.register_customization("is_s3_platform", lambda: True)
524
531
  self.register_customization("create_s3_path", self.create_s3_path)
@@ -612,6 +619,39 @@ class ObjectStoreSourceAdapter:
              return self.get_abs_external_url(table_data)
          return None

+     def _normalize_gcs_uri_for_pattern_matching(self, uri: str) -> str:
+         """
+         Normalize GCS URI for pattern matching.
+
+         This method converts gs:// URIs to s3:// URIs for pattern matching purposes,
+         fixing the URI mismatch issue in GCS ingestion.
+
+         Args:
+             uri: The URI to normalize
+
+         Returns:
+             The normalized URI for pattern matching
+         """
+         if uri.startswith("gs://"):
+             return uri.replace("gs://", "s3://", 1)
+         return uri
+
+     def _strip_gcs_prefix(self, uri: str) -> str:
+         """
+         Strip GCS prefix from URI.
+
+         This method removes the gs:// prefix from GCS URIs for path processing.
+
+         Args:
+             uri: The URI to strip the prefix from
+
+         Returns:
+             The URI without the gs:// prefix
+         """
+         if uri.startswith("gs://"):
+             return uri[5:]  # Remove "gs://" prefix
+         return uri
+

  # Factory function to create an adapter for a specific platform
  def create_object_store_adapter(
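
For illustration only, a standalone check of the behavior documented in the two new helpers above (the logic is copied from the hunk; the bucket path is made up):

    def _normalize_gcs_uri_for_pattern_matching(uri: str) -> str:
        # Same logic as the new adapter method: rewrite gs:// to s3:// so the
        # existing S3-oriented path-spec matching also applies to GCS URIs.
        return uri.replace("gs://", "s3://", 1) if uri.startswith("gs://") else uri

    def _strip_gcs_prefix(uri: str) -> str:
        # Same logic as the new adapter method: drop the gs:// scheme for path handling.
        return uri[len("gs://"):] if uri.startswith("gs://") else uri

    print(_normalize_gcs_uri_for_pattern_matching("gs://bucket/data/file.parquet"))  # s3://bucket/data/file.parquet
    print(_strip_gcs_prefix("gs://bucket/data/file.parquet"))                        # bucket/data/file.parquet
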
@@ -166,7 +166,6 @@ class PathSpec(ConfigModel):
          return False

      def allowed(self, path: str, ignore_ext: bool = False) -> bool:
-         logger.debug(f"Checking file to inclusion: {path}")
          if self.is_path_hidden(path) and not self.include_hidden_folders:
              return False

@@ -174,19 +173,17 @@ class PathSpec(ConfigModel):
              self.glob_include, flags=pathlib.GLOBSTAR
          ):
              return False
-         logger.debug(f"{path} matched include ")
+
          if self.exclude:
              for exclude_path in self.exclude:
                  if pathlib.PurePath(path).globmatch(
                      exclude_path, flags=pathlib.GLOBSTAR
                  ):
                      return False
-             logger.debug(f"{path} is not excluded")

          table_name, _ = self.extract_table_name_and_path(path)
          if not self.tables_filter_pattern.allowed(table_name):
              return False
-         logger.debug(f"{path} is passed table name check")

          ext = os.path.splitext(path)[1].strip(".")

@@ -196,8 +193,6 @@ class PathSpec(ConfigModel):
          ):
              return False

-         logger.debug(f"{path} had selected extension {ext}")
-         logger.debug(f"{path} allowed for dataset creation")
          return True

      def dir_allowed(self, path: str) -> bool:
@@ -219,10 +214,8 @@ class PathSpec(ConfigModel):
          for _ in range(slash_to_remove_from_glob):
              glob_include = glob_include.rsplit("/", 1)[0]

-         logger.debug(f"Checking dir to inclusion: {path}")
          if not pathlib.PurePath(path).globmatch(glob_include, flags=pathlib.GLOBSTAR):
              return False
-         logger.debug(f"{path} matched include ")
          if self.exclude:
              for exclude_path in self.exclude:
                  if pathlib.PurePath(path.rstrip("/")).globmatch(
@@ -236,7 +229,7 @@ class PathSpec(ConfigModel):
          )
          if not self.tables_filter_pattern.allowed(table_name):
              return False
-         logger.debug(f"{path} is passed table name check")
+         # logger.debug(f"{path} is passed table name check")

          return True

@@ -246,10 +239,10 @@ class PathSpec(ConfigModel):
          if parsable_include.endswith("/{table}/**"):
              # Remove the last two characters to make it parsable if it ends with {table}/** which marks autodetect partition
              parsable_include = parsable_include[:-2]
-         else:
-             # Replace all * with {folder[i]} to make it parsable
-             for i in range(parsable_include.count("*")):
-                 parsable_include = parsable_include.replace("*", f"{{folder[{i}]}}", 1)
+
+         # Replace all * with {folder[i]} to make it parsable
+         for i in range(parsable_include.count("*")):
+             parsable_include = parsable_include.replace("*", f"{{folder[{i}]}}", 1)
          return parsable_include

      def get_named_vars(self, path: str) -> Union[None, parse.Result, parse.Match]:
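
Worked example of the rewritten logic above (the include pattern is hypothetical): with the else branch removed, the * substitution now also runs for includes that end in /{table}/**.

    include = "s3://bucket/*/logs/{table}/**"        # hypothetical path spec include
    parsable = include[:-2] if include.endswith("/{table}/**") else include
    for i in range(parsable.count("*")):
        parsable = parsable.replace("*", f"{{folder[{i}]}}", 1)
    print(parsable)  # s3://bucket/{folder[0]}/logs/{table}/
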
@@ -330,8 +323,6 @@ class PathSpec(ConfigModel):
          if "{table}" in values["include"]:
              v = "{table}"
          else:
-             logger.debug(f"include fields: {compiled_include.named_fields}")
-             logger.debug(f"table_name fields: {parse.compile(v).named_fields}")
              if not all(
                  x in compiled_include.named_fields
                  for x in parse.compile(v).named_fields
@@ -356,9 +347,7 @@ class PathSpec(ConfigModel):
      @cached_property
      def compiled_include(self):
          parsable_include = PathSpec.get_parsable_include(self.include)
-         logger.debug(f"parsable_include: {parsable_include}")
          compiled_include = parse.compile(parsable_include)
-         logger.debug(f"Setting compiled_include: {compiled_include}")
          return compiled_include

      @cached_property
@@ -366,9 +355,8 @@ class PathSpec(ConfigModel):
          parsable_folder_include = PathSpec.get_parsable_include(self.include).rsplit(
              "/", 1
          )[0]
-         logger.debug(f"parsable_folder_include: {parsable_folder_include}")
          compiled_folder_include = parse.compile(parsable_folder_include)
-         logger.debug(f"Setting compiled_folder_include: {compiled_folder_include}")
+
          return compiled_folder_include

      @cached_property
@@ -376,7 +364,8 @@ class PathSpec(ConfigModel):
          # Regular expression to find all substrings enclosed in {}
          pattern = r"\{(.*?)\}"
          # Find all matches
-         matches = re.findall(pattern, self.include.split("{table}/")[1])
+         split_parts = self.include.split("{table}/")
+         matches = re.findall(pattern, split_parts[1]) if len(split_parts) > 1 else []
          return matches

      def get_partition_from_path(self, path: str) -> Optional[List[Tuple[str, str]]]:
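
The new guard avoids an IndexError when the include has no {table} component; a small illustration with a made-up include:

    import re

    include = "s3://bucket/data/*.parquet"           # no "{table}/" in the include
    split_parts = include.split("{table}/")
    matches = re.findall(r"\{(.*?)\}", split_parts[1]) if len(split_parts) > 1 else []
    print(matches)  # [] instead of an IndexError on split_parts[1]
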
@@ -563,7 +552,7 @@ class PathSpec(ConfigModel):
                          f"{{{template_key}}}", var[key]
                      )
              else:
-                 partition_format.replace(f"{{{var_key}}}", var)
+                 partition_format = partition_format.replace(f"{{{var_key}}}", var)
          return datetime.datetime.strptime(partition_format, datetime_format).replace(
              tzinfo=datetime.timezone.utc
          )
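
This one-assignment fix matters because str.replace returns a new string rather than mutating in place; before the change the substitution result was discarded. Minimal illustration (made-up format string):

    partition_format = "year={year}/month={month}"
    partition_format.replace("{year}", "2024")                      # old code: result thrown away
    partition_format = partition_format.replace("{year}", "2024")   # fixed: keep the result
    print(partition_format)  # year=2024/month={month}
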
@@ -261,9 +261,12 @@ class DremioSource(StatefulIngestionSourceBase):

          # Profiling
          if self.config.is_profiling_enabled():
-             with self.report.new_stage(PROFILING), ThreadPoolExecutor(
-                 max_workers=self.config.profiling.max_workers
-             ) as executor:
+             with (
+                 self.report.new_stage(PROFILING),
+                 ThreadPoolExecutor(
+                     max_workers=self.config.profiling.max_workers
+                 ) as executor,
+             ):
                  future_to_dataset = {
                      executor.submit(self.generate_profiles, dataset): dataset
                      for dataset in datasets
@@ -112,6 +112,7 @@ class GCSSource(StatefulIngestionSourceBase):
              env=self.config.env,
              max_rows=self.config.max_rows,
              number_of_files_to_sample=self.config.number_of_files_to_sample,
+             platform=PLATFORM_GCS,  # Ensure GCS platform is used for correct container subtypes
          )
          return s3_config

@@ -138,7 +139,9 @@ class GCSSource(StatefulIngestionSourceBase):

      def create_equivalent_s3_source(self, ctx: PipelineContext) -> S3Source:
          config = self.create_equivalent_s3_config()
-         s3_source = S3Source(config, PipelineContext(ctx.run_id))
+         # Create a new context for S3 source without graph to avoid duplicate checkpointer registration
+         s3_ctx = PipelineContext(run_id=ctx.run_id, pipeline_name=ctx.pipeline_name)
+         s3_source = S3Source(config, s3_ctx)
          return self.s3_source_overrides(s3_source)

      def s3_source_overrides(self, source: S3Source) -> S3Source:
@@ -1213,26 +1213,34 @@ class DatahubGEProfiler:
              f"Will profile {len(requests)} table(s) with {max_workers} worker(s) - this may take a while"
          )

-         with PerfTimer() as timer, unittest.mock.patch(
-             "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_unique_count",
-             get_column_unique_count_dh_patch,
-         ), unittest.mock.patch(
-             "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_bigquery",
-             _get_column_quantiles_bigquery_patch,
-         ), unittest.mock.patch(
-             "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_awsathena",
-             _get_column_quantiles_awsathena_patch,
-         ), unittest.mock.patch(
-             "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_median",
-             _get_column_median_patch,
-         ), concurrent.futures.ThreadPoolExecutor(
-             max_workers=max_workers
-         ) as async_executor, SQLAlchemyQueryCombiner(
-             enabled=self.config.query_combiner_enabled,
-             catch_exceptions=self.config.catch_exceptions,
-             is_single_row_query_method=_is_single_row_query_method,
-             serial_execution_fallback_enabled=True,
-         ).activate() as query_combiner:
+         with (
+             PerfTimer() as timer,
+             unittest.mock.patch(
+                 "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_unique_count",
+                 get_column_unique_count_dh_patch,
+             ),
+             unittest.mock.patch(
+                 "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_bigquery",
+                 _get_column_quantiles_bigquery_patch,
+             ),
+             unittest.mock.patch(
+                 "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset._get_column_quantiles_awsathena",
+                 _get_column_quantiles_awsathena_patch,
+             ),
+             unittest.mock.patch(
+                 "great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyDataset.get_column_median",
+                 _get_column_median_patch,
+             ),
+             concurrent.futures.ThreadPoolExecutor(
+                 max_workers=max_workers
+             ) as async_executor,
+             SQLAlchemyQueryCombiner(
+                 enabled=self.config.query_combiner_enabled,
+                 catch_exceptions=self.config.catch_exceptions,
+                 is_single_row_query_method=_is_single_row_query_method,
+                 serial_execution_fallback_enabled=True,
+             ).activate() as query_combiner,
+         ):
              # Submit the profiling requests to the thread pool executor.
              async_profiles = collections.deque(
                  async_executor.submit(
@@ -20,6 +20,8 @@ from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import (
      get_platform_from_sqlalchemy_uri,
  )

+ logger = logging.getLogger(__name__)
+

  @dataclass
  class ConfluentJDBCSourceConnector(BaseConnector):
@@ -392,7 +394,7 @@ class MongoSourceConnector(BaseConnector):
              db_connection_url=connector_manifest.config.get("connection.uri"),
              source_platform="mongodb",
              database_name=connector_manifest.config.get("database"),
-             topic_prefix=connector_manifest.config.get("topic_prefix"),
+             topic_prefix=connector_manifest.config.get("topic.prefix"),
              transforms=(
                  connector_manifest.config["transforms"].split(",")
                  if "transforms" in connector_manifest.config
@@ -406,7 +408,11 @@ class MongoSourceConnector(BaseConnector):
          lineages: List[KafkaConnectLineage] = list()
          parser = self.get_parser(self.connector_manifest)
          source_platform = parser.source_platform
-         topic_naming_pattern = r"mongodb\.(\w+)\.(\w+)"
+         topic_prefix = parser.topic_prefix or ""
+
+         # Escape topic_prefix to handle cases where it contains dots
+         # Some users configure topic.prefix like "my.mongodb" which breaks the regex
+         topic_naming_pattern = rf"{re.escape(topic_prefix)}\.(\w+)\.(\w+)"

          if not self.connector_manifest.topic_names:
              return lineages
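
A quick check (illustrative values, not from any real connector config) of why re.escape is needed when topic.prefix itself contains dots:

    import re

    topic_prefix = "my.mongodb"                       # hypothetical topic.prefix containing dots
    pattern = rf"{re.escape(topic_prefix)}\.(\w+)\.(\w+)"
    print(re.search(pattern, "my.mongodb.shop.orders").groups())  # ('shop', 'orders')
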
@@ -429,6 +435,26 @@ class MongoSourceConnector(BaseConnector):

  @dataclass
  class DebeziumSourceConnector(BaseConnector):
+     # Debezium topic naming patterns by connector type
+     # - MySQL: {topic.prefix}.{database}.{table}
+     # - PostgreSQL: {topic.prefix}.{schema}.{table}
+     # - SQL Server: {topic.prefix}.{database}.{schema}.{table}
+     # - Oracle: {topic.prefix}.{schema}.{table}
+     # - DB2: {topic.prefix}.{schema}.{table}
+     # - MongoDB: {topic.prefix}.{database}.{collection}
+     # - Vitess: {topic.prefix}.{keyspace}.{table}
+
+     # Note SQL Server allows for "database.names" (multiple databases) config,
+     # and so database is in the topic naming pattern.
+     # However, others have "database.dbname" which is a single database name. For these connectors,
+     # additional databases would require a different connector instance
+
+     # Connectors with 2-level container in pattern (database + schema)
+     # Others have either database XOR schema, but not both
+     DEBEZIUM_CONNECTORS_WITH_2_LEVEL_CONTAINER_IN_PATTERN = {
+         "io.debezium.connector.sqlserver.SqlServerConnector",
+     }
+
      @dataclass
      class DebeziumParser:
          source_platform: str
@@ -514,16 +540,45 @@ class DebeziumSourceConnector(BaseConnector):
          source_platform = parser.source_platform
          server_name = parser.server_name
          database_name = parser.database_name
-         topic_naming_pattern = rf"({server_name})\.(\w+\.\w+)"
+         # Escape server_name to handle cases where topic.prefix contains dots
+         # Some users configure topic.prefix like "my.server" which breaks the regex
+         server_name = server_name or ""
+         # Regex pattern (\w+\.\w+(?:\.\w+)?) supports BOTH 2-part and 3-part table names
+         topic_naming_pattern = rf"({re.escape(server_name)})\.(\w+\.\w+(?:\.\w+)?)"

          if not self.connector_manifest.topic_names:
              return lineages

+         # Handle connectors with 2-level container (database + schema) in topic pattern
+         connector_class = self.connector_manifest.config.get(CONNECTOR_CLASS, "")
+         maybe_duplicated_database_name = (
+             connector_class
+             in self.DEBEZIUM_CONNECTORS_WITH_2_LEVEL_CONTAINER_IN_PATTERN
+         )
+
          for topic in self.connector_manifest.topic_names:
              found = re.search(re.compile(topic_naming_pattern), topic)
+             logger.debug(
+                 f"Processing topic: '{topic}' with regex pattern '{topic_naming_pattern}', found: {found}"
+             )

              if found:
-                 table_name = get_dataset_name(database_name, found.group(2))
+                 # Extract the table part after server_name
+                 table_part = found.group(2)
+
+                 if (
+                     maybe_duplicated_database_name
+                     and database_name
+                     and table_part.startswith(f"{database_name}.")
+                 ):
+                     table_part = table_part[len(database_name) + 1 :]
+
+                 logger.debug(
+                     f"Extracted table part: '{table_part}' from topic '{topic}'"
+                 )
+                 # Apply database name to create final dataset name
+                 table_name = get_dataset_name(database_name, table_part)
+                 logger.debug(f"Final table name: '{table_name}'")

                  lineage = KafkaConnectLineage(
                      source_dataset=table_name,
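
Illustrative check (made-up server, database, and topic names) of the widened Debezium pattern, which now accepts both two-part (schema.table) and three-part (database.schema.table) suffixes after an escaped, possibly dotted prefix, plus the SQL Server-style database dedup:

    import re

    server_name = "my.server"                          # hypothetical topic.prefix
    database_name = "sales"                            # hypothetical database name
    pattern = rf"({re.escape(server_name)})\.(\w+\.\w+(?:\.\w+)?)"

    table_part = re.search(pattern, "my.server.sales.dbo.orders").group(2)  # sales.dbo.orders
    # Drop the duplicated database prefix before building the dataset name
    if table_part.startswith(f"{database_name}."):
        table_part = table_part[len(database_name) + 1:]
    print(table_part)  # dbo.orders
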
@@ -21,9 +21,13 @@ from datahub.ingestion.source.mock_data.datahub_mock_data_report import (
  )
  from datahub.ingestion.source.mock_data.table_naming_helper import TableNamingHelper
  from datahub.metadata.schema_classes import (
+     CalendarIntervalClass,
      DatasetLineageTypeClass,
+     DatasetProfileClass,
+     DatasetUsageStatisticsClass,
      StatusClass,
      SubTypesClass,
+     TimeWindowSizeClass,
      UpstreamClass,
      UpstreamLineageClass,
  )
@@ -278,6 +282,10 @@ class DataHubMockDataSource(Source):

                  yield self._get_subtypes_aspect(table_name, i, j)

+                 yield self._get_profile_aspect(table_name)
+
+                 yield self._get_usage_aspect(table_name)
+
                  yield from self._generate_lineage_for_table(
                      table_name=table_name,
                      table_level=i,
@@ -381,5 +389,42 @@ class DataHubMockDataSource(Source):
          )
          return mcp.as_workunit()

+     def _get_profile_aspect(self, table: str) -> MetadataWorkUnit:
+         urn = make_dataset_urn(
+             platform="fake",
+             name=table,
+         )
+         mcp = MetadataChangeProposalWrapper(
+             entityUrn=urn,
+             entityType="dataset",
+             aspect=DatasetProfileClass(
+                 timestampMillis=0,
+                 rowCount=100,
+                 columnCount=10,
+                 sizeInBytes=1000,
+             ),
+         )
+         return mcp.as_workunit()
+
+     def _get_usage_aspect(self, table: str) -> MetadataWorkUnit:
+         urn = make_dataset_urn(
+             platform="fake",
+             name=table,
+         )
+         mcp = MetadataChangeProposalWrapper(
+             entityUrn=urn,
+             entityType="dataset",
+             aspect=DatasetUsageStatisticsClass(
+                 timestampMillis=0,
+                 eventGranularity=TimeWindowSizeClass(unit=CalendarIntervalClass.DAY),
+                 uniqueUserCount=0,
+                 totalSqlQueries=0,
+                 topSqlQueries=[],
+                 userCounts=[],
+                 fieldCounts=[],
+             ),
+         )
+         return mcp.as_workunit()
+
      def get_report(self) -> SourceReport:
          return self.report
@@ -182,9 +182,10 @@ class RedshiftUsageExtractor:
          self.report.num_operational_stats_filtered = 0

          if self.config.include_operational_stats:
-             with self.report.new_stage(
-                 USAGE_EXTRACTION_OPERATIONAL_STATS
-             ), PerfTimer() as timer:
+             with (
+                 self.report.new_stage(USAGE_EXTRACTION_OPERATIONAL_STATS),
+                 PerfTimer() as timer,
+             ):
                  # Generate operation aspect workunits
                  yield from self._gen_operation_aspect_workunits(
                      self.connection, all_tables
@@ -1,19 +1,21 @@
  import dataclasses
  from dataclasses import field as dataclass_field
- from typing import List

  from datahub.ingestion.source.state.stale_entity_removal_handler import (
      StaleEntityRemovalSourceReport,
  )
+ from datahub.utilities.lossy_collections import LossyList


  @dataclasses.dataclass
  class DataLakeSourceReport(StaleEntityRemovalSourceReport):
      files_scanned = 0
-     filtered: List[str] = dataclass_field(default_factory=list)
+     filtered: LossyList[str] = dataclass_field(default_factory=LossyList)
+     number_of_files_filtered: int = 0

      def report_file_scanned(self) -> None:
          self.files_scanned += 1

      def report_file_dropped(self, file: str) -> None:
          self.filtered.append(file)
+         self.number_of_files_filtered += 1
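
The report now pairs the size-bounded LossyList sample with an explicit counter, since the sampled list alone no longer reflects the total number of filtered files. A usage sketch mirroring report_file_dropped (paths are made up; LossyList is the utility imported in the hunk above):

    from datahub.utilities.lossy_collections import LossyList

    filtered: LossyList[str] = LossyList()
    number_of_files_filtered = 0
    for path in ("s3://bucket/tmp/a.tmp", "s3://bucket/tmp/b.tmp"):
        filtered.append(path)          # keeps only a bounded sample for report output
        number_of_files_filtered += 1  # exact count tracked separately
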