acryl-datahub 1.1.0rc3__py3-none-any.whl → 1.1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of acryl-datahub might be problematic.

Files changed (87)
  1. {acryl_datahub-1.1.0rc3.dist-info → acryl_datahub-1.1.0.1.dist-info}/METADATA +2532 -2530
  2. {acryl_datahub-1.1.0rc3.dist-info → acryl_datahub-1.1.0.1.dist-info}/RECORD +87 -70
  3. {acryl_datahub-1.1.0rc3.dist-info → acryl_datahub-1.1.0.1.dist-info}/WHEEL +1 -1
  4. datahub/_version.py +1 -1
  5. datahub/api/entities/dataset/dataset.py +9 -8
  6. datahub/api/entities/external/__init__.py +0 -0
  7. datahub/api/entities/external/external_entities.py +239 -0
  8. datahub/api/entities/external/external_tag.py +145 -0
  9. datahub/api/entities/external/restricted_text.py +247 -0
  10. datahub/api/entities/external/unity_catalog_external_entites.py +170 -0
  11. datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
  12. datahub/cli/delete_cli.py +4 -4
  13. datahub/cli/ingest_cli.py +9 -1
  14. datahub/emitter/mce_builder.py +3 -1
  15. datahub/emitter/response_helper.py +86 -1
  16. datahub/emitter/rest_emitter.py +1 -1
  17. datahub/ingestion/graph/client.py +3 -3
  18. datahub/ingestion/source/apply/datahub_apply.py +4 -4
  19. datahub/ingestion/source/data_lake_common/data_lake_utils.py +22 -10
  20. datahub/ingestion/source/data_lake_common/object_store.py +644 -0
  21. datahub/ingestion/source/datahub/config.py +11 -0
  22. datahub/ingestion/source/datahub/datahub_database_reader.py +186 -33
  23. datahub/ingestion/source/datahub/datahub_source.py +1 -1
  24. datahub/ingestion/source/dbt/dbt_common.py +30 -11
  25. datahub/ingestion/source/gcs/gcs_source.py +22 -7
  26. datahub/ingestion/source/gcs/gcs_utils.py +36 -9
  27. datahub/ingestion/source/hex/query_fetcher.py +9 -3
  28. datahub/ingestion/source/openapi.py +12 -0
  29. datahub/ingestion/source/openapi_parser.py +56 -37
  30. datahub/ingestion/source/s3/source.py +65 -6
  31. datahub/ingestion/source/snowflake/snowflake_config.py +13 -0
  32. datahub/ingestion/source/snowflake/snowflake_queries.py +44 -21
  33. datahub/ingestion/source/snowflake/snowflake_query.py +0 -7
  34. datahub/ingestion/source/snowflake/snowflake_v2.py +17 -6
  35. datahub/ingestion/source/sql/athena.py +1 -0
  36. datahub/ingestion/source/sql/hive.py +2 -3
  37. datahub/ingestion/source/sql/sql_common.py +98 -34
  38. datahub/ingestion/source/sql/sql_types.py +5 -2
  39. datahub/ingestion/source/unity/config.py +5 -0
  40. datahub/ingestion/source/unity/proxy.py +117 -0
  41. datahub/ingestion/source/unity/source.py +167 -15
  42. datahub/ingestion/source/unity/tag_entities.py +295 -0
  43. datahub/metadata/_internal_schema_classes.py +667 -522
  44. datahub/metadata/_urns/urn_defs.py +1804 -1748
  45. datahub/metadata/com/linkedin/pegasus2avro/application/__init__.py +19 -0
  46. datahub/metadata/schema.avsc +17358 -17584
  47. datahub/metadata/schemas/ApplicationKey.avsc +31 -0
  48. datahub/metadata/schemas/ApplicationProperties.avsc +72 -0
  49. datahub/metadata/schemas/Applications.avsc +38 -0
  50. datahub/metadata/schemas/ChartKey.avsc +1 -0
  51. datahub/metadata/schemas/ContainerKey.avsc +1 -0
  52. datahub/metadata/schemas/DashboardKey.avsc +1 -0
  53. datahub/metadata/schemas/DataFlowKey.avsc +1 -0
  54. datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
  55. datahub/metadata/schemas/DataJobKey.avsc +1 -0
  56. datahub/metadata/schemas/DataProductKey.avsc +1 -0
  57. datahub/metadata/schemas/DataProductProperties.avsc +1 -1
  58. datahub/metadata/schemas/DatasetKey.avsc +1 -0
  59. datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
  60. datahub/metadata/schemas/GlossaryTermKey.avsc +1 -0
  61. datahub/metadata/schemas/MLFeatureKey.avsc +1 -0
  62. datahub/metadata/schemas/MLFeatureTableKey.avsc +1 -0
  63. datahub/metadata/schemas/MLModelGroupKey.avsc +1 -0
  64. datahub/metadata/schemas/MLModelKey.avsc +1 -0
  65. datahub/metadata/schemas/MLPrimaryKeyKey.avsc +1 -0
  66. datahub/metadata/schemas/NotebookKey.avsc +1 -0
  67. datahub/metadata/schemas/__init__.py +3 -3
  68. datahub/sdk/__init__.py +6 -0
  69. datahub/sdk/_all_entities.py +11 -0
  70. datahub/sdk/_shared.py +118 -1
  71. datahub/sdk/chart.py +315 -0
  72. datahub/sdk/container.py +7 -0
  73. datahub/sdk/dashboard.py +432 -0
  74. datahub/sdk/dataflow.py +309 -0
  75. datahub/sdk/datajob.py +342 -0
  76. datahub/sdk/dataset.py +8 -2
  77. datahub/sdk/entity_client.py +90 -2
  78. datahub/sdk/lineage_client.py +681 -82
  79. datahub/sdk/main_client.py +27 -8
  80. datahub/sdk/mlmodel.py +101 -38
  81. datahub/sdk/mlmodelgroup.py +7 -0
  82. datahub/sql_parsing/sql_parsing_aggregator.py +1 -1
  83. datahub/testing/mce_helpers.py +421 -0
  84. datahub/testing/sdk_v2_helpers.py +18 -0
  85. {acryl_datahub-1.1.0rc3.dist-info → acryl_datahub-1.1.0.1.dist-info}/entry_points.txt +0 -0
  86. {acryl_datahub-1.1.0rc3.dist-info → acryl_datahub-1.1.0.1.dist-info}/licenses/LICENSE +0 -0
  87. {acryl_datahub-1.1.0rc3.dist-info → acryl_datahub-1.1.0.1.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/datahub/datahub_database_reader.py
@@ -1,10 +1,10 @@
- import contextlib
  import json
  import logging
+ import time
  from datetime import datetime
  from typing import Any, Dict, Generic, Iterable, List, Optional, Tuple, TypeVar

- from sqlalchemy import create_engine
+ from sqlalchemy import create_engine, text

  from datahub.emitter.aspect import ASPECT_MAP
  from datahub.emitter.mcp import MetadataChangeProposalWrapper
@@ -19,6 +19,7 @@ logger = logging.getLogger(__name__)

  # Should work for at least mysql, mariadb, postgres
  DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S.%f"
+ DATE_FORMAT = "%Y-%m-%d"

  ROW = TypeVar("ROW", bound=Dict[str, Any])

@@ -85,6 +86,9 @@ class DataHubDatabaseReader:
  **connection_config.options,
  )

+ # Cache for available dates to avoid redundant queries
+ self.available_dates_cache: Optional[List[datetime]] = None
+
  @property
  def soft_deleted_urns_query(self) -> str:
  return f"""
@@ -100,14 +104,12 @@ class DataHubDatabaseReader:
  ORDER BY mav.urn
  """

- @property
- def query(self) -> str:
- # May repeat rows for the same date
- # Offset is generally 0, unless we repeat the same createdon twice
+ def query(self, set_structured_properties_filter: bool) -> str:
+ """
+ Main query that gets data for specified date range with appropriate filters.
+ """
+ structured_prop_filter = f" AND urn {'' if set_structured_properties_filter else 'NOT'} like 'urn:li:structuredProperty:%%'"

- # Ensures stable order, chronological per (urn, aspect)
- # Relies on createdon order to reflect version order
- # Ordering of entries with the same createdon is handled by VersionOrderer
  return f"""
  SELECT *
  FROM (
@@ -132,6 +134,7 @@ class DataHubDatabaseReader:
  {"" if self.config.include_all_versions else "AND mav.version = 0"}
  {"" if not self.config.exclude_aspects else "AND mav.aspect NOT IN %(exclude_aspects)s"}
  AND mav.createdon >= %(since_createdon)s
+ AND mav.createdon < %(end_createdon)s
  ORDER BY
  createdon,
  urn,
@@ -139,50 +142,194 @@ class DataHubDatabaseReader:
  version
  ) as t
  WHERE 1=1
- {"" if self.config.include_soft_deleted_entities else "AND (removed = false or removed is NULL)"}
+ {"" if self.config.include_soft_deleted_entities else " AND (removed = false or removed is NULL)"}
+ {structured_prop_filter}
  ORDER BY
  createdon,
  urn,
  aspect,
  version
+ LIMIT %(limit)s
+ OFFSET %(offset)s
  """

+ def execute_with_params(
+ self, query: str, params: Dict[str, Any]
+ ) -> List[Dict[str, Any]]:
+ """Execute query with proper parameter binding that works with your database"""
+ with self.engine.connect() as conn:
+ result = conn.execute(query, params or {})
+ return [dict(row) for row in result.fetchall()]
+
  def execute_server_cursor(
  self, query: str, params: Dict[str, Any]
  ) -> Iterable[Dict[str, Any]]:
+ """Execute a query with server-side cursor"""
  with self.engine.connect() as conn:
  if self.engine.dialect.name in ["postgresql", "mysql", "mariadb"]:
  with (
  conn.begin()
  ): # Transaction required for PostgreSQL server-side cursor
- # Note that stream_results=True is mainly supported by PostgreSQL and MySQL-based dialects.
- # https://docs.sqlalchemy.org/en/14/core/connections.html#sqlalchemy.engine.Connection.execution_options.params.stream_results
+ # Set query timeout at the connection level
+ if self.config.query_timeout:
+ if self.engine.dialect.name == "postgresql":
+ conn.execute(
+ text(
+ f"SET statement_timeout = {self.config.query_timeout * 1000}"
+ )
+ ) # milliseconds
+ elif self.engine.dialect.name in ["mysql", "mariadb"]:
+ conn.execute(
+ text(
+ f"SET max_execution_time = {self.config.query_timeout * 1000}"
+ )
+ ) # milliseconds
+
+ # Stream results with batch size
  conn = conn.execution_options(
  stream_results=True,
  yield_per=self.config.database_query_batch_size,
  )
+
+ # Execute query - using native parameterization without text()
+ # to maintain compatibility with your original code
  result = conn.execute(query, params)
  for row in result:
  yield dict(row)
+
+ return # Success, exit the retry loop
  else:
  raise ValueError(f"Unsupported dialect: {self.engine.dialect.name}")

  def _get_rows(
- self, from_createdon: datetime, stop_time: datetime
+ self,
+ start_date: datetime,
+ end_date: datetime,
+ set_structured_properties_filter: bool,
+ limit: int,
  ) -> Iterable[Dict[str, Any]]:
- params = {
- "exclude_aspects": list(self.config.exclude_aspects),
- "since_createdon": from_createdon.strftime(DATETIME_FORMAT),
- }
- yield from self.execute_server_cursor(self.query, params)
+ """
+ Retrieves data rows within a specified date range using pagination.

- def get_aspects(
+ Implements a hybrid pagination strategy that switches between time-based and
+ offset-based approaches depending on the returned data. Uses server-side
+ cursors for efficient memory usage.
+
+ Note: May return duplicate rows across batch boundaries when multiple rows
+ share the same 'createdon' timestamp. This is expected behavior when
+ transitioning between pagination methods.
+
+ Args:
+ start_date: Beginning of date range (inclusive)
+ end_date: End of date range (exclusive)
+ set_structured_properties_filter: Whether to apply structured filtering
+ limit: Maximum rows to fetch per query
+
+ Returns:
+ An iterable of database rows as dictionaries
+ """
+ offset = 0
+ last_createdon = None
+ first_iteration = True
+
+ while True:
+ try:
+ # Set up query and parameters - using named parameters
+ query = self.query(set_structured_properties_filter)
+ params: Dict[str, Any] = {
+ "since_createdon": start_date.strftime(DATETIME_FORMAT),
+ "end_createdon": end_date.strftime(DATETIME_FORMAT),
+ "limit": limit,
+ "offset": offset,
+ }
+
+ # Add exclude_aspects if needed
+ if (
+ hasattr(self.config, "exclude_aspects")
+ and self.config.exclude_aspects
+ ):
+ params["exclude_aspects"] = tuple(self.config.exclude_aspects)
+
+ logger.info(
+ f"Querying data from {start_date.strftime(DATETIME_FORMAT)} to {end_date.strftime(DATETIME_FORMAT)} "
+ f"with limit {limit} and offset {offset} (inclusive range)"
+ )
+
+ # Execute query with server-side cursor
+ rows = self.execute_server_cursor(query, params)
+ # Process and yield rows
+ rows_processed = 0
+ for row in rows:
+ if first_iteration:
+ start_date = row.get("createdon", start_date)
+ first_iteration = False
+
+ last_createdon = row.get("createdon")
+ rows_processed += 1
+ yield row
+
+ # If we processed fewer than the limit or no last_createdon, we're done
+ if rows_processed < limit or not last_createdon:
+ break
+
+ # Update parameters for next iteration
+ if start_date != last_createdon:
+ start_date = last_createdon
+ offset = 0
+ else:
+ offset += limit
+
+ logger.info(
+ f"Processed {rows_processed} rows for date range {start_date} to {end_date}. Continuing to next batch."
+ )
+
+ except Exception as e:
+ logger.error(
+ f"Error processing date range {start_date} to {end_date}: {str(e)}"
+ )
+ # Re-raise the exception after logging
+ raise
+
+ def get_all_aspects(
  self, from_createdon: datetime, stop_time: datetime
+ ) -> Iterable[Tuple[MetadataChangeProposalWrapper, datetime]]:
+ logger.info("Fetching Structured properties aspects")
+ yield from self.get_aspects(
+ from_createdon=from_createdon,
+ stop_time=stop_time,
+ set_structured_properties_filter=True,
+ )
+
+ logger.info(
+ f"Waiting for {self.config.structured_properties_template_cache_invalidation_interval} seconds for structured properties cache to invalidate"
+ )
+
+ time.sleep(
+ self.config.structured_properties_template_cache_invalidation_interval
+ )
+
+ logger.info("Fetching aspects")
+ yield from self.get_aspects(
+ from_createdon=from_createdon,
+ stop_time=stop_time,
+ set_structured_properties_filter=False,
+ )
+
+ def get_aspects(
+ self,
+ from_createdon: datetime,
+ stop_time: datetime,
+ set_structured_properties_filter: bool = False,
  ) -> Iterable[Tuple[MetadataChangeProposalWrapper, datetime]]:
  orderer = VersionOrderer[Dict[str, Any]](
  enabled=self.config.include_all_versions
  )
- rows = self._get_rows(from_createdon=from_createdon, stop_time=stop_time)
+ rows = self._get_rows(
+ start_date=from_createdon,
+ end_date=stop_time,
+ set_structured_properties_filter=set_structured_properties_filter,
+ limit=self.config.database_query_batch_size,
+ )
  for row in orderer(rows):
  mcp = self._parse_row(row)
  if mcp:
@@ -190,23 +337,29 @@ class DataHubDatabaseReader:

  def get_soft_deleted_rows(self) -> Iterable[Dict[str, Any]]:
  """
- Fetches all soft-deleted entities from the database.
+ Fetches all soft-deleted entities from the database using pagination.

  Yields:
  Row objects containing URNs of soft-deleted entities
  """
- with self.engine.connect() as conn, contextlib.closing(
- conn.connection.cursor()
- ) as cursor:
- logger.debug("Polling soft-deleted urns from database")
- cursor.execute(self.soft_deleted_urns_query)
- columns = [desc[0] for desc in cursor.description]
- while True:
- rows = cursor.fetchmany(self.config.database_query_batch_size)
- if not rows:
- return
- for row in rows:
- yield dict(zip(columns, row))
+ try:
+ params: Dict = {}
+
+ logger.debug("Fetching soft-deleted URNs")
+
+ # Use server-side cursor implementation
+ rows = self.execute_server_cursor(self.soft_deleted_urns_query, params)
+ processed_rows = 0
+ # Process and yield rows
+ for row in rows:
+ processed_rows += 1
+ yield row
+
+ logger.debug(f"Fetched batch of {processed_rows} soft-deleted URNs")
+
+ except Exception:
+ logger.exception("Error fetching soft-deleted row", exc_info=True)
+ raise

  def _parse_row(
  self, row: Dict[str, Any]
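
The `_get_rows` docstring above describes a hybrid time/offset pagination strategy. The control flow is easier to see in isolation; the sketch below reproduces the same loop against an in-memory list of rows, where `fetch_page` is an illustrative stand-in for the paginated SQL query and is not part of the package.

```python
from datetime import datetime, timedelta
from typing import Any, Dict, Iterable, List


def fetch_page(
    rows: List[Dict[str, Any]], since: datetime, limit: int, offset: int
) -> List[Dict[str, Any]]:
    """Stand-in for the SQL query: createdon >= since, already ordered, LIMIT/OFFSET."""
    eligible = [r for r in rows if r["createdon"] >= since]
    return eligible[offset : offset + limit]


def paginate(
    rows: List[Dict[str, Any]], start: datetime, limit: int
) -> Iterable[Dict[str, Any]]:
    since, offset = start, 0
    while True:
        batch = fetch_page(rows, since, limit, offset)
        yield from batch
        if len(batch) < limit:
            return
        last_createdon = batch[-1]["createdon"]
        if last_createdon != since:
            # Reached a newer timestamp: restart the window there with offset 0.
            # Rows sharing that timestamp can be re-yielded, which matches the
            # "may return duplicate rows across batch boundaries" note above.
            since, offset = last_createdon, 0
        else:
            # The whole batch shared one timestamp: fall back to plain offsets.
            offset += limit


if __name__ == "__main__":
    t0 = datetime(2025, 1, 1)
    data = [{"createdon": t0 + timedelta(seconds=i // 3), "id": i} for i in range(10)]
    ids = [row["id"] for row in paginate(data, start=t0, limit=2)]
    print(ids)  # every id appears at least once; some repeat at batch boundaries
```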

datahub/ingestion/source/datahub/datahub_source.py
@@ -117,7 +117,7 @@ class DataHubSource(StatefulIngestionSourceBase):
  ) -> Iterable[MetadataWorkUnit]:
  logger.info(f"Fetching database aspects starting from {from_createdon}")
  progress = ProgressTimer(report_every=timedelta(seconds=60))
- mcps = reader.get_aspects(from_createdon, self.report.stop_time)
+ mcps = reader.get_all_aspects(from_createdon, self.report.stop_time)
  for i, (mcp, createdon) in enumerate(mcps):
  if not self.urn_pattern.allowed(str(mcp.entityUrn)):
  continue

datahub/ingestion/source/dbt/dbt_common.py
@@ -132,6 +132,12 @@ class DBTSourceReport(StaleEntityRemovalSourceReport):
  sql_parser_column_errors: int = 0
  sql_parser_successes: int = 0

+ # Details on where column info comes from.
+ nodes_with_catalog_columns: int = 0
+ nodes_with_inferred_columns: int = 0
+ nodes_with_graph_columns: int = 0
+ nodes_with_no_columns: int = 0
+
  sql_parser_parse_failures_list: LossyList[str] = field(default_factory=LossyList)
  sql_parser_detach_ctes_failures_list: LossyList[str] = field(
  default_factory=LossyList
@@ -619,14 +625,8 @@ class DBTNode:
  def exists_in_target_platform(self):
  return not (self.is_ephemeral_model() or self.node_type == "test")

- def columns_setdefault(self, schema_fields: List[SchemaField]) -> None:
- """
- Update the column list if they are not already set.
- """
-
- if self.columns:
- # If we already have columns, don't overwrite them.
- return
+ def set_columns(self, schema_fields: List[SchemaField]) -> None:
+ """Update the column list."""

  self.columns = [
  DBTColumn(
@@ -1248,9 +1248,28 @@ class DBTSourceBase(StatefulIngestionSourceBase):
  target_node_urn, self._to_schema_info(inferred_schema_fields)
  )

- # Save the inferred schema fields into the dbt node.
- if inferred_schema_fields:
- node.columns_setdefault(inferred_schema_fields)
+ # When updating the node's columns, our order of preference is:
+ # 1. Schema from the dbt catalog
+ # 2. Inferred schema
+ # 3. Schema fetched from the graph
+ if node.columns:
+ self.report.nodes_with_catalog_columns += 1
+ pass # we already have columns from the dbt catalog
+ elif inferred_schema_fields:
+ logger.debug(
+ f"Using {len(inferred_schema_fields)} inferred columns for {node.dbt_name}"
+ )
+ self.report.nodes_with_inferred_columns += 1
+ node.set_columns(inferred_schema_fields)
+ elif schema_fields:
+ logger.debug(
+ f"Using {len(schema_fields)} graph columns for {node.dbt_name}"
+ )
+ self.report.nodes_with_graph_columns += 1
+ node.set_columns(schema_fields)
+ else:
+ logger.debug(f"No columns found for {node.dbt_name}")
+ self.report.nodes_with_no_columns += 1

  def _parse_cll(
  self,
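
The precedence comment above (dbt catalog, then inferred schema, then graph schema) reduces to a simple fallback chain. A minimal illustration with hypothetical names, not code from the package:

```python
from typing import Optional, Sequence


def choose_columns(
    catalog_columns: Optional[Sequence[str]],
    inferred_columns: Optional[Sequence[str]],
    graph_columns: Optional[Sequence[str]],
) -> Sequence[str]:
    # Mirrors the if/elif chain above: prefer the dbt catalog schema, then the
    # schema inferred from SQL, then the schema fetched from the DataHub graph.
    return catalog_columns or inferred_columns or graph_columns or []


assert choose_columns(None, ["a", "b"], ["c"]) == ["a", "b"]
assert choose_columns(None, None, None) == []
```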

datahub/ingestion/source/gcs/gcs_source.py
@@ -1,6 +1,5 @@
  import logging
  from typing import Dict, Iterable, List, Optional
- from urllib.parse import unquote

  from pydantic import Field, SecretStr, validator

@@ -19,6 +18,9 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit
  from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
  from datahub.ingestion.source.data_lake_common.config import PathSpecsConfigMixin
  from datahub.ingestion.source.data_lake_common.data_lake_utils import PLATFORM_GCS
+ from datahub.ingestion.source.data_lake_common.object_store import (
+ create_object_store_adapter,
+ )
  from datahub.ingestion.source.data_lake_common.path_spec import PathSpec, is_gcs_uri
  from datahub.ingestion.source.s3.config import DataLakeSourceConfig
  from datahub.ingestion.source.s3.report import DataLakeSourceReport
@@ -136,16 +138,29 @@ class GCSSource(StatefulIngestionSourceBase):

  def create_equivalent_s3_source(self, ctx: PipelineContext) -> S3Source:
  config = self.create_equivalent_s3_config()
- return self.s3_source_overrides(S3Source(config, PipelineContext(ctx.run_id)))
+ s3_source = S3Source(config, PipelineContext(ctx.run_id))
+ return self.s3_source_overrides(s3_source)

  def s3_source_overrides(self, source: S3Source) -> S3Source:
- source.source_config.platform = PLATFORM_GCS
+ """
+ Override S3Source methods with GCS-specific implementations using the adapter pattern.
+
+ This method customizes the S3Source instance to behave like a GCS source by
+ applying the GCS-specific adapter that replaces the necessary functionality.

- source.is_s3_platform = lambda: True # type: ignore
- source.create_s3_path = lambda bucket_name, key: unquote( # type: ignore
- f"s3://{bucket_name}/{key}"
+ Args:
+ source: The S3Source instance to customize
+
+ Returns:
+ The modified S3Source instance with GCS behavior
+ """
+ # Create a GCS adapter with project ID and region from our config
+ adapter = create_object_store_adapter(
+ "gcs",
  )
- return source
+
+ # Apply all customizations to the source
+ return adapter.apply_customizations(source)

  def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
  return [
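
For context, the overrides that `s3_source_overrides` previously applied inline are what an adapter's `apply_customizations` now bundles. The new `object_store` module itself (644 added lines) is not shown in this diff, so the sketch below is only an illustration built from the removed code, not the actual `create_object_store_adapter` implementation.

```python
from typing import Any
from urllib.parse import unquote

PLATFORM_GCS = "gcs"


class GcsAdapterSketch:
    """Hypothetical adapter bundling the overrides removed above."""

    def apply_customizations(self, source: Any) -> Any:
        # Same three overrides the old s3_source_overrides applied directly.
        source.source_config.platform = PLATFORM_GCS
        source.is_s3_platform = lambda: True
        source.create_s3_path = lambda bucket_name, key: unquote(
            f"s3://{bucket_name}/{key}"
        )
        return source


def create_adapter_sketch(platform: str) -> GcsAdapterSketch:
    # Stand-in for create_object_store_adapter("gcs") as called in the diff.
    if platform != "gcs":
        raise ValueError(f"Unsupported platform: {platform}")
    return GcsAdapterSketch()
```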

datahub/ingestion/source/gcs/gcs_utils.py
@@ -4,35 +4,62 @@ GCS_PREFIX = "gs://"


  def is_gcs_uri(uri: str) -> bool:
+ """
+ Check if a URI is a GCS URI (starts with gs://).
+
+ For more general URI handling, consider using object_store.get_object_store_for_uri.
+ """
  return uri.startswith(GCS_PREFIX)


  def get_gcs_prefix(gcs_uri: str) -> Optional[str]:
+ """
+ Get the GCS prefix (gs://) if the URI is a GCS URI.
+
+ For more general URI handling, consider using object_store.get_object_store_for_uri.
+ """
  if gcs_uri.startswith(GCS_PREFIX):
  return GCS_PREFIX
  return None


  def strip_gcs_prefix(gcs_uri: str) -> str:
- # remove GCS prefix (gs://)
+ """
+ Remove the GCS prefix (gs://) from a GCS URI.
+
+ For more general URI handling, consider using the object_store module.
+
+ Args:
+ gcs_uri: A GCS URI starting with gs://
+
+ Returns:
+ The URI without the gs:// prefix
+
+ Raises:
+ ValueError: If the URI doesn't start with gs://
+ """
  prefix = get_gcs_prefix(gcs_uri)
  if not prefix:
- raise ValueError(f"Not an GCS URI. Must start with prefix: {GCS_PREFIX}")
+ raise ValueError(f"Not a GCS URI. Must start with prefix: {GCS_PREFIX}")

  return gcs_uri[len(GCS_PREFIX) :]


- def get_gcs_bucket_name(path):
- if not is_gcs_uri(path):
- raise ValueError(f"Not a GCS URI. Must start with prefixe: {GCS_PREFIX}")
- return strip_gcs_prefix(path).split("/")[0]
-
-
  def get_gcs_bucket_relative_path(gcs_uri: str) -> str:
+ """
+ Get the path relative to the bucket from a GCS URI.
+
+ For more general URI handling, consider using object_store.get_object_key.
+ """
  return "/".join(strip_gcs_prefix(gcs_uri).split("/")[1:])


  def get_gcs_key_prefix(gcs_uri: str) -> str:
+ """
+ Get the key prefix (first path component after bucket) from a GCS URI.
+
+ For more general URI handling, consider using object_store.get_object_key.
+ """
  if not is_gcs_uri(gcs_uri):
- raise ValueError(f"Not a GCS URI. Must start with prefixe: {GCS_PREFIX}")
+ raise ValueError(f"Not a GCS URI. Must start with prefix: {GCS_PREFIX}")
  return strip_gcs_prefix(gcs_uri).split("/", maxsplit=1)[1]
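
A quick usage check for the helpers above (the bucket and object path are illustrative):

```python
from datahub.ingestion.source.gcs.gcs_utils import (
    get_gcs_bucket_relative_path,
    get_gcs_key_prefix,
    is_gcs_uri,
    strip_gcs_prefix,
)

uri = "gs://analytics-bucket/events/2024/data.parquet"

assert is_gcs_uri(uri)
assert strip_gcs_prefix(uri) == "analytics-bucket/events/2024/data.parquet"
assert get_gcs_bucket_relative_path(uri) == "events/2024/data.parquet"
# Per the implementation shown above, the key prefix is everything after the bucket:
assert get_gcs_key_prefix(uri) == "events/2024/data.parquet"
```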

datahub/ingestion/source/hex/query_fetcher.py
@@ -18,8 +18,12 @@ from datahub.utilities.time import datetime_to_ts_millis
  logger = logging.getLogger(__name__)

  # Pattern to extract both project_id and workspace_name from Hex metadata in SQL comments
- # Only match metadata with "context": "SCHEDULED_RUN" to filter out non-scheduled runs
- HEX_METADATA_PATTERN = r'-- Hex query metadata: \{.*?"context": "SCHEDULED_RUN".*?"project_id": "([^"]+)".*?"project_url": "https?://[^/]+/([^/]+)/hex/.*?\}'
+ # Context values:
+ # - SCHEDULED_RUN: The query was executed during a scheduled run of a published Hex app.
+ # - LOGIC_VIEW: The query was executed from the Hex project's notebook view. This happens when a user is actively editing a Hex notebook: When they first open and run it or when they rerun without cached results.
+ # - APP_VIEW: The query was executed during a published app session. This happens when a user opens up a published app or reruns the app without cached results.
+ # Only match metadata with "context": "SCHEDULED_RUN|APP_VIEW" to filter out those from notebook, which may bring more noise from development than value
+ HEX_METADATA_PATTERN = r'-- Hex query metadata: \{.*?"context": "(?:SCHEDULED_RUN|APP_VIEW)".*?"project_id": "([^"]+)".*?"project_url": "https?://[^/]+/([^/]+)/hex/.*?\}'


  @dataclass
@@ -197,13 +201,15 @@ class HexQueryFetcher:
  Example:
  -- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", "connection": "Long Tail Companions", "context": "SCHEDULED_RUN", "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", "project_url": "https://app.hex.tech/acryl-partnership/hex/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic?selectedCellId=67c38da0-e631-4005-9750-5bdae2a2ef3f"}

- # TODO: Consider supporting multiline metadata format in the future:
+ TODO: Consider supporting multiline metadata format in the future:
  # -- Hex query metadata: {
  # -- "categories": ["Scratchpad"],
  # -- "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf",
  # -- ...
  # -- }

+ TODO: Extract based on pattern matching is strict on the order of the keys in the metadata. Consider using a more flexible approach like JSON parsing.
+
  Returns:
  A tuple of (project_id, workspace_name) if both are successfully extracted
  None if extraction fails for any reason
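
To make the new filtering concrete, here is a small check of the pattern against a comment shaped like the docstring example above (the IDs and URL are the illustrative ones from that example):

```python
import re

# Copied verbatim from the diff above.
HEX_METADATA_PATTERN = r'-- Hex query metadata: \{.*?"context": "(?:SCHEDULED_RUN|APP_VIEW)".*?"project_id": "([^"]+)".*?"project_url": "https?://[^/]+/([^/]+)/hex/.*?\}'

comment = (
    '-- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", '
    '"context": "SCHEDULED_RUN", "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", '
    '"project_url": "https://app.hex.tech/acryl-partnership/hex/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic"}'
)

match = re.search(HEX_METADATA_PATTERN, comment)
assert match is not None
project_id, workspace_name = match.group(1), match.group(2)
assert project_id == "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf"
assert workspace_name == "acryl-partnership"

# Queries run from the notebook view (LOGIC_VIEW) are intentionally not matched:
assert re.search(HEX_METADATA_PATTERN, comment.replace("SCHEDULED_RUN", "LOGIC_VIEW")) is None
```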

datahub/ingestion/source/openapi.py
@@ -82,6 +82,9 @@ class OpenApiConfig(ConfigModel):
  get_token: dict = Field(
  default={}, description="Retrieving a token from the endpoint."
  )
+ verify_ssl: bool = Field(
+ default=True, description="Enable SSL certificate verification"
+ )

  @validator("bearer_token", always=True)
  def ensure_only_one_token(
@@ -129,12 +132,14 @@ class OpenApiConfig(ConfigModel):
  tok_url=url4req,
  method=self.get_token["request_type"],
  proxies=self.proxies,
+ verify_ssl=self.verify_ssl,
  )
  sw_dict = get_swag_json(
  self.url,
  token=self.token,
  swagger_file=self.swagger_file,
  proxies=self.proxies,
+ verify_ssl=self.verify_ssl,
  ) # load the swagger file

  else: # using basic auth for accessing endpoints
@@ -144,6 +149,7 @@ class OpenApiConfig(ConfigModel):
  password=self.password,
  swagger_file=self.swagger_file,
  proxies=self.proxies,
+ verify_ssl=self.verify_ssl,
  )
  return sw_dict

@@ -343,6 +349,7 @@ class APISource(Source, ABC):
  tot_url,
  token=config.token,
  proxies=config.proxies,
+ verify_ssl=config.verify_ssl,
  )
  else:
  response = request_call(
@@ -350,6 +357,7 @@ class APISource(Source, ABC):
  username=config.username,
  password=config.password,
  proxies=config.proxies,
+ verify_ssl=config.verify_ssl,
  )
  if response.status_code == 200:
  fields2add, root_dataset_samples[dataset_name] = extract_fields(
@@ -380,6 +388,7 @@ class APISource(Source, ABC):
  tot_url,
  token=config.token,
  proxies=config.proxies,
+ verify_ssl=config.verify_ssl,
  )
  else:
  response = request_call(
@@ -387,6 +396,7 @@ class APISource(Source, ABC):
  username=config.username,
  password=config.password,
  proxies=config.proxies,
+ verify_ssl=config.verify_ssl,
  )
  if response.status_code == 200:
  fields2add, _ = extract_fields(response, dataset_name)
@@ -415,6 +425,7 @@ class APISource(Source, ABC):
  tot_url,
  token=config.token,
  proxies=config.proxies,
+ verify_ssl=config.verify_ssl,
  )
  else:
  response = request_call(
@@ -422,6 +433,7 @@ class APISource(Source, ABC):
  username=config.username,
  password=config.password,
  proxies=config.proxies,
+ verify_ssl=config.verify_ssl,
  )
  if response.status_code == 200:
  fields2add, _ = extract_fields(response, dataset_name)
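
The practical effect of the new `verify_ssl` option is that a recipe can now opt out of certificate verification, e.g. for self-signed certificates on internal endpoints. A sketch of the `config` block as a Python dict; every field other than `verify_ssl` follows the existing openapi source recipe format and is illustrative:

```python
# Python-dict equivalent of the YAML `config:` block for an openapi source recipe.
openapi_source_config = {
    "name": "internal_api",                      # illustrative
    "url": "https://api.internal.example.com/",  # illustrative
    "swagger_file": "openapi.json",              # illustrative
    "verify_ssl": False,  # new in this release; defaults to True
}
```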