acryl-datahub 1.1.1rc2__py3-none-any.whl → 1.1.1rc4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of acryl-datahub might be problematic.
- {acryl_datahub-1.1.1rc2.dist-info → acryl_datahub-1.1.1rc4.dist-info}/METADATA +2612 -2610
- {acryl_datahub-1.1.1rc2.dist-info → acryl_datahub-1.1.1rc4.dist-info}/RECORD +35 -33
- {acryl_datahub-1.1.1rc2.dist-info → acryl_datahub-1.1.1rc4.dist-info}/WHEEL +1 -1
- datahub/_version.py +1 -1
- datahub/api/entities/dataset/dataset.py +9 -8
- datahub/api/entities/structuredproperties/structuredproperties.py +2 -2
- datahub/cli/ingest_cli.py +9 -1
- datahub/emitter/mce_builder.py +3 -1
- datahub/emitter/response_helper.py +86 -1
- datahub/emitter/rest_emitter.py +1 -1
- datahub/ingestion/source/datahub/config.py +11 -0
- datahub/ingestion/source/datahub/datahub_database_reader.py +186 -33
- datahub/ingestion/source/datahub/datahub_source.py +1 -1
- datahub/ingestion/source/dbt/dbt_common.py +30 -11
- datahub/ingestion/source/hex/query_fetcher.py +9 -3
- datahub/ingestion/source/openapi.py +12 -0
- datahub/ingestion/source/openapi_parser.py +56 -37
- datahub/ingestion/source/snowflake/snowflake_config.py +13 -0
- datahub/ingestion/source/snowflake/snowflake_v2.py +17 -6
- datahub/ingestion/source/sql/sql_types.py +5 -2
- datahub/metadata/_internal_schema_classes.py +515 -515
- datahub/metadata/_urns/urn_defs.py +1785 -1785
- datahub/metadata/schema.avsc +17269 -17639
- datahub/metadata/schemas/DataHubIngestionSourceKey.avsc +2 -1
- datahub/metadata/schemas/ExecutionRequestInput.avsc +5 -0
- datahub/metadata/schemas/__init__.py +3 -3
- datahub/sdk/__init__.py +4 -0
- datahub/sdk/_all_entities.py +4 -0
- datahub/sdk/_shared.py +2 -1
- datahub/sdk/dataflow.py +302 -0
- datahub/sdk/datajob.py +335 -0
- datahub/sdk/entity_client.py +8 -0
- {acryl_datahub-1.1.1rc2.dist-info → acryl_datahub-1.1.1rc4.dist-info}/entry_points.txt +0 -0
- {acryl_datahub-1.1.1rc2.dist-info → acryl_datahub-1.1.1rc4.dist-info}/licenses/LICENSE +0 -0
- {acryl_datahub-1.1.1rc2.dist-info → acryl_datahub-1.1.1rc4.dist-info}/top_level.txt +0 -0

datahub/ingestion/source/datahub/datahub_database_reader.py

@@ -1,10 +1,10 @@
-import contextlib
 import json
 import logging
+import time
 from datetime import datetime
 from typing import Any, Dict, Generic, Iterable, List, Optional, Tuple, TypeVar

-from sqlalchemy import create_engine
+from sqlalchemy import create_engine, text

 from datahub.emitter.aspect import ASPECT_MAP
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
@@ -19,6 +19,7 @@ logger = logging.getLogger(__name__)

 # Should work for at least mysql, mariadb, postgres
 DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S.%f"
+DATE_FORMAT = "%Y-%m-%d"

 ROW = TypeVar("ROW", bound=Dict[str, Any])

@@ -85,6 +86,9 @@ class DataHubDatabaseReader:
             **connection_config.options,
         )

+        # Cache for available dates to avoid redundant queries
+        self.available_dates_cache: Optional[List[datetime]] = None
+
     @property
     def soft_deleted_urns_query(self) -> str:
         return f"""
@@ -100,14 +104,12 @@ class DataHubDatabaseReader:
             ORDER BY mav.urn
         """

-
-
-
-
+    def query(self, set_structured_properties_filter: bool) -> str:
+        """
+        Main query that gets data for specified date range with appropriate filters.
+        """
+        structured_prop_filter = f" AND urn {'' if set_structured_properties_filter else 'NOT'} like 'urn:li:structuredProperty:%%'"

-        # Ensures stable order, chronological per (urn, aspect)
-        # Relies on createdon order to reflect version order
-        # Ordering of entries with the same createdon is handled by VersionOrderer
         return f"""
             SELECT *
             FROM (
@@ -132,6 +134,7 @@ class DataHubDatabaseReader:
                 {"" if self.config.include_all_versions else "AND mav.version = 0"}
                 {"" if not self.config.exclude_aspects else "AND mav.aspect NOT IN %(exclude_aspects)s"}
                 AND mav.createdon >= %(since_createdon)s
+                AND mav.createdon < %(end_createdon)s
             ORDER BY
                 createdon,
                 urn,
@@ -139,50 +142,194 @@ class DataHubDatabaseReader:
                 version
             ) as t
             WHERE 1=1
-                {"" if self.config.include_soft_deleted_entities else "AND (removed = false or removed is NULL)"}
+                {"" if self.config.include_soft_deleted_entities else " AND (removed = false or removed is NULL)"}
+                {structured_prop_filter}
             ORDER BY
                 createdon,
                 urn,
                 aspect,
                 version
+            LIMIT %(limit)s
+            OFFSET %(offset)s
         """

+    def execute_with_params(
+        self, query: str, params: Dict[str, Any]
+    ) -> List[Dict[str, Any]]:
+        """Execute query with proper parameter binding that works with your database"""
+        with self.engine.connect() as conn:
+            result = conn.execute(query, params or {})
+            return [dict(row) for row in result.fetchall()]
+
     def execute_server_cursor(
         self, query: str, params: Dict[str, Any]
     ) -> Iterable[Dict[str, Any]]:
+        """Execute a query with server-side cursor"""
         with self.engine.connect() as conn:
             if self.engine.dialect.name in ["postgresql", "mysql", "mariadb"]:
                 with (
                     conn.begin()
                 ):  # Transaction required for PostgreSQL server-side cursor
-                    #
-
+                    # Set query timeout at the connection level
+                    if self.config.query_timeout:
+                        if self.engine.dialect.name == "postgresql":
+                            conn.execute(
+                                text(
+                                    f"SET statement_timeout = {self.config.query_timeout * 1000}"
+                                )
+                            )  # milliseconds
+                        elif self.engine.dialect.name in ["mysql", "mariadb"]:
+                            conn.execute(
+                                text(
+                                    f"SET max_execution_time = {self.config.query_timeout * 1000}"
+                                )
+                            )  # milliseconds
+
+                    # Stream results with batch size
                     conn = conn.execution_options(
                         stream_results=True,
                         yield_per=self.config.database_query_batch_size,
                     )
+
+                    # Execute query - using native parameterization without text()
+                    # to maintain compatibility with your original code
                     result = conn.execute(query, params)
                     for row in result:
                         yield dict(row)
+
+                    return  # Success, exit the retry loop
             else:
                 raise ValueError(f"Unsupported dialect: {self.engine.dialect.name}")

     def _get_rows(
-        self,
+        self,
+        start_date: datetime,
+        end_date: datetime,
+        set_structured_properties_filter: bool,
+        limit: int,
     ) -> Iterable[Dict[str, Any]]:
-
-
-            "since_createdon": from_createdon.strftime(DATETIME_FORMAT),
-        }
-        yield from self.execute_server_cursor(self.query, params)
+        """
+        Retrieves data rows within a specified date range using pagination.

-
+        Implements a hybrid pagination strategy that switches between time-based and
+        offset-based approaches depending on the returned data. Uses server-side
+        cursors for efficient memory usage.
+
+        Note: May return duplicate rows across batch boundaries when multiple rows
+        share the same 'createdon' timestamp. This is expected behavior when
+        transitioning between pagination methods.
+
+        Args:
+            start_date: Beginning of date range (inclusive)
+            end_date: End of date range (exclusive)
+            set_structured_properties_filter: Whether to apply structured filtering
+            limit: Maximum rows to fetch per query
+
+        Returns:
+            An iterable of database rows as dictionaries
+        """
+        offset = 0
+        last_createdon = None
+        first_iteration = True
+
+        while True:
+            try:
+                # Set up query and parameters - using named parameters
+                query = self.query(set_structured_properties_filter)
+                params: Dict[str, Any] = {
+                    "since_createdon": start_date.strftime(DATETIME_FORMAT),
+                    "end_createdon": end_date.strftime(DATETIME_FORMAT),
+                    "limit": limit,
+                    "offset": offset,
+                }
+
+                # Add exclude_aspects if needed
+                if (
+                    hasattr(self.config, "exclude_aspects")
+                    and self.config.exclude_aspects
+                ):
+                    params["exclude_aspects"] = tuple(self.config.exclude_aspects)
+
+                logger.info(
+                    f"Querying data from {start_date.strftime(DATETIME_FORMAT)} to {end_date.strftime(DATETIME_FORMAT)} "
+                    f"with limit {limit} and offset {offset} (inclusive range)"
+                )
+
+                # Execute query with server-side cursor
+                rows = self.execute_server_cursor(query, params)
+                # Process and yield rows
+                rows_processed = 0
+                for row in rows:
+                    if first_iteration:
+                        start_date = row.get("createdon", start_date)
+                        first_iteration = False
+
+                    last_createdon = row.get("createdon")
+                    rows_processed += 1
+                    yield row
+
+                # If we processed fewer than the limit or no last_createdon, we're done
+                if rows_processed < limit or not last_createdon:
+                    break
+
+                # Update parameters for next iteration
+                if start_date != last_createdon:
+                    start_date = last_createdon
+                    offset = 0
+                else:
+                    offset += limit
+
+                logger.info(
+                    f"Processed {rows_processed} rows for date range {start_date} to {end_date}. Continuing to next batch."
+                )
+
+            except Exception as e:
+                logger.error(
+                    f"Error processing date range {start_date} to {end_date}: {str(e)}"
+                )
+                # Re-raise the exception after logging
+                raise
+
+    def get_all_aspects(
         self, from_createdon: datetime, stop_time: datetime
+    ) -> Iterable[Tuple[MetadataChangeProposalWrapper, datetime]]:
+        logger.info("Fetching Structured properties aspects")
+        yield from self.get_aspects(
+            from_createdon=from_createdon,
+            stop_time=stop_time,
+            set_structured_properties_filter=True,
+        )
+
+        logger.info(
+            f"Waiting for {self.config.structured_properties_template_cache_invalidation_interval} seconds for structured properties cache to invalidate"
+        )
+
+        time.sleep(
+            self.config.structured_properties_template_cache_invalidation_interval
+        )
+
+        logger.info("Fetching aspects")
+        yield from self.get_aspects(
+            from_createdon=from_createdon,
+            stop_time=stop_time,
+            set_structured_properties_filter=False,
+        )
+
+    def get_aspects(
+        self,
+        from_createdon: datetime,
+        stop_time: datetime,
+        set_structured_properties_filter: bool = False,
     ) -> Iterable[Tuple[MetadataChangeProposalWrapper, datetime]]:
         orderer = VersionOrderer[Dict[str, Any]](
             enabled=self.config.include_all_versions
         )
-        rows = self._get_rows(
+        rows = self._get_rows(
+            start_date=from_createdon,
+            end_date=stop_time,
+            set_structured_properties_filter=set_structured_properties_filter,
+            limit=self.config.database_query_batch_size,
+        )
         for row in orderer(rows):
             mcp = self._parse_row(row)
             if mcp:
@@ -190,23 +337,29 @@ class DataHubDatabaseReader:

     def get_soft_deleted_rows(self) -> Iterable[Dict[str, Any]]:
         """
-        Fetches all soft-deleted entities from the database.
+        Fetches all soft-deleted entities from the database using pagination.

         Yields:
             Row objects containing URNs of soft-deleted entities
         """
-
-
-
-        logger.debug("
-
-
-
-
-
-
-
-
+        try:
+            params: Dict = {}
+
+            logger.debug("Fetching soft-deleted URNs")
+
+            # Use server-side cursor implementation
+            rows = self.execute_server_cursor(self.soft_deleted_urns_query, params)
+            processed_rows = 0
+            # Process and yield rows
+            for row in rows:
+                processed_rows += 1
+                yield row
+
+            logger.debug(f"Fetched batch of {processed_rows} soft-deleted URNs")
+
+        except Exception:
+            logger.exception("Error fetching soft-deleted row", exc_info=True)
+            raise

     def _parse_row(
         self, row: Dict[str, Any]
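
The `_get_rows` docstring above describes a hybrid pagination strategy: keep advancing the `since_createdon` lower bound while timestamps move forward, and fall back to an OFFSET only when an entire batch shares one timestamp. The sketch below is a minimal, self-contained illustration of that loop, assuming a hypothetical `fetch_page` callable in place of the real server-side-cursor query; it is illustrative, not the packaged implementation.

```python
from datetime import datetime, timedelta
from typing import Any, Callable, Dict, Iterable, List

Row = Dict[str, Any]


def paginate_hybrid(
    fetch_page: Callable[[datetime, datetime, int, int], List[Row]],
    start_date: datetime,
    end_date: datetime,
    limit: int,
) -> Iterable[Row]:
    """Yield rows in [start_date, end_date), advancing by 'createdon' when it
    moves forward and by OFFSET when a whole page shares one timestamp."""
    offset = 0
    while True:
        rows = fetch_page(start_date, end_date, limit, offset)
        last_createdon = None
        for row in rows:
            last_createdon = row["createdon"]
            yield row
        if len(rows) < limit or last_createdon is None:
            break  # final (possibly empty) page
        if last_createdon != start_date:
            start_date = last_createdon  # time-based advance; reset the offset
            offset = 0
        else:
            offset += limit  # every row shared one timestamp; page by offset


if __name__ == "__main__":
    # 25 stored rows, 11 of which share the earliest timestamp, read in pages of 10.
    base = datetime(2024, 1, 1)
    data = sorted(
        [{"createdon": base + timedelta(seconds=i), "urn": f"urn:{i}"} for i in range(15)]
        + [{"createdon": base, "urn": f"urn:dup{i}"} for i in range(10)],
        key=lambda r: r["createdon"],
    )

    def fetch_page(since: datetime, until: datetime, limit: int, offset: int) -> List[Row]:
        window = [r for r in data if since <= r["createdon"] < until]
        return window[offset : offset + limit]

    rows = list(paginate_hybrid(fetch_page, base, base + timedelta(days=1), limit=10))
    print(len(rows))  # 26: one boundary row repeats, as the docstring above warns
```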

datahub/ingestion/source/datahub/datahub_source.py

@@ -117,7 +117,7 @@ class DataHubSource(StatefulIngestionSourceBase):
     ) -> Iterable[MetadataWorkUnit]:
         logger.info(f"Fetching database aspects starting from {from_createdon}")
         progress = ProgressTimer(report_every=timedelta(seconds=60))
-        mcps = reader.
+        mcps = reader.get_all_aspects(from_createdon, self.report.stop_time)
         for i, (mcp, createdon) in enumerate(mcps):
            if not self.urn_pattern.allowed(str(mcp.entityUrn)):
                continue

datahub/ingestion/source/dbt/dbt_common.py

@@ -132,6 +132,12 @@ class DBTSourceReport(StaleEntityRemovalSourceReport):
     sql_parser_column_errors: int = 0
     sql_parser_successes: int = 0

+    # Details on where column info comes from.
+    nodes_with_catalog_columns: int = 0
+    nodes_with_inferred_columns: int = 0
+    nodes_with_graph_columns: int = 0
+    nodes_with_no_columns: int = 0
+
     sql_parser_parse_failures_list: LossyList[str] = field(default_factory=LossyList)
     sql_parser_detach_ctes_failures_list: LossyList[str] = field(
         default_factory=LossyList
@@ -619,14 +625,8 @@ class DBTNode:
     def exists_in_target_platform(self):
         return not (self.is_ephemeral_model() or self.node_type == "test")

-    def
-        """
-        Update the column list if they are not already set.
-        """
-
-        if self.columns:
-            # If we already have columns, don't overwrite them.
-            return
+    def set_columns(self, schema_fields: List[SchemaField]) -> None:
+        """Update the column list."""

         self.columns = [
             DBTColumn(
@@ -1248,9 +1248,28 @@ class DBTSourceBase(StatefulIngestionSourceBase):
                 target_node_urn, self._to_schema_info(inferred_schema_fields)
             )

-        #
-
-
+        # When updating the node's columns, our order of preference is:
+        # 1. Schema from the dbt catalog
+        # 2. Inferred schema
+        # 3. Schema fetched from the graph
+        if node.columns:
+            self.report.nodes_with_catalog_columns += 1
+            pass  # we already have columns from the dbt catalog
+        elif inferred_schema_fields:
+            logger.debug(
+                f"Using {len(inferred_schema_fields)} inferred columns for {node.dbt_name}"
+            )
+            self.report.nodes_with_inferred_columns += 1
+            node.set_columns(inferred_schema_fields)
+        elif schema_fields:
+            logger.debug(
+                f"Using {len(schema_fields)} graph columns for {node.dbt_name}"
+            )
+            self.report.nodes_with_graph_columns += 1
+            node.set_columns(schema_fields)
+        else:
+            logger.debug(f"No columns found for {node.dbt_name}")
+            self.report.nodes_with_no_columns += 1

     def _parse_cll(
         self,
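
The added block above chooses a node's column source in a fixed order: columns already present from the dbt catalog win, then the inferred schema, then the schema fetched from the graph, and one of the new report counters is bumped for each outcome. A minimal standalone sketch of that decision, using plain lists and a counter dict in place of the real DBTNode and report objects (all names here are illustrative):

```python
from typing import Dict, List, Optional


def choose_columns(
    catalog_columns: List[str],
    inferred_columns: Optional[List[str]],
    graph_columns: Optional[List[str]],
    report: Dict[str, int],
) -> List[str]:
    """Pick columns by preference (catalog > inferred > graph) and record the winner."""
    if catalog_columns:
        report["nodes_with_catalog_columns"] = report.get("nodes_with_catalog_columns", 0) + 1
        return catalog_columns
    if inferred_columns:
        report["nodes_with_inferred_columns"] = report.get("nodes_with_inferred_columns", 0) + 1
        return inferred_columns
    if graph_columns:
        report["nodes_with_graph_columns"] = report.get("nodes_with_graph_columns", 0) + 1
        return graph_columns
    report["nodes_with_no_columns"] = report.get("nodes_with_no_columns", 0) + 1
    return []


report: Dict[str, int] = {}
print(choose_columns([], ["id", "name"], ["id"], report))  # ['id', 'name']
print(report)  # {'nodes_with_inferred_columns': 1}
```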

datahub/ingestion/source/hex/query_fetcher.py

@@ -18,8 +18,12 @@ from datahub.utilities.time import datetime_to_ts_millis
 logger = logging.getLogger(__name__)

 # Pattern to extract both project_id and workspace_name from Hex metadata in SQL comments
-#
-
+# Context values:
+# - SCHEDULED_RUN: The query was executed during a scheduled run of a published Hex app.
+# - LOGIC_VIEW: The query was executed from the Hex project's notebook view. This happens when a user is actively editing a Hex notebook: When they first open and run it or when they rerun without cached results.
+# - APP_VIEW: The query was executed during a published app session. This happens when a user opens up a published app or reruns the app without cached results.
+# Only match metadata with "context": "SCHEDULED_RUN|APP_VIEW" to filter out those from notebook, which may bring more noise from development than value
+HEX_METADATA_PATTERN = r'-- Hex query metadata: \{.*?"context": "(?:SCHEDULED_RUN|APP_VIEW)".*?"project_id": "([^"]+)".*?"project_url": "https?://[^/]+/([^/]+)/hex/.*?\}'


 @dataclass
@@ -197,13 +201,15 @@ class HexQueryFetcher:
         Example:
             -- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", "connection": "Long Tail Companions", "context": "SCHEDULED_RUN", "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", "project_url": "https://app.hex.tech/acryl-partnership/hex/d73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic?selectedCellId=67c38da0-e631-4005-9750-5bdae2a2ef3f"}

-
+        TODO: Consider supporting multiline metadata format in the future:
         # -- Hex query metadata: {
         # -- "categories": ["Scratchpad"],
         # -- "project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf",
         # -- ...
         # -- }

+        TODO: Extract based on pattern matching is strict on the order of the keys in the metadata. Consider using a more flexible approach like JSON parsing.
+
         Returns:
             A tuple of (project_id, workspace_name) if both are successfully extracted
             None if extraction fails for any reason
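
The new `HEX_METADATA_PATTERN` only matches metadata whose `context` is `SCHEDULED_RUN` or `APP_VIEW`, capturing the project id and the workspace segment of the project URL. Below is a quick check against the example comment from the docstring above; the pattern and sample string are taken from this diff, only the surrounding script is illustrative.

```python
import re

HEX_METADATA_PATTERN = r'-- Hex query metadata: \{.*?"context": "(?:SCHEDULED_RUN|APP_VIEW)".*?"project_id": "([^"]+)".*?"project_url": "https?://[^/]+/([^/]+)/hex/.*?\}'

sql_comment = (
    '-- Hex query metadata: {"categories": ["Scratchpad"], "cell_type": "SQL", '
    '"connection": "Long Tail Companions", "context": "SCHEDULED_RUN", '
    '"project_id": "d73da67d-c87b-4dd8-9e7f-b79cb7f822cf", '
    '"project_url": "https://app.hex.tech/acryl-partnership/hex/'
    'd73da67d-c87b-4dd8-9e7f-b79cb7f822cf/draft/logic?selectedCellId=67c38da0-e631-4005-9750-5bdae2a2ef3f"}'
)

match = re.search(HEX_METADATA_PATTERN, sql_comment)
if match:
    project_id, workspace_name = match.groups()
    print(project_id)      # d73da67d-c87b-4dd8-9e7f-b79cb7f822cf
    print(workspace_name)  # acryl-partnership

# Queries run from the notebook view carry "context": "LOGIC_VIEW" and are deliberately not matched.
print(re.search(HEX_METADATA_PATTERN, sql_comment.replace("SCHEDULED_RUN", "LOGIC_VIEW")))  # None
```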

datahub/ingestion/source/openapi.py

@@ -82,6 +82,9 @@ class OpenApiConfig(ConfigModel):
     get_token: dict = Field(
         default={}, description="Retrieving a token from the endpoint."
     )
+    verify_ssl: bool = Field(
+        default=True, description="Enable SSL certificate verification"
+    )

     @validator("bearer_token", always=True)
     def ensure_only_one_token(
@@ -129,12 +132,14 @@ class OpenApiConfig(ConfigModel):
                 tok_url=url4req,
                 method=self.get_token["request_type"],
                 proxies=self.proxies,
+                verify_ssl=self.verify_ssl,
             )
             sw_dict = get_swag_json(
                 self.url,
                 token=self.token,
                 swagger_file=self.swagger_file,
                 proxies=self.proxies,
+                verify_ssl=self.verify_ssl,
             )  # load the swagger file

         else:  # using basic auth for accessing endpoints
@@ -144,6 +149,7 @@ class OpenApiConfig(ConfigModel):
                 password=self.password,
                 swagger_file=self.swagger_file,
                 proxies=self.proxies,
+                verify_ssl=self.verify_ssl,
             )
         return sw_dict

@@ -343,6 +349,7 @@ class APISource(Source, ABC):
                     tot_url,
                     token=config.token,
                     proxies=config.proxies,
+                    verify_ssl=config.verify_ssl,
                 )
             else:
                 response = request_call(
@@ -350,6 +357,7 @@ class APISource(Source, ABC):
                     username=config.username,
                     password=config.password,
                     proxies=config.proxies,
+                    verify_ssl=config.verify_ssl,
                 )
             if response.status_code == 200:
                 fields2add, root_dataset_samples[dataset_name] = extract_fields(
@@ -380,6 +388,7 @@ class APISource(Source, ABC):
                     tot_url,
                     token=config.token,
                     proxies=config.proxies,
+                    verify_ssl=config.verify_ssl,
                 )
             else:
                 response = request_call(
@@ -387,6 +396,7 @@ class APISource(Source, ABC):
                     username=config.username,
                     password=config.password,
                     proxies=config.proxies,
+                    verify_ssl=config.verify_ssl,
                 )
             if response.status_code == 200:
                 fields2add, _ = extract_fields(response, dataset_name)
@@ -415,6 +425,7 @@ class APISource(Source, ABC):
                     tot_url,
                     token=config.token,
                     proxies=config.proxies,
+                    verify_ssl=config.verify_ssl,
                 )
             else:
                 response = request_call(
@@ -422,6 +433,7 @@ class APISource(Source, ABC):
                     username=config.username,
                     password=config.password,
                     proxies=config.proxies,
+                    verify_ssl=config.verify_ssl,
                 )
             if response.status_code == 200:
                 fields2add, _ = extract_fields(response, dataset_name)
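
Every outbound call in this source now forwards a single `verify_ssl` flag from `OpenApiConfig` down to `requests`. The sketch below shows the same pattern with a hypothetical helper rather than the package's own `request_call`; disabling verification is generally only appropriate for endpoints that use self-signed certificates.

```python
from typing import Optional

import requests


def fetch_json(url: str, token: Optional[str] = None, verify_ssl: bool = True) -> dict:
    """GET a JSON document, optionally skipping TLS certificate verification."""
    headers = {"accept": "application/json"}
    if token is not None:
        headers["Authorization"] = token
    response = requests.get(url, headers=headers, verify=verify_ssl)
    response.raise_for_status()
    return response.json()


# With verify_ssl=False requests skips certificate validation (and emits a warning),
# which is what setting `verify_ssl: false` in the source config opts into.
# spec = fetch_json("https://self-signed.example.internal/openapi.json", verify_ssl=False)
```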

datahub/ingestion/source/openapi_parser.py

@@ -59,17 +59,21 @@ def request_call(
     username: Optional[str] = None,
     password: Optional[str] = None,
     proxies: Optional[dict] = None,
+    verify_ssl: bool = True,
 ) -> requests.Response:
     headers = {"accept": "application/json"}
     if username is not None and password is not None:
         return requests.get(
-            url,
+            url,
+            headers=headers,
+            auth=HTTPBasicAuth(username, password),
+            verify=verify_ssl,
         )
     elif token is not None:
         headers["Authorization"] = f"{token}"
-        return requests.get(url, proxies=proxies, headers=headers)
+        return requests.get(url, proxies=proxies, headers=headers, verify=verify_ssl)
     else:
-        return requests.get(url, headers=headers)
+        return requests.get(url, headers=headers, verify=verify_ssl)


 def get_swag_json(
@@ -79,10 +83,16 @@ def get_swag_json(
     password: Optional[str] = None,
     swagger_file: str = "",
     proxies: Optional[dict] = None,
+    verify_ssl: bool = True,
 ) -> Dict:
     tot_url = url + swagger_file
     response = request_call(
-        url=tot_url,
+        url=tot_url,
+        token=token,
+        username=username,
+        password=password,
+        proxies=proxies,
+        verify_ssl=verify_ssl,
     )

     if response.status_code != 200:
@@ -127,37 +137,45 @@ def get_endpoints(sw_dict: dict) -> dict:
     check_sw_version(sw_dict)

     for p_k, p_o in sw_dict["paths"].items():
-        method
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        url_details[p_k]
-
-
-
-
+        for method, method_spec in p_o.items():
+            # skip non-method keys like "parameters"
+            if method.lower() not in [
+                "get",
+                "post",
+                "put",
+                "delete",
+                "patch",
+                "options",
+                "head",
+            ]:
+                continue
+
+            responses = method_spec.get("responses", {})
+            base_res = responses.get("200") or responses.get(200)
+            if not base_res:
+                # if there is no 200 response, we skip this method
+                continue
+
+            # if the description is not present, we will use the summary
+            # if both are not present, we will use an empty string
+            desc = method_spec.get("description") or method_spec.get("summary", "")
+
+            # if the tags are not present, we will use an empty list
+            tags = method_spec.get("tags", [])
+
+            url_details[p_k] = {
+                "description": desc,
+                "tags": tags,
+                "method": method.upper(),
+            }
+
+            example_data = check_for_api_example_data(base_res, p_k)
+            if example_data:
+                url_details[p_k]["data"] = example_data
+
+            # checking whether there are defined parameters to execute the call...
+            if "parameters" in p_o[method]:
+                url_details[p_k]["parameters"] = p_o[method]["parameters"]

     return dict(sorted(url_details.items()))

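
The rewritten `get_endpoints` loop iterates over each path's operations, skips non-method keys such as a shared `parameters` entry, and keeps only operations that document a 200 response. A small self-contained sketch of that filtering over a toy `paths` mapping (the simplified function below is illustrative, not the packaged `get_endpoints`):

```python
from typing import Dict

HTTP_METHODS = {"get", "post", "put", "delete", "patch", "options", "head"}


def summarize_paths(paths: Dict[str, dict]) -> Dict[str, dict]:
    """Collect description/tags/method per path, mirroring the filtering rules above."""
    url_details: Dict[str, dict] = {}
    for path, operations in paths.items():
        for method, spec in operations.items():
            if method.lower() not in HTTP_METHODS:  # e.g. a shared "parameters" key
                continue
            responses = spec.get("responses", {})
            if not (responses.get("200") or responses.get(200)):
                continue  # no 200 response documented; skip
            url_details[path] = {
                "description": spec.get("description") or spec.get("summary", ""),
                "tags": spec.get("tags", []),
                "method": method.upper(),
            }
    return dict(sorted(url_details.items()))


sample = {
    "/pets": {
        "parameters": [{"name": "limit", "in": "query"}],  # skipped: not a method
        "get": {"summary": "List pets", "responses": {"200": {"description": "ok"}}},
    },
    "/health": {"get": {"responses": {"204": {"description": "no body"}}}},  # skipped: no 200
}
print(summarize_paths(sample))  # {'/pets': {'description': 'List pets', 'tags': [], 'method': 'GET'}}
```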
datahub/ingestion/source/openapi_parser.py (continued)

@@ -358,6 +376,7 @@ def get_tok(
     tok_url: str = "",
     method: str = "post",
     proxies: Optional[dict] = None,
+    verify_ssl: bool = True,
 ) -> str:
     """
     Trying to post username/password to get auth.
@@ -368,7 +387,7 @@ def get_tok(
         # this will make a POST call with username and password
         data = {"username": username, "password": password, "maxDuration": True}
         # url2post = url + "api/authenticate/"
-        response = requests.post(url4req, proxies=proxies, json=data)
+        response = requests.post(url4req, proxies=proxies, json=data, verify=verify_ssl)
         if response.status_code == 200:
             cont = json.loads(response.content)
             if "token" in cont:  # other authentication scheme
@@ -377,7 +396,7 @@ def get_tok(
                 token = f"Bearer {cont['tokens']['access']}"
     elif method == "get":
         # this will make a GET call with username and password
-        response = requests.get(url4req)
+        response = requests.get(url4req, verify=verify_ssl)
         if response.status_code == 200:
             cont = json.loads(response.content)
             token = cont["token"]

datahub/ingestion/source/snowflake/snowflake_config.py

@@ -22,6 +22,7 @@ from datahub.ingestion.api.incremental_properties_helper import (
 from datahub.ingestion.glossary.classification_mixin import (
     ClassificationSourceConfigMixin,
 )
+from datahub.ingestion.source.snowflake.constants import SnowflakeEdition
 from datahub.ingestion.source.snowflake.snowflake_connection import (
     SnowflakeConnectionConfig,
 )
@@ -326,6 +327,18 @@ class SnowflakeV2Config(
         " Map of share name -> details of share.",
     )

+    known_snowflake_edition: Optional[SnowflakeEdition] = Field(
+        default=None,
+        description="Explicitly specify the Snowflake edition (STANDARD or ENTERPRISE). If unset, the edition will be inferred automatically using 'SHOW TAGS'.",
+    )
+
+    # Allows empty containers to be ingested before datasets are added, avoiding permission errors
+    warn_no_datasets: bool = Field(
+        hidden_from_docs=True,
+        default=False,
+        description="If True, warns when no datasets are found during ingestion. If False, ingestion fails when no datasets are found.",
+    )
+
     include_assertion_results: bool = Field(
         default=False,
         description="Whether to ingest assertion run results for assertions created using Datahub"