PyPI - databricks-sql-connector - Versions diffs - 2.9.2.dev1__tar.gz → 2.9.4b1__tar.gz - Mend

databricks-sql-connector 2.9.2.dev1tar.gz → 2.9.4b1tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

{databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/CHANGELOG.md RENAMED Viewed

@@ -1,14 +1,31 @@
 # Release History
-## 2.9.x (Unreleased)
+## 2.9.4 (Unreleased)
-- Other: Explicitly pin urllib3 to ^2.0.0
+## 2.9.4b1 (2024-02-16)
+- Fix: Cloud fetch file download errors (#356)
+- Fix: Redact the URL query parameters from the urllib3.connectionpool logs (#341)
+## 2.9.3 (2023-08-24)
+- Fix: Connections failed when urllib3~=1.0.0 is installed (#206)
+## 2.9.2 (2023-08-17)
+- Other: Add `examples/v3_retries_query_execute.py` (#199)
+- Other: suppress log message when `_enable_v3_retries` is not `True` (#199)
+- Other: make this connector backwards compatible with `urllib3>=1.0.0` (#197)
+## 2.9.1 (2023-08-11)
+- Other: Explicitly pin urllib3 to ^2.0.0 (#191)
 ## 2.9.0 (2023-08-10)
-- Replace retry handling with DatabricksRetryPolicy. This is disabled by default. To enable, set `enable_v3_retries=True` when creating `databricks.sql.client`
-- Other: Fix typo in README quick start example
-- Other: Add autospec to Client mocks and tidy up `make_request`
+- Replace retry handling with DatabricksRetryPolicy. This is disabled by default. To enable, set `enable_v3_retries=True` when creating `databricks.sql.client` (#182)
+- Other: Fix typo in README quick start example (#186)
+- Other: Add autospec to Client mocks and tidy up `make_request` (#188)
 ## 2.8.0 (2023-07-21)

{databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: databricks-sql-connector
-Version: 2.9.2.dev1
+Version: 2.9.4b1
 Summary: Databricks SQL Connector for Python
 License: Apache-2.0
 Author: Databricks
@@ -12,6 +12,7 @@ Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
 Requires-Dist: alembic (>=1.0.11,<2.0.0)
 Requires-Dist: lz4 (>=4.0.2,<5.0.0)
 Requires-Dist: numpy (>=1.16.6) ; python_version >= "3.7" and python_version < "3.11"

{databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "databricks-sql-connector"
-version = "2.9.2dev1"
+version = "2.9.4b1"
 description = "Databricks SQL Connector for Python"
 authors = ["Databricks <databricks-sql-connector-maintainers@databricks.com>"]
 license = "Apache-2.0"

{databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/src/databricks/sql/__init__.py RENAMED Viewed

@@ -7,6 +7,38 @@ apilevel = "2.0"
 threadsafety = 1  # Threads may share the module, but not connections.
 paramstyle = "pyformat"  # Python extended format codes, e.g. ...WHERE name=%(name)s
+import re
+class RedactUrlQueryParamsFilter(logging.Filter):
+    """Logging filter that masks the values of URL query parameters.
+
+    Every ``?name=value`` or ``&name=value`` occurrence in a record's message
+    and in its string arguments is rewritten to ``name=<REDACTED>``, so that
+    secrets carried in a URL query string do not reach the log output.
+    """
+
+    # Group 1: the "?" or "&" separator, group 2: the parameter name,
+    # group 3: the value (everything up to the next "&" or whitespace).
+    pattern = re.compile(r"(\?|&)([\w-]+)=([^&\s]+)")
+    # Keep separator and name, replace only the value.
+    mask = r"\1\2=<REDACTED>"
+
+    def redact(self, string):
+        """Return ``str(string)`` with every query-parameter value masked."""
+        return self.pattern.sub(self.mask, str(string))
+
+    def _redact_if_str(self, value):
+        # Only strings are rewritten; other values keep their type so numeric
+        # format specifiers such as %d still work when the record is formatted.
+        return self.redact(value) if isinstance(value, str) else value
+
+    def filter(self, record):
+        """Redact ``record.msg`` and ``record.args``; never drop the record.
+
+        Args:
+            record: The ``logging.LogRecord`` about to be emitted. Its ``msg``
+                is replaced by a redacted string and its ``args`` by a redacted
+                copy of the same shape (dict or tuple).
+
+        Returns:
+            Always ``True``, so the record is still logged.
+        """
+        record.msg = self.redact(record.msg)
+        args = record.args
+        if isinstance(args, dict):
+            # Build a new mapping instead of writing into the dict the caller
+            # passed to the logging call.
+            record.args = {
+                key: self._redact_if_str(value) for key, value in args.items()
+            }
+        elif args:
+            # ``args`` may be None or empty; there is nothing to redact then.
+            record.args = tuple(self._redact_if_str(arg) for arg in args)
+        return True
+# Install the redacting filter on the "urllib3.connectionpool" logger at import
+# time, so query-parameter values in messages logged through it are masked.
+logging.getLogger("urllib3.connectionpool").addFilter(RedactUrlQueryParamsFilter())
 class DBAPITypeObject(object):
     def __init__(self, *values):
@@ -28,7 +60,7 @@ DATETIME = DBAPITypeObject("timestamp")
 DATE = DBAPITypeObject("date")
 ROWID = DBAPITypeObject()
-__version__ = "2.9.2dev1"
+__version__ = "2.9.4b1"
 USER_AGENT_NAME = "PyDatabricksSqlConnector"
 # These two functions are pyhive legacy

{databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/src/databricks/sql/auth/retry.py RENAMED Viewed

@@ -56,8 +56,7 @@ class DatabricksRetryPolicy(Retry):
         `backoff_factor`.
     :param delay_max:
-        Float of seconds for the maximum delay between retries. This is an alias for urllib3's
-        `backoff_max`
+        Float of seconds for the maximum delay between retries.
     :param stop_after_attempts_count:
         Integer maximum number of attempts that will be retried. This is an alias for urllib3's
@@ -122,7 +121,6 @@ class DatabricksRetryPolicy(Retry):
             total=_attempts_remaining,
             respect_retry_after_header=True,
             backoff_factor=self.delay_min,
-            backoff_max=self.delay_max,
             allowed_methods=["POST"],
             status_forcelist=[429, 503, *self.force_dangerous_codes],
         )
@@ -212,13 +210,11 @@ class DatabricksRetryPolicy(Retry):
             allowed_methods=self.allowed_methods,
             status_forcelist=self.status_forcelist,
             backoff_factor=self.backoff_factor,  # type: ignore
-            backoff_max=self.backoff_max,  # type: ignore
             raise_on_redirect=self.raise_on_redirect,
             raise_on_status=self.raise_on_status,
             history=self.history,
             remove_headers_on_redirect=self.remove_headers_on_redirect,
             respect_retry_after_header=self.respect_retry_after_header,
-            backoff_jitter=self.backoff_jitter,  # type: ignore
         )
         # Update urllib3's current state to reflect the incremented counters

{databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/src/databricks/sql/cloudfetch/download_manager.py RENAMED Viewed

@@ -8,6 +8,7 @@ from databricks.sql.cloudfetch.downloader import (
     ResultSetDownloadHandler,
     DownloadableResultSettings,
 )
+from databricks.sql.exc import ResultSetDownloadError
 from databricks.sql.thrift_api.TCLIService.ttypes import TSparkArrowResultLink
 logger = logging.getLogger(__name__)
@@ -34,8 +35,6 @@ class ResultFileDownloadManager:
         self.download_handlers: List[ResultSetDownloadHandler] = []
         self.thread_pool = ThreadPoolExecutor(max_workers=max_download_threads + 1)
         self.downloadable_result_settings = DownloadableResultSettings(lz4_compressed)
-        self.fetch_need_retry = False
-        self.num_consecutive_result_file_download_retries = 0
     def add_file_links(
         self, t_spark_arrow_result_links: List[TSparkArrowResultLink]
@@ -81,13 +80,15 @@ class ResultFileDownloadManager:
         # Find next file
         idx = self._find_next_file_index(next_row_offset)
+        # is this correct?
         if idx is None:
             self._shutdown_manager()
+            logger.debug("could not find next file index")
             return None
         handler = self.download_handlers[idx]
         # Check (and wait) for download status
-        if self._check_if_download_successful(handler):
+        if handler.is_file_download_successful():
             # Buffer should be empty so set buffer to new ArrowQueue with result_file
             result = DownloadedFile(
                 handler.result_file,
@@ -97,9 +98,11 @@ class ResultFileDownloadManager:
             self.download_handlers.pop(idx)
             # Return True upon successful download to continue loop and not force a retry
             return result
-        # Download was not successful for next download item, force a retry
+        # Download was not successful for next download item. Fail
         self._shutdown_manager()
-        return None
+        raise ResultSetDownloadError(
+            f"Download failed for result set starting at {next_row_offset}"
+        )
     def _remove_past_handlers(self, next_row_offset: int):
         # Any link in which its start to end range doesn't include the next row to be fetched does not need downloading
@@ -133,33 +136,6 @@ class ResultFileDownloadManager:
         ]
         return next_indices[0] if len(next_indices) > 0 else None
-    def _check_if_download_successful(self, handler: ResultSetDownloadHandler):
-        # Check (and wait until download finishes) if download was successful
-        if not handler.is_file_download_successful():
-            if handler.is_link_expired:
-                self.fetch_need_retry = True
-                return False
-            elif handler.is_download_timedout:
-                # Consecutive file retries should not exceed threshold in settings
-                if (
-                    self.num_consecutive_result_file_download_retries
-                    >= self.downloadable_result_settings.max_consecutive_file_download_retries
-                ):
-                    self.fetch_need_retry = True
-                    return False
-                self.num_consecutive_result_file_download_retries += 1
-                # Re-submit handler run to thread pool and recursively check download status
-                self.thread_pool.submit(handler.run)
-                return self._check_if_download_successful(handler)
-            else:
-                self.fetch_need_retry = True
-                return False
-        self.num_consecutive_result_file_download_retries = 0
-        self.fetch_need_retry = False
-        return True
     def _shutdown_manager(self):
         # Clear download handlers and shutdown the thread pool
         self.download_handlers = []

{databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/src/databricks/sql/cloudfetch/downloader.py RENAMED Viewed

@@ -1,15 +1,17 @@
 import logging
 from dataclasses import dataclass
 import requests
 import lz4.frame
 import threading
 import time
+import os
+import re
 from databricks.sql.thrift_api.TCLIService.ttypes import TSparkArrowResultLink
 logger = logging.getLogger(__name__)
+DEFAULT_CLOUD_FILE_TIMEOUT = int(os.getenv("DATABRICKS_CLOUD_FILE_TIMEOUT", 60))
 @dataclass
 class DownloadableResultSettings:
@@ -20,13 +22,17 @@ class DownloadableResultSettings:
         is_lz4_compressed (bool): Whether file is expected to be lz4 compressed.
         link_expiry_buffer_secs (int): Time in seconds to prevent download of a link before it expires. Default 0 secs.
         download_timeout (int): Timeout for download requests. Default 60 secs.
-        max_consecutive_file_download_retries (int): Number of consecutive download retries before shutting down.
+        max_retries (int): Maximum number of download attempts per file before giving up. Default 5.
+        backoff_factor (int): Factor to increase wait time between retries.
     """
     is_lz4_compressed: bool
     link_expiry_buffer_secs: int = 0
-    download_timeout: int = 60
-    max_consecutive_file_download_retries: int = 0
+    download_timeout: int = DEFAULT_CLOUD_FILE_TIMEOUT
+    max_retries: int = 5
+    backoff_factor: int = 2
 class ResultSetDownloadHandler(threading.Thread):
@@ -57,16 +63,21 @@ class ResultSetDownloadHandler(threading.Thread):
             else None
         )
         try:
+            logger.debug(
+                f"waiting for at most {timeout} seconds for download file: startRow {self.result_link.startRowOffset}, rowCount {self.result_link.rowCount}, endRow {self.result_link.startRowOffset + self.result_link.rowCount}"
+            )
             if not self.is_download_finished.wait(timeout=timeout):
                 self.is_download_timedout = True
-                logger.debug(
-                    "Cloud fetch download timed out after {} seconds for link representing rows {} to {}".format(
-                        self.settings.download_timeout,
-                        self.result_link.startRowOffset,
-                        self.result_link.startRowOffset + self.result_link.rowCount,
-                    )
+                logger.error(
+                    f"cloud fetch download timed out after {self.settings.download_timeout} seconds for link representing rows {self.result_link.startRowOffset} to {self.result_link.startRowOffset + self.result_link.rowCount}"
                 )
-                return False
+                # there are some weird cases when the is_download_finished is not set, but the file is downloaded successfully
+                return self.is_file_downloaded_successfully
+            logger.debug(
+                f"finish waiting for download file: startRow {self.result_link.startRowOffset}, rowCount {self.result_link.rowCount}, endRow {self.result_link.startRowOffset + self.result_link.rowCount}"
+            )
         except Exception as e:
             logger.error(e)
             return False
@@ -81,24 +92,36 @@ class ResultSetDownloadHandler(threading.Thread):
         """
         self._reset()
-        # Check if link is already expired or is expiring
-        if ResultSetDownloadHandler.check_link_expired(
-            self.result_link, self.settings.link_expiry_buffer_secs
-        ):
-            self.is_link_expired = True
-            return
+        try:
+            # Check if link is already expired or is expiring
+            if ResultSetDownloadHandler.check_link_expired(
+                self.result_link, self.settings.link_expiry_buffer_secs
+            ):
+                self.is_link_expired = True
+                return
-        session = requests.Session()
-        session.timeout = self.settings.download_timeout
+            logger.debug(
+                f"started to download file: startRow {self.result_link.startRowOffset}, rowCount {self.result_link.rowCount}, endRow {self.result_link.startRowOffset + self.result_link.rowCount}"
+            )
-        try:
             # Get the file via HTTP request
-            response = session.get(self.result_link.fileLink)
+            response = http_get_with_retry(
+                url=self.result_link.fileLink,
+                max_retries=self.settings.max_retries,
+                backoff_factor=self.settings.backoff_factor,
+                download_timeout=self.settings.download_timeout,
+            )
-            if not response.ok:
-                self.is_file_downloaded_successfully = False
+            if not response:
+                logger.error(
+                    f"failed downloading file: startRow {self.result_link.startRowOffset}, rowCount {self.result_link.rowCount}, endRow {self.result_link.startRowOffset + self.result_link.rowCount}"
+                )
                 return
+            logger.debug(
+                f"success downloading file: startRow {self.result_link.startRowOffset}, rowCount {self.result_link.rowCount}, endRow {self.result_link.startRowOffset + self.result_link.rowCount}"
+            )
             # Save (and decompress if needed) the downloaded file
             compressed_data = response.content
             decompressed_data = (
@@ -109,15 +132,22 @@ class ResultSetDownloadHandler(threading.Thread):
             self.result_file = decompressed_data
             # The size of the downloaded file should match the size specified from TSparkArrowResultLink
-            self.is_file_downloaded_successfully = (
-                len(self.result_file) == self.result_link.bytesNum
+            success = len(self.result_file) == self.result_link.bytesNum
+            logger.debug(
+                f"download successful file: startRow {self.result_link.startRowOffset}, rowCount {self.result_link.rowCount}, endRow {self.result_link.startRowOffset + self.result_link.rowCount}"
             )
+            self.is_file_downloaded_successfully = success
         except Exception as e:
+            logger.error(
+                f"exception downloading file: startRow {self.result_link.startRowOffset}, rowCount {self.result_link.rowCount}, endRow {self.result_link.startRowOffset + self.result_link.rowCount}"
+            )
             logger.error(e)
             self.is_file_downloaded_successfully = False
         finally:
-            session and session.close()
+            logger.debug(
+                f"signal finished file: startRow {self.result_link.startRowOffset}, rowCount {self.result_link.rowCount}, endRow {self.result_link.startRowOffset + self.result_link.rowCount}"
+            )
             # Awaken threads waiting for this to be true which signals the run is complete
             self.is_download_finished.set()
@@ -145,6 +175,7 @@ class ResultSetDownloadHandler(threading.Thread):
             link.expiryTime < current_time
             or link.expiryTime - current_time < expiry_buffer_secs
         ):
+            logger.debug("link expired")
             return True
         return False
@@ -171,3 +202,38 @@ class ResultSetDownloadHandler(threading.Thread):
                 uncompressed_data += data
                 start += num_bytes
         return uncompressed_data
+def http_get_with_retry(url, max_retries=5, backoff_factor=2, download_timeout=60):
+    """Download ``url`` with an HTTP GET, retrying with exponential backoff.
+
+    Args:
+        url (str): URL to fetch. It may be a pre-signed URL, so exception text
+            that could contain it is redacted before being logged.
+        max_retries (int): Maximum number of attempts before giving up.
+        backoff_factor (int): Base of the exponential wait; after failed attempt
+            ``n`` (counting from 0) the wait is ``backoff_factor ** n`` seconds.
+        download_timeout (int): Timeout in seconds applied to each request.
+
+    Returns:
+        The ``requests.Response`` of the first attempt answered with status 200,
+        or ``None`` when every attempt failed.
+    """
+    pattern = re.compile(r"(\?|&)([\w-]+)=([^&\s]+)")
+    mask = r"\1\2=<REDACTED>"
+
+    # TODO: introduce connection pooling. I am seeing weird errors without it.
+    for attempt in range(max_retries):
+        try:
+            # A fresh session per attempt; the context manager closes it even
+            # when the request raises.
+            with requests.Session() as session:
+                # requests does not honour a ``timeout`` attribute set on the
+                # session, so the timeout has to be passed with the request.
+                response = session.get(url, timeout=download_timeout)
+
+            # Only a plain 200 counts as success
+            if response.status_code == 200:
+                return response
+            logger.error(response)
+        except requests.RequestException as e:
+            # if this is not redacted, it will print the pre-signed URL
+            logger.error(f"request failed with exception: {re.sub(pattern, mask, str(e))}")
+
+        # Exponential backoff before the next attempt. After the last attempt
+        # there is nothing left to wait for, so fail immediately.
+        if attempt < max_retries - 1:
+            wait_time = backoff_factor**attempt
+            logger.info(f"retrying in {wait_time} seconds...")
+            time.sleep(wait_time)
+
+    logger.error(
+        f"exceeded maximum number of retries ({max_retries}) while downloading result."
+    )
+    return None

{databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/src/databricks/sql/exc.py RENAMED Viewed

@@ -115,3 +115,7 @@ class SessionAlreadyClosedError(RequestError):
 class CursorAlreadyClosedError(RequestError):
     """Thrown if CancelOperation receives a code 404. ThriftBackend should gracefully proceed as this is expected."""
+class ResultSetDownloadError(RequestError):
+    """Raised when downloading a result set fails."""

{databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/src/databricks/sql/thrift_backend.py RENAMED Viewed

@@ -371,13 +371,16 @@ class ThriftBackend:
                 this_method_name = getattr(method, "__name__")
-                logger.debug("Sending request: {}(<REDACTED>)".format(this_method_name))
+                logger.debug(
+                    "sending thrift request: {}(<REDACTED>)".format(this_method_name)
+                )
                 unsafe_logger.debug("Sending request: {}".format(request))
                 # These three lines are no-ops if the v3 retry policy is not in use
-                this_command_type = CommandType.get(this_method_name)
-                self._transport.set_retry_command_type(this_command_type)
-                self._transport.startRetryTimer()
+                if self.enable_v3_retries:
+                    this_command_type = CommandType.get(this_method_name)
+                    self._transport.set_retry_command_type(this_command_type)
+                    self._transport.startRetryTimer()
                 response = method(request)
@@ -386,7 +389,9 @@ class ThriftBackend:
                 # We need to call type(response) here because thrift doesn't implement __name__ attributes for thrift responses
                 logger.debug(
-                    "Received response: {}(<REDACTED>)".format(type(response).__name__)
+                    "received thrift response: {}(<REDACTED>)".format(
+                        type(response).__name__
+                    )
                 )
                 unsafe_logger.debug("Received response: {}".format(response))
                 return response
@@ -740,6 +745,7 @@ class ThriftBackend:
         lz4_compressed = t_result_set_metadata_resp.lz4Compressed
         is_staging_operation = t_result_set_metadata_resp.isStagingOperation
         if direct_results and direct_results.resultSet:
+            logger.debug(f"received direct results")
             assert direct_results.resultSet.results.startRowOffset == 0
             assert direct_results.resultSetMetadata
@@ -752,6 +758,7 @@ class ThriftBackend:
                 description=description,
             )
         else:
+            logger.debug(f"must fetch results")
             arrow_queue_opt = None
         return ExecuteResponse(
             arrow_queue=arrow_queue_opt,
@@ -815,6 +822,10 @@ class ThriftBackend:
     ):
         assert session_handle is not None
+        logger.debug(
+            f"executing: cloud fetch: {use_cloud_fetch}, max rows: {max_rows}, max bytes: {max_bytes}"
+        )
         spark_arrow_types = ttypes.TSparkArrowTypes(
             timestampAsArrow=self._use_arrow_native_timestamps,
             decimalAsArrow=self._use_arrow_native_decimals,
@@ -929,6 +940,7 @@ class ThriftBackend:
         return self._handle_execute_response(resp, cursor)
     def _handle_execute_response(self, resp, cursor):
+        logger.debug(f"got execute response")
         cursor.active_op_handle = resp.operationHandle
         self._check_direct_results_for_error(resp.directResults)
@@ -949,6 +961,7 @@ class ThriftBackend:
         arrow_schema_bytes,
         description,
     ):
+        logger.debug("started to fetch results")
         assert op_handle is not None
         req = ttypes.TFetchResultsReq(

{databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/src/databricks/sql/utils.py RENAMED Viewed

@@ -5,6 +5,7 @@ from decimal import Decimal
 import datetime
 import decimal
 from enum import Enum
+import logging
 import lz4.frame
 from typing import Dict, List, Union, Any
 import pyarrow
@@ -18,6 +19,7 @@ from databricks.sql.thrift_api.TCLIService.ttypes import (
 )
 BIT_MASKS = [1, 2, 4, 8, 16, 32, 64, 128]
+logger = logging.getLogger(__name__)
 class ResultSetQueue(ABC):
@@ -71,6 +73,9 @@ class ResultSetQueueFactory(ABC):
             )
             return ArrowQueue(converted_arrow_table, n_valid_rows)
         elif row_set_type == TSparkRowSetType.URL_BASED_SET:
+            logger.debug(
+                f"built cloud fetch queue for {len(t_row_set.resultLinks)} links."
+            )
             return CloudFetchQueue(
                 arrow_schema_bytes,
                 start_row_offset=t_row_set.startRowOffset,
@@ -146,6 +151,9 @@ class CloudFetchQueue(ResultSetQueue):
         self.lz4_compressed = lz4_compressed
         self.description = description
+        logger.debug(
+            f"creating cloud fetch queue for {len(result_links)} links and max_download_threads {self.max_download_threads}."
+        )
         self.download_manager = ResultFileDownloadManager(
             self.max_download_threads, self.lz4_compressed
         )