PyPI - databricks-sql-connector - Versions diffs - 3.3.0__tar.gz → 3.5.0__tar.gz - Mend

databricks-sql-connector 3.3.0__tar.gz → 3.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (59) hide show

{databricks_sql_connector-3.3.0 → databricks_sql_connector-3.5.0}/CHANGELOG.md RENAMED Viewed

@@ -1,5 +1,16 @@
 # Release History
+# 3.5.0 (2024-10-18)
+- Create a non pyarrow flow to handle small results for the column set (databricks/databricks-sql-python#440 by @jprakash-db)
+- Fix: On non-retryable error, ensure PySQL includes useful information in error (databricks/databricks-sql-python#447 by @shivam2680)
+# 3.4.0 (2024-08-27)
+- Unpin pandas to support v2.2.2 (databricks/databricks-sql-python#416 by @kfollesdal)
+- Make OAuth as the default authenticator if no authentication setting is provided (databricks/databricks-sql-python#419 by @jackyhu-db)
+- Fix (regression): use SSL options with HTTPS connection pool (databricks/databricks-sql-python#425 by @kravets-levko)
 # 3.3.0 (2024-07-18)
 - Don't retry requests that fail with HTTP code 401 (databricks/databricks-sql-python#408 by @Hodnebo)

{databricks_sql_connector-3.3.0 → databricks_sql_connector-3.5.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: databricks-sql-connector
-Version: 3.3.0
+Version: 3.5.0
 Summary: Databricks SQL Connector for Python
 License: Apache-2.0
 Author: Databricks
@@ -13,6 +13,7 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
 Provides-Extra: alembic
 Provides-Extra: sqlalchemy
 Requires-Dist: alembic (>=1.0.11,<2.0.0) ; extra == "alembic"
@@ -21,7 +22,7 @@ Requires-Dist: numpy (>=1.16.6,<2.0.0) ; python_version >= "3.8" and python_vers
 Requires-Dist: numpy (>=1.23.4,<2.0.0) ; python_version >= "3.11"
 Requires-Dist: oauthlib (>=3.1.0,<4.0.0)
 Requires-Dist: openpyxl (>=3.0.10,<4.0.0)
-Requires-Dist: pandas (>=1.2.5,<2.2.0) ; python_version >= "3.8"
+Requires-Dist: pandas (>=1.2.5,<2.3.0) ; python_version >= "3.8"
 Requires-Dist: pyarrow (>=14.0.1,<17)
 Requires-Dist: requests (>=2.18.1,<3.0.0)
 Requires-Dist: sqlalchemy (>=2.0.21) ; extra == "sqlalchemy" or extra == "alembic"
@@ -57,12 +58,9 @@ For the latest documentation, see
 Install the library with `pip install databricks-sql-connector`
-Note: Don't hard-code authentication secrets into your Python. Use environment variables
 ```bash
 export DATABRICKS_HOST=********.databricks.com
 export DATABRICKS_HTTP_PATH=/sql/1.0/endpoints/****************
-export DATABRICKS_TOKEN=dapi********************************
 ```
 Example usage:
@@ -72,12 +70,10 @@ from databricks import sql
 host = os.getenv("DATABRICKS_HOST")
 http_path = os.getenv("DATABRICKS_HTTP_PATH")
-access_token = os.getenv("DATABRICKS_TOKEN")
 connection = sql.connect(
   server_hostname=host,
-  http_path=http_path,
-  access_token=access_token)
+  http_path=http_path)
 cursor = connection.cursor()
 cursor.execute('SELECT :param `p`, * FROM RANGE(10)', {"param": "foo"})
@@ -93,7 +89,10 @@ In the above example:
 - `server-hostname` is the Databricks instance host name.
 - `http-path` is the HTTP Path either to a Databricks SQL endpoint (e.g. /sql/1.0/endpoints/1234567890abcdef),
 or to a Databricks Runtime interactive cluster (e.g. /sql/protocolv1/o/1234567890123456/1234-123456-slid123)
-- `personal-access-token` is the Databricks Personal Access Token for the account that will execute commands and queries
+> Note: This example uses [Databricks OAuth U2M](https://docs.databricks.com/en/dev-tools/auth/oauth-u2m.html)
+> to authenticate the target Databricks user account and needs to open the browser for authentication. So it
+> can only run on the user's machine.
 ## Contributing

{databricks_sql_connector-3.3.0 → databricks_sql_connector-3.5.0}/README.md RENAMED Viewed

@@ -24,12 +24,9 @@ For the latest documentation, see
 Install the library with `pip install databricks-sql-connector`
-Note: Don't hard-code authentication secrets into your Python. Use environment variables
 ```bash
 export DATABRICKS_HOST=********.databricks.com
 export DATABRICKS_HTTP_PATH=/sql/1.0/endpoints/****************
-export DATABRICKS_TOKEN=dapi********************************
 ```
 Example usage:
@@ -39,12 +36,10 @@ from databricks import sql
 host = os.getenv("DATABRICKS_HOST")
 http_path = os.getenv("DATABRICKS_HTTP_PATH")
-access_token = os.getenv("DATABRICKS_TOKEN")
 connection = sql.connect(
   server_hostname=host,
-  http_path=http_path,
-  access_token=access_token)
+  http_path=http_path)
 cursor = connection.cursor()
 cursor.execute('SELECT :param `p`, * FROM RANGE(10)', {"param": "foo"})
@@ -60,7 +55,10 @@ In the above example:
 - `server-hostname` is the Databricks instance host name.
 - `http-path` is the HTTP Path either to a Databricks SQL endpoint (e.g. /sql/1.0/endpoints/1234567890abcdef),
 or to a Databricks Runtime interactive cluster (e.g. /sql/protocolv1/o/1234567890123456/1234-123456-slid123)
-- `personal-access-token` is the Databricks Personal Access Token for the account that will execute commands and queries
+> Note: This example uses [Databricks OAuth U2M](https://docs.databricks.com/en/dev-tools/auth/oauth-u2m.html)
+> to authenticate the target Databricks user account and needs to open the browser for authentication. So it
+> can only run on the user's machine.
 ## Contributing

{databricks_sql_connector-3.3.0 → databricks_sql_connector-3.5.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "databricks-sql-connector"
-version = "3.3.0"
+version = "3.5.0"
 description = "Databricks SQL Connector for Python"
 authors = ["Databricks <databricks-sql-connector-maintainers@databricks.com>"]
 license = "Apache-2.0"
@@ -12,7 +12,7 @@ include = ["CHANGELOG.md"]
 python = "^3.8.0"
 thrift = ">=0.16.0,<0.21.0"
 pandas = [
-    { version = ">=1.2.5,<2.2.0", python = ">=3.8" }
+    { version = ">=1.2.5,<2.3.0", python = ">=3.8" }
 ]
 pyarrow = ">=14.0.1,<17"

{databricks_sql_connector-3.3.0 → databricks_sql_connector-3.5.0}/src/databricks/sql/__init__.py RENAMED Viewed

@@ -68,7 +68,7 @@ DATETIME = DBAPITypeObject("timestamp")
 DATE = DBAPITypeObject("date")
 ROWID = DBAPITypeObject()
-__version__ = "3.3.0"
+__version__ = "3.5.0"
 USER_AGENT_NAME = "PyDatabricksSqlConnector"
 # These two functions are pyhive legacy

{databricks_sql_connector-3.3.0 → databricks_sql_connector-3.5.0}/src/databricks/sql/auth/auth.py RENAMED Viewed

@@ -64,7 +64,20 @@ def get_auth_provider(cfg: ClientContext):
         # no op authenticator. authentication is performed using ssl certificate outside of headers
         return AuthProvider()
     else:
-        raise RuntimeError("No valid authentication settings!")
+        if (
+            cfg.oauth_redirect_port_range is not None
+            and cfg.oauth_client_id is not None
+            and cfg.oauth_scopes is not None
+        ):
+            return DatabricksOAuthProvider(
+                cfg.hostname,
+                cfg.oauth_persistence,
+                cfg.oauth_redirect_port_range,
+                cfg.oauth_client_id,
+                cfg.oauth_scopes,
+            )
+        else:
+            raise RuntimeError("No valid authentication settings!")
 PYSQL_OAUTH_SCOPES = ["sql", "offline_access"]

{databricks_sql_connector-3.3.0 → databricks_sql_connector-3.5.0}/src/databricks/sql/auth/thrift_http_client.py RENAMED Viewed

@@ -1,13 +1,11 @@
 import base64
 import logging
 import urllib.parse
-from typing import Dict, Union
+from typing import Dict, Union, Optional
 import six
 import thrift
-logger = logging.getLogger(__name__)
 import ssl
 import warnings
 from http.client import HTTPResponse
@@ -16,6 +14,9 @@ from io import BytesIO
 from urllib3 import HTTPConnectionPool, HTTPSConnectionPool, ProxyManager
 from urllib3.util import make_headers
 from databricks.sql.auth.retry import CommandType, DatabricksRetryPolicy
+from databricks.sql.types import SSLOptions
+logger = logging.getLogger(__name__)
 class THttpClient(thrift.transport.THttpClient.THttpClient):
@@ -25,13 +26,12 @@ class THttpClient(thrift.transport.THttpClient.THttpClient):
         uri_or_host,
         port=None,
         path=None,
-        cafile=None,
-        cert_file=None,
-        key_file=None,
-        ssl_context=None,
+        ssl_options: Optional[SSLOptions] = None,
         max_connections: int = 1,
         retry_policy: Union[DatabricksRetryPolicy, int] = 0,
     ):
+        self._ssl_options = ssl_options
         if port is not None:
             warnings.warn(
                 "Please use the THttpClient('http{s}://host:port/path') constructor",
@@ -48,13 +48,11 @@ class THttpClient(thrift.transport.THttpClient.THttpClient):
             self.scheme = parsed.scheme
             assert self.scheme in ("http", "https")
             if self.scheme == "https":
-                self.certfile = cert_file
-                self.keyfile = key_file
-                self.context = (
-                    ssl.create_default_context(cafile=cafile)
-                    if (cafile and not ssl_context)
-                    else ssl_context
-                )
+                if self._ssl_options is not None:
+                    # TODO: Not sure if those options are used anywhere - need to double-check
+                    self.certfile = self._ssl_options.tls_client_cert_file
+                    self.keyfile = self._ssl_options.tls_client_cert_key_file
+                    self.context = self._ssl_options.create_ssl_context()
             self.port = parsed.port
             self.host = parsed.hostname
             self.path = parsed.path
@@ -109,12 +107,23 @@ class THttpClient(thrift.transport.THttpClient.THttpClient):
     def open(self):
         # self.__pool replaces the self.__http used by the original THttpClient
+        _pool_kwargs = {"maxsize": self.max_connections}
         if self.scheme == "http":
             pool_class = HTTPConnectionPool
         elif self.scheme == "https":
             pool_class = HTTPSConnectionPool
-        _pool_kwargs = {"maxsize": self.max_connections}
+            _pool_kwargs.update(
+                {
+                    "cert_reqs": ssl.CERT_REQUIRED
+                    if self._ssl_options.tls_verify
+                    else ssl.CERT_NONE,
+                    "ca_certs": self._ssl_options.tls_trusted_ca_file,
+                    "cert_file": self._ssl_options.tls_client_cert_file,
+                    "key_file": self._ssl_options.tls_client_cert_key_file,
+                    "key_password": self._ssl_options.tls_client_cert_key_password,
+                }
+            )
         if self.using_proxy():
             proxy_manager = ProxyManager(

{databricks_sql_connector-3.3.0 → databricks_sql_connector-3.5.0}/src/databricks/sql/client.py RENAMED Viewed

@@ -1,7 +1,11 @@
 from typing import Dict, Tuple, List, Optional, Any, Union, Sequence
 import pandas
-import pyarrow
+try:
+    import pyarrow
+except ImportError:
+    pyarrow = None
 import requests
 import json
 import os
@@ -22,6 +26,8 @@ from databricks.sql.utils import (
     ParamEscaper,
     inject_parameters,
     transform_paramstyle,
+    ColumnTable,
+    ColumnQueue,
 )
 from databricks.sql.parameters.native import (
     DbsqlParameterBase,
@@ -35,7 +41,7 @@ from databricks.sql.parameters.native import (
 )
-from databricks.sql.types import Row
+from databricks.sql.types import Row, SSLOptions
 from databricks.sql.auth.auth import get_python_sql_connector_auth_provider
 from databricks.sql.experimental.oauth_persistence import OAuthPersistence
@@ -96,7 +102,7 @@ class Connection:
                 sanitise parameterized inputs to prevent SQL injection.  The inline parameter approach is maintained for
                 legacy purposes and will be deprecated in a future release. When this parameter is `True` you will see
                 a warning log message. To suppress this log message, set `use_inline_params="silent"`.
-            auth_type: `str`, optional
+            auth_type: `str`, optional (default is databricks-oauth if neither `access_token` nor `tls_client_cert_file` is set)
                 `databricks-oauth` : to use Databricks OAuth with fine-grained permission scopes, set to `databricks-oauth`.
                 `azure-oauth` : to use Microsoft Entra ID OAuth flow, set to `azure-oauth`.
@@ -178,8 +184,9 @@ class Connection:
         # _tls_trusted_ca_file
         #   Set to the path of the file containing trusted CA certificates for server certificate
         #   verification. If not provide, uses system truststore.
-        # _tls_client_cert_file, _tls_client_cert_key_file
+        # _tls_client_cert_file, _tls_client_cert_key_file, _tls_client_cert_key_password
         #   Set client SSL certificate.
+        #   See https://docs.python.org/3/library/ssl.html#ssl.SSLContext.load_cert_chain
         # _retry_stop_after_attempts_count
         #  The maximum number of attempts during a request retry sequence (defaults to 24)
         # _socket_timeout
@@ -220,12 +227,25 @@ class Connection:
         base_headers = [("User-Agent", useragent_header)]
+        self._ssl_options = SSLOptions(
+            # Double negation is generally a bad thing, but we have to keep backward compatibility
+            tls_verify=not kwargs.get(
+                "_tls_no_verify", False
+            ),  # by default - verify cert and host
+            tls_verify_hostname=kwargs.get("_tls_verify_hostname", True),
+            tls_trusted_ca_file=kwargs.get("_tls_trusted_ca_file"),
+            tls_client_cert_file=kwargs.get("_tls_client_cert_file"),
+            tls_client_cert_key_file=kwargs.get("_tls_client_cert_key_file"),
+            tls_client_cert_key_password=kwargs.get("_tls_client_cert_key_password"),
+        )
         self.thrift_backend = ThriftBackend(
             self.host,
             self.port,
             http_path,
             (http_headers or []) + base_headers,
             auth_provider,
+            ssl_options=self._ssl_options,
             _use_arrow_native_complex_types=_use_arrow_native_complex_types,
             **kwargs,
         )
@@ -977,14 +997,14 @@ class Cursor:
         else:
             raise Error("There is no active result set")
-    def fetchall_arrow(self) -> pyarrow.Table:
+    def fetchall_arrow(self) -> "pyarrow.Table":
         self._check_not_closed()
         if self.active_result_set:
             return self.active_result_set.fetchall_arrow()
         else:
             raise Error("There is no active result set")
-    def fetchmany_arrow(self, size) -> pyarrow.Table:
+    def fetchmany_arrow(self, size) -> "pyarrow.Table":
         self._check_not_closed()
         if self.active_result_set:
             return self.active_result_set.fetchmany_arrow(size)
@@ -1129,6 +1149,18 @@ class ResultSet:
         self.results = results
         self.has_more_rows = has_more_rows
+    def _convert_columnar_table(self, table):
+        column_names = [c[0] for c in self.description]
+        ResultRow = Row(*column_names)
+        result = []
+        for row_index in range(table.num_rows):
+            curr_row = []
+            for col_index in range(table.num_columns):
+                curr_row.append(table.get_item(col_index, row_index))
+            result.append(ResultRow(*curr_row))
+        return result
     def _convert_arrow_table(self, table):
         column_names = [c[0] for c in self.description]
         ResultRow = Row(*column_names)
@@ -1164,14 +1196,14 @@ class ResultSet:
             timestamp_as_object=True,
         )
-        res = df.to_numpy(na_value=None)
+        res = df.to_numpy(na_value=None, dtype="object")
         return [ResultRow(*v) for v in res]
     @property
     def rownumber(self):
         return self._next_row_index
-    def fetchmany_arrow(self, size: int) -> pyarrow.Table:
+    def fetchmany_arrow(self, size: int) -> "pyarrow.Table":
         """
         Fetch the next set of rows of a query result, returning a PyArrow table.
@@ -1196,7 +1228,49 @@ class ResultSet:
         return results
-    def fetchall_arrow(self) -> pyarrow.Table:
+    def merge_columnar(self, result1, result2):
+        """
+        Function to merge / combining the columnar results into a single result
+        :param result1:
+        :param result2:
+        :return:
+        """
+        if result1.column_names != result2.column_names:
+            raise ValueError("The columns in the results don't match")
+        merged_result = [
+            result1.column_table[i] + result2.column_table[i]
+            for i in range(result1.num_columns)
+        ]
+        return ColumnTable(merged_result, result1.column_names)
+    def fetchmany_columnar(self, size: int):
+        """
+        Fetch the next set of rows of a query result, returning a Columnar Table.
+        An empty sequence is returned when no more rows are available.
+        """
+        if size < 0:
+            raise ValueError("size argument for fetchmany is %s but must be >= 0", size)
+        results = self.results.next_n_rows(size)
+        n_remaining_rows = size - results.num_rows
+        self._next_row_index += results.num_rows
+        while (
+            n_remaining_rows > 0
+            and not self.has_been_closed_server_side
+            and self.has_more_rows
+        ):
+            self._fill_results_buffer()
+            partial_results = self.results.next_n_rows(n_remaining_rows)
+            results = self.merge_columnar(results, partial_results)
+            n_remaining_rows -= partial_results.num_rows
+            self._next_row_index += partial_results.num_rows
+        return results
+    def fetchall_arrow(self) -> "pyarrow.Table":
         """Fetch all (remaining) rows of a query result, returning them as a PyArrow table."""
         results = self.results.remaining_rows()
         self._next_row_index += results.num_rows
@@ -1209,12 +1283,30 @@ class ResultSet:
         return results
+    def fetchall_columnar(self):
+        """Fetch all (remaining) rows of a query result, returning them as a Columnar table."""
+        results = self.results.remaining_rows()
+        self._next_row_index += results.num_rows
+        while not self.has_been_closed_server_side and self.has_more_rows:
+            self._fill_results_buffer()
+            partial_results = self.results.remaining_rows()
+            results = self.merge_columnar(results, partial_results)
+            self._next_row_index += partial_results.num_rows
+        return results
     def fetchone(self) -> Optional[Row]:
         """
         Fetch the next row of a query result set, returning a single sequence,
         or None when no more data is available.
         """
-        res = self._convert_arrow_table(self.fetchmany_arrow(1))
+        if isinstance(self.results, ColumnQueue):
+            res = self._convert_columnar_table(self.fetchmany_columnar(1))
+        else:
+            res = self._convert_arrow_table(self.fetchmany_arrow(1))
         if len(res) > 0:
             return res[0]
         else:
@@ -1224,7 +1316,10 @@ class ResultSet:
         """
         Fetch all (remaining) rows of a query result, returning them as a list of rows.
         """
-        return self._convert_arrow_table(self.fetchall_arrow())
+        if isinstance(self.results, ColumnQueue):
+            return self._convert_columnar_table(self.fetchall_columnar())
+        else:
+            return self._convert_arrow_table(self.fetchall_arrow())
     def fetchmany(self, size: int) -> List[Row]:
         """
@@ -1232,7 +1327,10 @@ class ResultSet:
         An empty sequence is returned when no more rows are available.
         """
-        return self._convert_arrow_table(self.fetchmany_arrow(size))
+        if isinstance(self.results, ColumnQueue):
+            return self._convert_columnar_table(self.fetchmany_columnar(size))
+        else:
+            return self._convert_arrow_table(self.fetchmany_arrow(size))
     def close(self) -> None:
         """

{databricks_sql_connector-3.3.0 → databricks_sql_connector-3.5.0}/src/databricks/sql/cloudfetch/download_manager.py RENAMED Viewed

@@ -1,6 +1,5 @@
 import logging
-from ssl import SSLContext
 from concurrent.futures import ThreadPoolExecutor, Future
 from typing import List, Union
@@ -9,6 +8,8 @@ from databricks.sql.cloudfetch.downloader import (
     DownloadableResultSettings,
     DownloadedFile,
 )
+from databricks.sql.types import SSLOptions
 from databricks.sql.thrift_api.TCLIService.ttypes import TSparkArrowResultLink
 logger = logging.getLogger(__name__)
@@ -20,7 +21,7 @@ class ResultFileDownloadManager:
         links: List[TSparkArrowResultLink],
         max_download_threads: int,
         lz4_compressed: bool,
-        ssl_context: SSLContext,
+        ssl_options: SSLOptions,
     ):
         self._pending_links: List[TSparkArrowResultLink] = []
         for link in links:
@@ -38,7 +39,7 @@ class ResultFileDownloadManager:
         self._thread_pool = ThreadPoolExecutor(max_workers=self._max_download_threads)
         self._downloadable_result_settings = DownloadableResultSettings(lz4_compressed)
-        self._ssl_context = ssl_context
+        self._ssl_options = ssl_options
     def get_next_downloaded_file(
         self, next_row_offset: int
@@ -95,7 +96,7 @@ class ResultFileDownloadManager:
             handler = ResultSetDownloadHandler(
                 settings=self._downloadable_result_settings,
                 link=link,
-                ssl_context=self._ssl_context,
+                ssl_options=self._ssl_options,
             )
             task = self._thread_pool.submit(handler.run)
             self._download_tasks.append(task)

{databricks_sql_connector-3.3.0 → databricks_sql_connector-3.5.0}/src/databricks/sql/cloudfetch/downloader.py RENAMED Viewed

@@ -3,13 +3,12 @@ from dataclasses import dataclass
 import requests
 from requests.adapters import HTTPAdapter, Retry
-from ssl import SSLContext, CERT_NONE
 import lz4.frame
 import time
 from databricks.sql.thrift_api.TCLIService.ttypes import TSparkArrowResultLink
 from databricks.sql.exc import Error
+from databricks.sql.types import SSLOptions
 logger = logging.getLogger(__name__)
@@ -66,11 +65,11 @@ class ResultSetDownloadHandler:
         self,
         settings: DownloadableResultSettings,
         link: TSparkArrowResultLink,
-        ssl_context: SSLContext,
+        ssl_options: SSLOptions,
     ):
         self.settings = settings
         self.link = link
-        self._ssl_context = ssl_context
+        self._ssl_options = ssl_options
     def run(self) -> DownloadedFile:
         """
@@ -95,14 +94,13 @@ class ResultSetDownloadHandler:
         session.mount("http://", HTTPAdapter(max_retries=retryPolicy))
         session.mount("https://", HTTPAdapter(max_retries=retryPolicy))
-        ssl_verify = self._ssl_context.verify_mode != CERT_NONE
         try:
             # Get the file via HTTP request
             response = session.get(
                 self.link.fileLink,
                 timeout=self.settings.download_timeout,
-                verify=ssl_verify,
+                verify=self._ssl_options.tls_verify,
+                # TODO: Pass cert from `self._ssl_options`
             )
             response.raise_for_status()

databricks-sql-connector 3.3.0__tar.gz → 3.5.0__tar.gz

databricks-sql-connector 3.3.0__tar.gz → 3.5.0__tar.gz