PyPI - databricks-sql-connector - Versions diffs - 3.4.0__tar.gz → 3.5.0__tar.gz - Mend

databricks-sql-connector 3.4.0.tar.gz → 3.5.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (59)

{databricks_sql_connector-3.4.0 → databricks_sql_connector-3.5.0}/CHANGELOG.md RENAMED Viewed

@@ -1,5 +1,10 @@
 # Release History
+# 3.5.0 (2024-10-18)
+- Create a non pyarrow flow to handle small results for the column set (databricks/databricks-sql-python#440 by @jprakash-db)
+- Fix: On non-retryable error, ensure PySQL includes useful information in error (databricks/databricks-sql-python#447 by @shivam2680)
 # 3.4.0 (2024-08-27)
 - Unpin pandas to support v2.2.2 (databricks/databricks-sql-python#416 by @kfollesdal)

{databricks_sql_connector-3.4.0 → databricks_sql_connector-3.5.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: databricks-sql-connector
-Version: 3.4.0
+Version: 3.5.0
 Summary: Databricks SQL Connector for Python
 License: Apache-2.0
 Author: Databricks
@@ -13,6 +13,7 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
 Provides-Extra: alembic
 Provides-Extra: sqlalchemy
 Requires-Dist: alembic (>=1.0.11,<2.0.0) ; extra == "alembic"

{databricks_sql_connector-3.4.0 → databricks_sql_connector-3.5.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "databricks-sql-connector"
-version = "3.4.0"
+version = "3.5.0"
 description = "Databricks SQL Connector for Python"
 authors = ["Databricks <databricks-sql-connector-maintainers@databricks.com>"]
 license = "Apache-2.0"

{databricks_sql_connector-3.4.0 → databricks_sql_connector-3.5.0}/src/databricks/sql/__init__.py RENAMED Viewed

@@ -68,7 +68,7 @@ DATETIME = DBAPITypeObject("timestamp")
 DATE = DBAPITypeObject("date")
 ROWID = DBAPITypeObject()
-__version__ = "3.4.0"
+__version__ = "3.5.0"
 USER_AGENT_NAME = "PyDatabricksSqlConnector"
 # These two functions are pyhive legacy

{databricks_sql_connector-3.4.0 → databricks_sql_connector-3.5.0}/src/databricks/sql/client.py RENAMED Viewed

@@ -1,7 +1,11 @@
 from typing import Dict, Tuple, List, Optional, Any, Union, Sequence
 import pandas
-import pyarrow
+try:
+    import pyarrow
+except ImportError:
+    pyarrow = None
 import requests
 import json
 import os
@@ -22,6 +26,8 @@ from databricks.sql.utils import (
     ParamEscaper,
     inject_parameters,
     transform_paramstyle,
+    ColumnTable,
+    ColumnQueue,
 )
 from databricks.sql.parameters.native import (
     DbsqlParameterBase,
@@ -991,14 +997,14 @@ class Cursor:
         else:
             raise Error("There is no active result set")
-    def fetchall_arrow(self) -> pyarrow.Table:
+    def fetchall_arrow(self) -> "pyarrow.Table":
         self._check_not_closed()
         if self.active_result_set:
             return self.active_result_set.fetchall_arrow()
         else:
             raise Error("There is no active result set")
-    def fetchmany_arrow(self, size) -> pyarrow.Table:
+    def fetchmany_arrow(self, size) -> "pyarrow.Table":
         self._check_not_closed()
         if self.active_result_set:
             return self.active_result_set.fetchmany_arrow(size)
@@ -1143,6 +1149,18 @@ class ResultSet:
         self.results = results
         self.has_more_rows = has_more_rows
+    def _convert_columnar_table(self, table):
+        column_names = [c[0] for c in self.description]
+        ResultRow = Row(*column_names)
+        result = []
+        for row_index in range(table.num_rows):
+            curr_row = []
+            for col_index in range(table.num_columns):
+                curr_row.append(table.get_item(col_index, row_index))
+            result.append(ResultRow(*curr_row))
+        return result
     def _convert_arrow_table(self, table):
         column_names = [c[0] for c in self.description]
         ResultRow = Row(*column_names)
@@ -1185,7 +1203,7 @@ class ResultSet:
     def rownumber(self):
         return self._next_row_index
-    def fetchmany_arrow(self, size: int) -> pyarrow.Table:
+    def fetchmany_arrow(self, size: int) -> "pyarrow.Table":
         """
         Fetch the next set of rows of a query result, returning a PyArrow table.
@@ -1210,7 +1228,49 @@ class ResultSet:
         return results
-    def fetchall_arrow(self) -> pyarrow.Table:
+    def merge_columnar(self, result1, result2):
+        """
+        Function to merge / combining the columnar results into a single result
+        :param result1:
+        :param result2:
+        :return:
+        """
+        if result1.column_names != result2.column_names:
+            raise ValueError("The columns in the results don't match")
+        merged_result = [
+            result1.column_table[i] + result2.column_table[i]
+            for i in range(result1.num_columns)
+        ]
+        return ColumnTable(merged_result, result1.column_names)
+    def fetchmany_columnar(self, size: int):
+        """
+        Fetch the next set of rows of a query result, returning a Columnar Table.
+        An empty sequence is returned when no more rows are available.
+        """
+        if size < 0:
+            raise ValueError("size argument for fetchmany is %s but must be >= 0", size)
+        results = self.results.next_n_rows(size)
+        n_remaining_rows = size - results.num_rows
+        self._next_row_index += results.num_rows
+        while (
+            n_remaining_rows > 0
+            and not self.has_been_closed_server_side
+            and self.has_more_rows
+        ):
+            self._fill_results_buffer()
+            partial_results = self.results.next_n_rows(n_remaining_rows)
+            results = self.merge_columnar(results, partial_results)
+            n_remaining_rows -= partial_results.num_rows
+            self._next_row_index += partial_results.num_rows
+        return results
+    def fetchall_arrow(self) -> "pyarrow.Table":
         """Fetch all (remaining) rows of a query result, returning them as a PyArrow table."""
         results = self.results.remaining_rows()
         self._next_row_index += results.num_rows
@@ -1223,12 +1283,30 @@ class ResultSet:
         return results
+    def fetchall_columnar(self):
+        """Fetch all (remaining) rows of a query result, returning them as a Columnar table."""
+        results = self.results.remaining_rows()
+        self._next_row_index += results.num_rows
+        while not self.has_been_closed_server_side and self.has_more_rows:
+            self._fill_results_buffer()
+            partial_results = self.results.remaining_rows()
+            results = self.merge_columnar(results, partial_results)
+            self._next_row_index += partial_results.num_rows
+        return results
     def fetchone(self) -> Optional[Row]:
         """
         Fetch the next row of a query result set, returning a single sequence,
         or None when no more data is available.
         """
-        res = self._convert_arrow_table(self.fetchmany_arrow(1))
+        if isinstance(self.results, ColumnQueue):
+            res = self._convert_columnar_table(self.fetchmany_columnar(1))
+        else:
+            res = self._convert_arrow_table(self.fetchmany_arrow(1))
         if len(res) > 0:
             return res[0]
         else:
@@ -1238,7 +1316,10 @@ class ResultSet:
         """
         Fetch all (remaining) rows of a query result, returning them as a list of rows.
         """
-        return self._convert_arrow_table(self.fetchall_arrow())
+        if isinstance(self.results, ColumnQueue):
+            return self._convert_columnar_table(self.fetchall_columnar())
+        else:
+            return self._convert_arrow_table(self.fetchall_arrow())
     def fetchmany(self, size: int) -> List[Row]:
         """
@@ -1246,7 +1327,10 @@ class ResultSet:
         An empty sequence is returned when no more rows are available.
         """
-        return self._convert_arrow_table(self.fetchmany_arrow(size))
+        if isinstance(self.results, ColumnQueue):
+            return self._convert_columnar_table(self.fetchmany_columnar(size))
+        else:
+            return self._convert_arrow_table(self.fetchmany_arrow(size))
     def close(self) -> None:
         """

{databricks_sql_connector-3.4.0 → databricks_sql_connector-3.5.0}/src/databricks/sql/thrift_backend.py RENAMED Viewed

@@ -7,7 +7,10 @@ import uuid
 import threading
 from typing import List, Union
-import pyarrow
+try:
+    import pyarrow
+except ImportError:
+    pyarrow = None
 import thrift.transport.THttpClient
 import thrift.protocol.TBinaryProtocol
 import thrift.transport.TSocket
@@ -726,12 +729,17 @@ class ThriftBackend:
         description = self._hive_schema_to_description(
             t_result_set_metadata_resp.schema
         )
-        schema_bytes = (
-            t_result_set_metadata_resp.arrowSchema
-            or self._hive_schema_to_arrow_schema(t_result_set_metadata_resp.schema)
-            .serialize()
-            .to_pybytes()
-        )
+        if pyarrow:
+            schema_bytes = (
+                t_result_set_metadata_resp.arrowSchema
+                or self._hive_schema_to_arrow_schema(t_result_set_metadata_resp.schema)
+                .serialize()
+                .to_pybytes()
+            )
+        else:
+            schema_bytes = None
         lz4_compressed = t_result_set_metadata_resp.lz4Compressed
         is_staging_operation = t_result_set_metadata_resp.isStagingOperation
         if direct_results and direct_results.resultSet:
@@ -827,7 +835,7 @@ class ThriftBackend:
             getDirectResults=ttypes.TSparkGetDirectResults(
                 maxRows=max_rows, maxBytes=max_bytes
             ),
-            canReadArrowResult=True,
+            canReadArrowResult=True if pyarrow else False,
             canDecompressLZ4Result=lz4_compression,
             canDownloadResult=use_cloud_fetch,
             confOverlay={

{databricks_sql_connector-3.4.0 → databricks_sql_connector-3.5.0}/src/databricks/sql/utils.py RENAMED Viewed

@@ -1,5 +1,6 @@
 from __future__ import annotations
+import pytz
 import datetime
 import decimal
 from abc import ABC, abstractmethod
@@ -11,7 +12,11 @@ from typing import Any, Dict, List, Optional, Union
 import re
 import lz4.frame
-import pyarrow
+try:
+    import pyarrow
+except ImportError:
+    pyarrow = None
 from databricks.sql import OperationalError, exc
 from databricks.sql.cloudfetch.download_manager import ResultFileDownloadManager
@@ -27,17 +32,18 @@ from databricks.sql.parameters.native import ParameterStructure, TDbsqlParameter
 import logging
 BIT_MASKS = [1, 2, 4, 8, 16, 32, 64, 128]
+DEFAULT_ERROR_CONTEXT = "Unknown error"
 logger = logging.getLogger(__name__)
 class ResultSetQueue(ABC):
     @abstractmethod
-    def next_n_rows(self, num_rows: int) -> pyarrow.Table:
+    def next_n_rows(self, num_rows: int):
         pass
     @abstractmethod
-    def remaining_rows(self) -> pyarrow.Table:
+    def remaining_rows(self):
         pass
@@ -76,13 +82,15 @@ class ResultSetQueueFactory(ABC):
             )
             return ArrowQueue(converted_arrow_table, n_valid_rows)
         elif row_set_type == TSparkRowSetType.COLUMN_BASED_SET:
-            arrow_table, n_valid_rows = convert_column_based_set_to_arrow_table(
+            column_table, column_names = convert_column_based_set_to_column_table(
                 t_row_set.columns, description
             )
-            converted_arrow_table = convert_decimals_in_arrow_table(
-                arrow_table, description
+            converted_column_table = convert_to_assigned_datatypes_in_column_table(
+                column_table, description
             )
-            return ArrowQueue(converted_arrow_table, n_valid_rows)
+            return ColumnQueue(ColumnTable(converted_column_table, column_names))
         elif row_set_type == TSparkRowSetType.URL_BASED_SET:
             return CloudFetchQueue(
                 schema_bytes=arrow_schema_bytes,
@@ -97,10 +105,63 @@ class ResultSetQueueFactory(ABC):
             raise AssertionError("Row set type is not valid")
+class ColumnTable:
+    def __init__(self, column_table, column_names):
+        self.column_table = column_table
+        self.column_names = column_names
+    @property
+    def num_rows(self):
+        if len(self.column_table) == 0:
+            return 0
+        else:
+            return len(self.column_table[0])
+    @property
+    def num_columns(self):
+        return len(self.column_names)
+    def get_item(self, col_index, row_index):
+        return self.column_table[col_index][row_index]
+    def slice(self, curr_index, length):
+        sliced_column_table = [
+            column[curr_index : curr_index + length] for column in self.column_table
+        ]
+        return ColumnTable(sliced_column_table, self.column_names)
+    def __eq__(self, other):
+        return (
+            self.column_table == other.column_table
+            and self.column_names == other.column_names
+        )
+class ColumnQueue(ResultSetQueue):
+    def __init__(self, column_table: ColumnTable):
+        self.column_table = column_table
+        self.cur_row_index = 0
+        self.n_valid_rows = column_table.num_rows
+    def next_n_rows(self, num_rows):
+        length = min(num_rows, self.n_valid_rows - self.cur_row_index)
+        slice = self.column_table.slice(self.cur_row_index, length)
+        self.cur_row_index += slice.num_rows
+        return slice
+    def remaining_rows(self):
+        slice = self.column_table.slice(
+            self.cur_row_index, self.n_valid_rows - self.cur_row_index
+        )
+        self.cur_row_index += slice.num_rows
+        return slice
 class ArrowQueue(ResultSetQueue):
     def __init__(
         self,
-        arrow_table: pyarrow.Table,
+        arrow_table: "pyarrow.Table",
         n_valid_rows: int,
         start_row_index: int = 0,
     ):
@@ -115,7 +176,7 @@ class ArrowQueue(ResultSetQueue):
         self.arrow_table = arrow_table
         self.n_valid_rows = n_valid_rows
-    def next_n_rows(self, num_rows: int) -> pyarrow.Table:
+    def next_n_rows(self, num_rows: int) -> "pyarrow.Table":
         """Get upto the next n rows of the Arrow dataframe"""
         length = min(num_rows, self.n_valid_rows - self.cur_row_index)
         # Note that the table.slice API is not the same as Python's slice
@@ -124,7 +185,7 @@ class ArrowQueue(ResultSetQueue):
         self.cur_row_index += slice.num_rows
         return slice
-    def remaining_rows(self) -> pyarrow.Table:
+    def remaining_rows(self) -> "pyarrow.Table":
         slice = self.arrow_table.slice(
             self.cur_row_index, self.n_valid_rows - self.cur_row_index
         )
@@ -184,7 +245,7 @@ class CloudFetchQueue(ResultSetQueue):
         self.table = self._create_next_table()
         self.table_row_index = 0
-    def next_n_rows(self, num_rows: int) -> pyarrow.Table:
+    def next_n_rows(self, num_rows: int) -> "pyarrow.Table":
         """
         Get up to the next n rows of the cloud fetch Arrow dataframes.
@@ -216,7 +277,7 @@ class CloudFetchQueue(ResultSetQueue):
         logger.debug("CloudFetchQueue: collected {} next rows".format(results.num_rows))
         return results
-    def remaining_rows(self) -> pyarrow.Table:
+    def remaining_rows(self) -> "pyarrow.Table":
         """
         Get all remaining rows of the cloud fetch Arrow dataframes.
@@ -237,7 +298,7 @@ class CloudFetchQueue(ResultSetQueue):
             self.table_row_index = 0
         return results
-    def _create_next_table(self) -> Union[pyarrow.Table, None]:
+    def _create_next_table(self) -> Union["pyarrow.Table", None]:
         logger.debug(
             "CloudFetchQueue: Trying to get downloaded file for row {}".format(
                 self.start_row_index
@@ -276,7 +337,7 @@ class CloudFetchQueue(ResultSetQueue):
         return arrow_table
-    def _create_empty_table(self) -> pyarrow.Table:
+    def _create_empty_table(self) -> "pyarrow.Table":
         # Create a 0-row table with just the schema bytes
         return create_arrow_table_from_arrow_file(self.schema_bytes, self.description)
@@ -357,7 +418,12 @@ class RequestErrorInfo(
             user_friendly_error_message = "{}: {}".format(
                 user_friendly_error_message, self.error_message
             )
-        return user_friendly_error_message
+        try:
+            error_context = str(self.error)
+        except:
+            error_context = DEFAULT_ERROR_CONTEXT
+        return user_friendly_error_message + ". " + error_context
 # Taken from PyHive
@@ -515,7 +581,9 @@ def transform_paramstyle(
     return output
-def create_arrow_table_from_arrow_file(file_bytes: bytes, description) -> pyarrow.Table:
+def create_arrow_table_from_arrow_file(
+    file_bytes: bytes, description
+) -> "pyarrow.Table":
     arrow_table = convert_arrow_based_file_to_arrow_table(file_bytes)
     return convert_decimals_in_arrow_table(arrow_table, description)
@@ -542,7 +610,7 @@ def convert_arrow_based_set_to_arrow_table(arrow_batches, lz4_compressed, schema
     return arrow_table, n_rows
-def convert_decimals_in_arrow_table(table, description) -> pyarrow.Table:
+def convert_decimals_in_arrow_table(table, description) -> "pyarrow.Table":
     for i, col in enumerate(table.itercolumns()):
         if description[i][1] == "decimal":
             decimal_col = col.to_pandas().apply(
@@ -560,6 +628,37 @@ def convert_decimals_in_arrow_table(table, description) -> pyarrow.Table:
     return table
+def convert_to_assigned_datatypes_in_column_table(column_table, description):
+    converted_column_table = []
+    for i, col in enumerate(column_table):
+        if description[i][1] == "decimal":
+            converted_column_table.append(
+                tuple(v if v is None else Decimal(v) for v in col)
+            )
+        elif description[i][1] == "date":
+            converted_column_table.append(
+                tuple(v if v is None else datetime.date.fromisoformat(v) for v in col)
+            )
+        elif description[i][1] == "timestamp":
+            converted_column_table.append(
+                tuple(
+                    (
+                        v
+                        if v is None
+                        else datetime.datetime.strptime(
+                            v, "%Y-%m-%d %H:%M:%S.%f"
+                        ).replace(tzinfo=pytz.UTC)
+                    )
+                    for v in col
+                )
+            )
+        else:
+            converted_column_table.append(col)
+    return converted_column_table
 def convert_column_based_set_to_arrow_table(columns, description):
     arrow_table = pyarrow.Table.from_arrays(
         [_convert_column_to_arrow_array(c) for c in columns],
@@ -571,6 +670,13 @@ def convert_column_based_set_to_arrow_table(columns, description):
     return arrow_table, arrow_table.num_rows
+def convert_column_based_set_to_column_table(columns, description):
+    column_names = [c[0] for c in description]
+    column_table = [_convert_column_to_list(c) for c in columns]
+    return column_table, column_names
 def _convert_column_to_arrow_array(t_col):
     """
     Return a pyarrow array from the values in a TColumn instance.
@@ -595,6 +701,26 @@ def _convert_column_to_arrow_array(t_col):
     raise OperationalError("Empty TColumn instance {}".format(t_col))
+def _convert_column_to_list(t_col):
+    SUPPORTED_FIELD_TYPES = (
+        "boolVal",
+        "byteVal",
+        "i16Val",
+        "i32Val",
+        "i64Val",
+        "doubleVal",
+        "stringVal",
+        "binaryVal",
+    )
+    for field in SUPPORTED_FIELD_TYPES:
+        wrapper = getattr(t_col, field)
+        if wrapper:
+            return _create_python_tuple(wrapper)
+    raise OperationalError("Empty TColumn instance {}".format(t_col))
 def _create_arrow_array(t_col_value_wrapper, arrow_type):
     result = t_col_value_wrapper.values
     nulls = t_col_value_wrapper.nulls  # bitfield describing which values are null
@@ -609,3 +735,19 @@ def _create_arrow_array(t_col_value_wrapper, arrow_type):
             result[i] = None
     return pyarrow.array(result, type=arrow_type)
+def _create_python_tuple(t_col_value_wrapper):
+    result = t_col_value_wrapper.values
+    nulls = t_col_value_wrapper.nulls  # bitfield describing which values are null
+    assert isinstance(nulls, bytes)
+    # The number of bits in nulls can be both larger or smaller than the number of
+    # elements in result, so take the minimum of both to iterate over.
+    length = min(len(result), len(nulls) * 8)
+    for i in range(length):
+        if nulls[i >> 3] & BIT_MASKS[i & 0x7]:
+            result[i] = None
+    return tuple(result)