databricks-sql-connector 2.9.2.dev1__tar.gz → 2.9.4b1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/CHANGELOG.md +22 -5
- {databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/PKG-INFO +2 -1
- {databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/pyproject.toml +1 -1
- {databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/src/databricks/sql/__init__.py +33 -1
- {databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/src/databricks/sql/auth/retry.py +1 -5
- {databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/src/databricks/sql/cloudfetch/download_manager.py +8 -32
- {databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/src/databricks/sql/cloudfetch/downloader.py +93 -27
- {databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/src/databricks/sql/exc.py +4 -0
- {databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/src/databricks/sql/thrift_backend.py +18 -5
- {databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/src/databricks/sql/utils.py +8 -0
- {databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/LICENSE +0 -0
- {databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/README.md +0 -0
- {databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/src/databricks/__init__.py +0 -0
- {databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/src/databricks/sql/auth/__init__.py +0 -0
- {databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/src/databricks/sql/auth/auth.py +0 -0
- {databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/src/databricks/sql/auth/authenticators.py +0 -0
- {databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/src/databricks/sql/auth/endpoint.py +0 -0
- {databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/src/databricks/sql/auth/oauth.py +0 -0
- {databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/src/databricks/sql/auth/oauth_http_handler.py +0 -0
- {databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/src/databricks/sql/auth/thrift_http_client.py +0 -0
- {databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/src/databricks/sql/client.py +0 -0
- {databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/src/databricks/sql/experimental/__init__.py +0 -0
- {databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/src/databricks/sql/experimental/oauth_persistence.py +0 -0
- {databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/src/databricks/sql/thrift_api/TCLIService/TCLIService-remote +0 -0
- {databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/src/databricks/sql/thrift_api/TCLIService/TCLIService.py +0 -0
- {databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/src/databricks/sql/thrift_api/TCLIService/__init__.py +0 -0
- {databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/src/databricks/sql/thrift_api/TCLIService/constants.py +0 -0
- {databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/src/databricks/sql/thrift_api/TCLIService/ttypes.py +0 -0
- {databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/src/databricks/sql/thrift_api/__init__.py +0 -0
- {databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/src/databricks/sql/types.py +0 -0
- {databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/src/databricks/sqlalchemy/__init__.py +0 -0
- {databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/src/databricks/sqlalchemy/dialect/__init__.py +0 -0
- {databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/src/databricks/sqlalchemy/dialect/base.py +0 -0
- {databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/src/databricks/sqlalchemy/dialect/compiler.py +0 -0
|
@@ -1,14 +1,31 @@
|
|
|
1
1
|
# Release History
|
|
2
2
|
|
|
3
|
-
## 2.9.
|
|
3
|
+
## 2.9.4 (Unreleased)
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
## 2.9.4b1 (2024-02-16)
|
|
6
|
+
|
|
7
|
+
- Fix: Cloud fetch file download errors (#356)
|
|
8
|
+
- Fix: Redact the URL query parameters from the urllib3.connectionpool logs (#341)
|
|
9
|
+
|
|
10
|
+
## 2.9.3 (2023-08-24)
|
|
11
|
+
|
|
12
|
+
- Fix: Connections failed when urllib3~=1.0.0 is installed (#206)
|
|
13
|
+
|
|
14
|
+
## 2.9.2 (2023-08-17)
|
|
15
|
+
|
|
16
|
+
- Other: Add `examples/v3_retries_query_execute.py` (#199)
|
|
17
|
+
- Other: suppress log message when `_enable_v3_retries` is not `True` (#199)
|
|
18
|
+
- Other: make this connector backwards compatible with `urllib3>=1.0.0` (#197)
|
|
19
|
+
|
|
20
|
+
## 2.9.1 (2023-08-11)
|
|
21
|
+
|
|
22
|
+
- Other: Explicitly pin urllib3 to ^2.0.0 (#191)
|
|
6
23
|
|
|
7
24
|
## 2.9.0 (2023-08-10)
|
|
8
25
|
|
|
9
|
-
- Replace retry handling with DatabricksRetryPolicy. This is disabled by default. To enable, set `enable_v3_retries=True` when creating `databricks.sql.client`
|
|
10
|
-
- Other: Fix typo in README quick start example
|
|
11
|
-
- Other: Add autospec to Client mocks and tidy up `make_request`
|
|
26
|
+
- Replace retry handling with DatabricksRetryPolicy. This is disabled by default. To enable, set `enable_v3_retries=True` when creating `databricks.sql.client` (#182)
|
|
27
|
+
- Other: Fix typo in README quick start example (#186)
|
|
28
|
+
- Other: Add autospec to Client mocks and tidy up `make_request` (#188)
|
|
12
29
|
|
|
13
30
|
## 2.8.0 (2023-07-21)
|
|
14
31
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: databricks-sql-connector
|
|
3
|
-
Version: 2.9.
|
|
3
|
+
Version: 2.9.4b1
|
|
4
4
|
Summary: Databricks SQL Connector for Python
|
|
5
5
|
License: Apache-2.0
|
|
6
6
|
Author: Databricks
|
|
@@ -12,6 +12,7 @@ Classifier: Programming Language :: Python :: 3.8
|
|
|
12
12
|
Classifier: Programming Language :: Python :: 3.9
|
|
13
13
|
Classifier: Programming Language :: Python :: 3.10
|
|
14
14
|
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
16
|
Requires-Dist: alembic (>=1.0.11,<2.0.0)
|
|
16
17
|
Requires-Dist: lz4 (>=4.0.2,<5.0.0)
|
|
17
18
|
Requires-Dist: numpy (>=1.16.6) ; python_version >= "3.7" and python_version < "3.11"
|
|
@@ -7,6 +7,38 @@ apilevel = "2.0"
|
|
|
7
7
|
threadsafety = 1 # Threads may share the module, but not connections.
|
|
8
8
|
paramstyle = "pyformat" # Python extended format codes, e.g. ...WHERE name=%(name)s
|
|
9
9
|
|
|
10
|
+
import re
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class RedactUrlQueryParamsFilter(logging.Filter):
    """Logging filter that masks URL query-parameter values in log records.

    Every ``?key=value`` or ``&key=value`` occurrence found in the record's
    message, or in any of its string arguments, has its value replaced with
    ``<REDACTED>``. Non-string arguments are left untouched so that numeric
    format specifiers in the message keep working.
    """

    # Group 1: the "?" or "&" separator, group 2: the parameter name,
    # group 3: the value (everything up to the next "&" or whitespace).
    pattern = re.compile(r"(\?|&)([\w-]+)=([^&\s]+)")
    # Keep the separator and the parameter name, drop the value.
    mask = r"\1\2=<REDACTED>"

    def __init__(self):
        super().__init__()

    def redact(self, string):
        """Return ``str(string)`` with all query-parameter values masked."""
        return self.pattern.sub(self.mask, str(string))

    def filter(self, record):
        """Redact the record in place; always returns True so it is still emitted.

        ``record.msg`` is converted to ``str`` and redacted. ``record.args``
        may be a dict (``%(name)s`` style), a tuple (positional style) or
        None; only ``str`` values inside it are redacted.
        """
        record.msg = self.redact(str(record.msg))

        args = record.args
        if isinstance(args, dict):
            # Snapshot the items first: values are reassigned while looping.
            for key, value in list(args.items()):
                if isinstance(value, str):
                    args[key] = self.redact(value)
        elif args is not None:
            record.args = tuple(
                self.redact(arg) if isinstance(arg, str) else arg for arg in args
            )

        return True
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
logging.getLogger("urllib3.connectionpool").addFilter(RedactUrlQueryParamsFilter())
|
|
10
42
|
|
|
11
43
|
class DBAPITypeObject(object):
|
|
12
44
|
def __init__(self, *values):
|
|
@@ -28,7 +60,7 @@ DATETIME = DBAPITypeObject("timestamp")
|
|
|
28
60
|
DATE = DBAPITypeObject("date")
|
|
29
61
|
ROWID = DBAPITypeObject()
|
|
30
62
|
|
|
31
|
-
__version__ = "2.9.
|
|
63
|
+
__version__ = "2.9.4b1"
|
|
32
64
|
USER_AGENT_NAME = "PyDatabricksSqlConnector"
|
|
33
65
|
|
|
34
66
|
# These two functions are pyhive legacy
|
|
@@ -56,8 +56,7 @@ class DatabricksRetryPolicy(Retry):
|
|
|
56
56
|
`backoff_factor`.
|
|
57
57
|
|
|
58
58
|
:param delay_max:
|
|
59
|
-
Float of seconds for the maximum delay between retries.
|
|
60
|
-
`backoff_max`
|
|
59
|
+
Float of seconds for the maximum delay between retries.
|
|
61
60
|
|
|
62
61
|
:param stop_after_attempts_count:
|
|
63
62
|
Integer maximum number of attempts that will be retried. This is an alias for urllib3's
|
|
@@ -122,7 +121,6 @@ class DatabricksRetryPolicy(Retry):
|
|
|
122
121
|
total=_attempts_remaining,
|
|
123
122
|
respect_retry_after_header=True,
|
|
124
123
|
backoff_factor=self.delay_min,
|
|
125
|
-
backoff_max=self.delay_max,
|
|
126
124
|
allowed_methods=["POST"],
|
|
127
125
|
status_forcelist=[429, 503, *self.force_dangerous_codes],
|
|
128
126
|
)
|
|
@@ -212,13 +210,11 @@ class DatabricksRetryPolicy(Retry):
|
|
|
212
210
|
allowed_methods=self.allowed_methods,
|
|
213
211
|
status_forcelist=self.status_forcelist,
|
|
214
212
|
backoff_factor=self.backoff_factor, # type: ignore
|
|
215
|
-
backoff_max=self.backoff_max, # type: ignore
|
|
216
213
|
raise_on_redirect=self.raise_on_redirect,
|
|
217
214
|
raise_on_status=self.raise_on_status,
|
|
218
215
|
history=self.history,
|
|
219
216
|
remove_headers_on_redirect=self.remove_headers_on_redirect,
|
|
220
217
|
respect_retry_after_header=self.respect_retry_after_header,
|
|
221
|
-
backoff_jitter=self.backoff_jitter, # type: ignore
|
|
222
218
|
)
|
|
223
219
|
|
|
224
220
|
# Update urllib3's current state to reflect the incremented counters
|
|
@@ -8,6 +8,7 @@ from databricks.sql.cloudfetch.downloader import (
|
|
|
8
8
|
ResultSetDownloadHandler,
|
|
9
9
|
DownloadableResultSettings,
|
|
10
10
|
)
|
|
11
|
+
from databricks.sql.exc import ResultSetDownloadError
|
|
11
12
|
from databricks.sql.thrift_api.TCLIService.ttypes import TSparkArrowResultLink
|
|
12
13
|
|
|
13
14
|
logger = logging.getLogger(__name__)
|
|
@@ -34,8 +35,6 @@ class ResultFileDownloadManager:
|
|
|
34
35
|
self.download_handlers: List[ResultSetDownloadHandler] = []
|
|
35
36
|
self.thread_pool = ThreadPoolExecutor(max_workers=max_download_threads + 1)
|
|
36
37
|
self.downloadable_result_settings = DownloadableResultSettings(lz4_compressed)
|
|
37
|
-
self.fetch_need_retry = False
|
|
38
|
-
self.num_consecutive_result_file_download_retries = 0
|
|
39
38
|
|
|
40
39
|
def add_file_links(
|
|
41
40
|
self, t_spark_arrow_result_links: List[TSparkArrowResultLink]
|
|
@@ -81,13 +80,15 @@ class ResultFileDownloadManager:
|
|
|
81
80
|
|
|
82
81
|
# Find next file
|
|
83
82
|
idx = self._find_next_file_index(next_row_offset)
|
|
83
|
+
# is this correct?
|
|
84
84
|
if idx is None:
|
|
85
85
|
self._shutdown_manager()
|
|
86
|
+
logger.debug("could not find next file index")
|
|
86
87
|
return None
|
|
87
88
|
handler = self.download_handlers[idx]
|
|
88
89
|
|
|
89
90
|
# Check (and wait) for download status
|
|
90
|
-
if
|
|
91
|
+
if handler.is_file_download_successful():
|
|
91
92
|
# Buffer should be empty so set buffer to new ArrowQueue with result_file
|
|
92
93
|
result = DownloadedFile(
|
|
93
94
|
handler.result_file,
|
|
@@ -97,9 +98,11 @@ class ResultFileDownloadManager:
|
|
|
97
98
|
self.download_handlers.pop(idx)
|
|
98
99
|
# Return True upon successful download to continue loop and not force a retry
|
|
99
100
|
return result
|
|
100
|
-
# Download was not successful for next download item
|
|
101
|
+
# Download was not successful for next download item. Fail
|
|
101
102
|
self._shutdown_manager()
|
|
102
|
-
|
|
103
|
+
raise ResultSetDownloadError(
|
|
104
|
+
f"Download failed for result set starting at {next_row_offset}"
|
|
105
|
+
)
|
|
103
106
|
|
|
104
107
|
def _remove_past_handlers(self, next_row_offset: int):
|
|
105
108
|
# Any link in which its start to end range doesn't include the next row to be fetched does not need downloading
|
|
@@ -133,33 +136,6 @@ class ResultFileDownloadManager:
|
|
|
133
136
|
]
|
|
134
137
|
return next_indices[0] if len(next_indices) > 0 else None
|
|
135
138
|
|
|
136
|
-
def _check_if_download_successful(self, handler: ResultSetDownloadHandler):
|
|
137
|
-
# Check (and wait until download finishes) if download was successful
|
|
138
|
-
if not handler.is_file_download_successful():
|
|
139
|
-
if handler.is_link_expired:
|
|
140
|
-
self.fetch_need_retry = True
|
|
141
|
-
return False
|
|
142
|
-
elif handler.is_download_timedout:
|
|
143
|
-
# Consecutive file retries should not exceed threshold in settings
|
|
144
|
-
if (
|
|
145
|
-
self.num_consecutive_result_file_download_retries
|
|
146
|
-
>= self.downloadable_result_settings.max_consecutive_file_download_retries
|
|
147
|
-
):
|
|
148
|
-
self.fetch_need_retry = True
|
|
149
|
-
return False
|
|
150
|
-
self.num_consecutive_result_file_download_retries += 1
|
|
151
|
-
|
|
152
|
-
# Re-submit handler run to thread pool and recursively check download status
|
|
153
|
-
self.thread_pool.submit(handler.run)
|
|
154
|
-
return self._check_if_download_successful(handler)
|
|
155
|
-
else:
|
|
156
|
-
self.fetch_need_retry = True
|
|
157
|
-
return False
|
|
158
|
-
|
|
159
|
-
self.num_consecutive_result_file_download_retries = 0
|
|
160
|
-
self.fetch_need_retry = False
|
|
161
|
-
return True
|
|
162
|
-
|
|
163
139
|
def _shutdown_manager(self):
|
|
164
140
|
# Clear download handlers and shutdown the thread pool
|
|
165
141
|
self.download_handlers = []
|
|
@@ -1,15 +1,17 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from dataclasses import dataclass
|
|
3
|
-
|
|
4
3
|
import requests
|
|
5
4
|
import lz4.frame
|
|
6
5
|
import threading
|
|
7
6
|
import time
|
|
8
|
-
|
|
7
|
+
import os
|
|
8
|
+
import re
|
|
9
9
|
from databricks.sql.thrift_api.TCLIService.ttypes import TSparkArrowResultLink
|
|
10
10
|
|
|
11
11
|
logger = logging.getLogger(__name__)
|
|
12
12
|
|
|
13
|
+
DEFAULT_CLOUD_FILE_TIMEOUT = int(os.getenv("DATABRICKS_CLOUD_FILE_TIMEOUT", 60))
|
|
14
|
+
|
|
13
15
|
|
|
14
16
|
@dataclass
|
|
15
17
|
class DownloadableResultSettings:
|
|
@@ -20,13 +22,17 @@ class DownloadableResultSettings:
|
|
|
20
22
|
is_lz4_compressed (bool): Whether file is expected to be lz4 compressed.
|
|
21
23
|
link_expiry_buffer_secs (int): Time in seconds to prevent download of a link before it expires. Default 0 secs.
|
|
22
24
|
download_timeout (int): Timeout for download requests. Default 60 secs.
|
|
23
|
-
|
|
25
|
+
download_max_retries (int): Number of consecutive download retries before shutting down.
|
|
26
|
+
max_retries (int): Number of consecutive download retries before shutting down.
|
|
27
|
+
backoff_factor (int): Factor to increase wait time between retries.
|
|
28
|
+
|
|
24
29
|
"""
|
|
25
30
|
|
|
26
31
|
is_lz4_compressed: bool
|
|
27
32
|
link_expiry_buffer_secs: int = 0
|
|
28
|
-
download_timeout: int =
|
|
29
|
-
|
|
33
|
+
download_timeout: int = DEFAULT_CLOUD_FILE_TIMEOUT
|
|
34
|
+
max_retries: int = 5
|
|
35
|
+
backoff_factor: int = 2
|
|
30
36
|
|
|
31
37
|
|
|
32
38
|
class ResultSetDownloadHandler(threading.Thread):
|
|
@@ -57,16 +63,21 @@ class ResultSetDownloadHandler(threading.Thread):
|
|
|
57
63
|
else None
|
|
58
64
|
)
|
|
59
65
|
try:
|
|
66
|
+
logger.debug(
|
|
67
|
+
f"waiting for at most {timeout} seconds for download file: startRow {self.result_link.startRowOffset}, rowCount {self.result_link.rowCount}, endRow {self.result_link.startRowOffset + self.result_link.rowCount}"
|
|
68
|
+
)
|
|
69
|
+
|
|
60
70
|
if not self.is_download_finished.wait(timeout=timeout):
|
|
61
71
|
self.is_download_timedout = True
|
|
62
|
-
logger.
|
|
63
|
-
"
|
|
64
|
-
self.settings.download_timeout,
|
|
65
|
-
self.result_link.startRowOffset,
|
|
66
|
-
self.result_link.startRowOffset + self.result_link.rowCount,
|
|
67
|
-
)
|
|
72
|
+
logger.error(
|
|
73
|
+
f"cloud fetch download timed out after {self.settings.download_timeout} seconds for link representing rows {self.result_link.startRowOffset} to {self.result_link.startRowOffset + self.result_link.rowCount}"
|
|
68
74
|
)
|
|
69
|
-
|
|
75
|
+
# there are some weird cases when the is_download_finished is not set, but the file is downloaded successfully
|
|
76
|
+
return self.is_file_downloaded_successfully
|
|
77
|
+
|
|
78
|
+
logger.debug(
|
|
79
|
+
f"finish waiting for download file: startRow {self.result_link.startRowOffset}, rowCount {self.result_link.rowCount}, endRow {self.result_link.startRowOffset + self.result_link.rowCount}"
|
|
80
|
+
)
|
|
70
81
|
except Exception as e:
|
|
71
82
|
logger.error(e)
|
|
72
83
|
return False
|
|
@@ -81,24 +92,36 @@ class ResultSetDownloadHandler(threading.Thread):
|
|
|
81
92
|
"""
|
|
82
93
|
self._reset()
|
|
83
94
|
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
95
|
+
try:
|
|
96
|
+
# Check if link is already expired or is expiring
|
|
97
|
+
if ResultSetDownloadHandler.check_link_expired(
|
|
98
|
+
self.result_link, self.settings.link_expiry_buffer_secs
|
|
99
|
+
):
|
|
100
|
+
self.is_link_expired = True
|
|
101
|
+
return
|
|
90
102
|
|
|
91
|
-
|
|
92
|
-
|
|
103
|
+
logger.debug(
|
|
104
|
+
f"started to download file: startRow {self.result_link.startRowOffset}, rowCount {self.result_link.rowCount}, endRow {self.result_link.startRowOffset + self.result_link.rowCount}"
|
|
105
|
+
)
|
|
93
106
|
|
|
94
|
-
try:
|
|
95
107
|
# Get the file via HTTP request
|
|
96
|
-
response =
|
|
108
|
+
response = http_get_with_retry(
|
|
109
|
+
url=self.result_link.fileLink,
|
|
110
|
+
max_retries=self.settings.max_retries,
|
|
111
|
+
backoff_factor=self.settings.backoff_factor,
|
|
112
|
+
download_timeout=self.settings.download_timeout,
|
|
113
|
+
)
|
|
97
114
|
|
|
98
|
-
if not response
|
|
99
|
-
|
|
115
|
+
if not response:
|
|
116
|
+
logger.error(
|
|
117
|
+
f"failed downloading file: startRow {self.result_link.startRowOffset}, rowCount {self.result_link.rowCount}, endRow {self.result_link.startRowOffset + self.result_link.rowCount}"
|
|
118
|
+
)
|
|
100
119
|
return
|
|
101
120
|
|
|
121
|
+
logger.debug(
|
|
122
|
+
f"success downloading file: startRow {self.result_link.startRowOffset}, rowCount {self.result_link.rowCount}, endRow {self.result_link.startRowOffset + self.result_link.rowCount}"
|
|
123
|
+
)
|
|
124
|
+
|
|
102
125
|
# Save (and decompress if needed) the downloaded file
|
|
103
126
|
compressed_data = response.content
|
|
104
127
|
decompressed_data = (
|
|
@@ -109,15 +132,22 @@ class ResultSetDownloadHandler(threading.Thread):
|
|
|
109
132
|
self.result_file = decompressed_data
|
|
110
133
|
|
|
111
134
|
# The size of the downloaded file should match the size specified from TSparkArrowResultLink
|
|
112
|
-
self.
|
|
113
|
-
|
|
135
|
+
success = len(self.result_file) == self.result_link.bytesNum
|
|
136
|
+
logger.debug(
|
|
137
|
+
f"download successful file: startRow {self.result_link.startRowOffset}, rowCount {self.result_link.rowCount}, endRow {self.result_link.startRowOffset + self.result_link.rowCount}"
|
|
114
138
|
)
|
|
139
|
+
self.is_file_downloaded_successfully = success
|
|
115
140
|
except Exception as e:
|
|
141
|
+
logger.error(
|
|
142
|
+
f"exception downloading file: startRow {self.result_link.startRowOffset}, rowCount {self.result_link.rowCount}, endRow {self.result_link.startRowOffset + self.result_link.rowCount}"
|
|
143
|
+
)
|
|
116
144
|
logger.error(e)
|
|
117
145
|
self.is_file_downloaded_successfully = False
|
|
118
146
|
|
|
119
147
|
finally:
|
|
120
|
-
|
|
148
|
+
logger.debug(
|
|
149
|
+
f"signal finished file: startRow {self.result_link.startRowOffset}, rowCount {self.result_link.rowCount}, endRow {self.result_link.startRowOffset + self.result_link.rowCount}"
|
|
150
|
+
)
|
|
121
151
|
# Awaken threads waiting for this to be true which signals the run is complete
|
|
122
152
|
self.is_download_finished.set()
|
|
123
153
|
|
|
@@ -145,6 +175,7 @@ class ResultSetDownloadHandler(threading.Thread):
|
|
|
145
175
|
link.expiryTime < current_time
|
|
146
176
|
or link.expiryTime - current_time < expiry_buffer_secs
|
|
147
177
|
):
|
|
178
|
+
logger.debug("link expired")
|
|
148
179
|
return True
|
|
149
180
|
return False
|
|
150
181
|
|
|
@@ -171,3 +202,38 @@ class ResultSetDownloadHandler(threading.Thread):
|
|
|
171
202
|
uncompressed_data += data
|
|
172
203
|
start += num_bytes
|
|
173
204
|
return uncompressed_data
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def http_get_with_retry(url, max_retries=5, backoff_factor=2, download_timeout=60):
    """GET ``url``, retrying with exponential backoff until a 200 is received.

    Args:
        url: The URL to download. Its query-parameter values are masked
            before any request exception is logged.
        max_retries: Maximum number of GET attempts.
        backoff_factor: Base of the exponential wait; the wait before the
            next attempt is ``backoff_factor ** attempts_made_so_far_minus_one``
            seconds (1, ``backoff_factor``, ``backoff_factor**2``, ...).
        download_timeout: Timeout in seconds passed to the GET request.

    Returns:
        The ``requests.Response`` of the first attempt that returned status
        200, or None if every attempt failed.
    """
    pattern = re.compile(r"(\?|&)([\w-]+)=([^&\s]+)")
    mask = r"\1\2=<REDACTED>"

    # TODO: introduce connection pooling. I am seeing weird errors without it.
    for attempt in range(max_retries):
        try:
            # A fresh session per attempt, closed by the context manager.
            with requests.Session() as session:
                # requests ignores a ``timeout`` attribute set on the Session;
                # the timeout only takes effect when passed to the request.
                response = session.get(url, timeout=download_timeout)

            # Only an exact 200 counts as a successful download
            if response.status_code == 200:
                return response
            logger.error(response)
        except requests.RequestException as e:
            # if this is not redacted, it will print the pre-signed URL
            logger.error(f"request failed with exception: {re.sub(pattern, mask, str(e))}")

        # Exponential backoff before the next attempt; there is nothing to
        # wait for once the final attempt has failed.
        if attempt + 1 < max_retries:
            wait_time = backoff_factor**attempt
            logger.info(f"retrying in {wait_time} seconds...")
            time.sleep(wait_time)

    logger.error(
        f"exceeded maximum number of retries ({max_retries}) while downloading result."
    )
    return None
|
{databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/src/databricks/sql/exc.py
RENAMED
|
@@ -115,3 +115,7 @@ class SessionAlreadyClosedError(RequestError):
|
|
|
115
115
|
|
|
116
116
|
class CursorAlreadyClosedError(RequestError):
|
|
117
117
|
"""Thrown if CancelOperation receives a code 404. ThriftBackend should gracefully proceed as this is expected."""
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
class ResultSetDownloadError(RequestError):
|
|
121
|
+
"""Thrown if there was an error during the download of a result set"""
|
|
@@ -371,13 +371,16 @@ class ThriftBackend:
|
|
|
371
371
|
|
|
372
372
|
this_method_name = getattr(method, "__name__")
|
|
373
373
|
|
|
374
|
-
logger.debug(
|
|
374
|
+
logger.debug(
|
|
375
|
+
"sending thrift request: {}(<REDACTED>)".format(this_method_name)
|
|
376
|
+
)
|
|
375
377
|
unsafe_logger.debug("Sending request: {}".format(request))
|
|
376
378
|
|
|
377
379
|
# These three lines are no-ops if the v3 retry policy is not in use
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
380
|
+
if self.enable_v3_retries:
|
|
381
|
+
this_command_type = CommandType.get(this_method_name)
|
|
382
|
+
self._transport.set_retry_command_type(this_command_type)
|
|
383
|
+
self._transport.startRetryTimer()
|
|
381
384
|
|
|
382
385
|
response = method(request)
|
|
383
386
|
|
|
@@ -386,7 +389,9 @@ class ThriftBackend:
|
|
|
386
389
|
|
|
387
390
|
# We need to call type(response) here because thrift doesn't implement __name__ attributes for thrift responses
|
|
388
391
|
logger.debug(
|
|
389
|
-
"
|
|
392
|
+
"received thrift response: {}(<REDACTED>)".format(
|
|
393
|
+
type(response).__name__
|
|
394
|
+
)
|
|
390
395
|
)
|
|
391
396
|
unsafe_logger.debug("Received response: {}".format(response))
|
|
392
397
|
return response
|
|
@@ -740,6 +745,7 @@ class ThriftBackend:
|
|
|
740
745
|
lz4_compressed = t_result_set_metadata_resp.lz4Compressed
|
|
741
746
|
is_staging_operation = t_result_set_metadata_resp.isStagingOperation
|
|
742
747
|
if direct_results and direct_results.resultSet:
|
|
748
|
+
logger.debug(f"received direct results")
|
|
743
749
|
assert direct_results.resultSet.results.startRowOffset == 0
|
|
744
750
|
assert direct_results.resultSetMetadata
|
|
745
751
|
|
|
@@ -752,6 +758,7 @@ class ThriftBackend:
|
|
|
752
758
|
description=description,
|
|
753
759
|
)
|
|
754
760
|
else:
|
|
761
|
+
logger.debug(f"must fetch results")
|
|
755
762
|
arrow_queue_opt = None
|
|
756
763
|
return ExecuteResponse(
|
|
757
764
|
arrow_queue=arrow_queue_opt,
|
|
@@ -815,6 +822,10 @@ class ThriftBackend:
|
|
|
815
822
|
):
|
|
816
823
|
assert session_handle is not None
|
|
817
824
|
|
|
825
|
+
logger.debug(
|
|
826
|
+
f"executing: cloud fetch: {use_cloud_fetch}, max rows: {max_rows}, max bytes: {max_bytes}"
|
|
827
|
+
)
|
|
828
|
+
|
|
818
829
|
spark_arrow_types = ttypes.TSparkArrowTypes(
|
|
819
830
|
timestampAsArrow=self._use_arrow_native_timestamps,
|
|
820
831
|
decimalAsArrow=self._use_arrow_native_decimals,
|
|
@@ -929,6 +940,7 @@ class ThriftBackend:
|
|
|
929
940
|
return self._handle_execute_response(resp, cursor)
|
|
930
941
|
|
|
931
942
|
def _handle_execute_response(self, resp, cursor):
|
|
943
|
+
logger.debug(f"got execute response")
|
|
932
944
|
cursor.active_op_handle = resp.operationHandle
|
|
933
945
|
self._check_direct_results_for_error(resp.directResults)
|
|
934
946
|
|
|
@@ -949,6 +961,7 @@ class ThriftBackend:
|
|
|
949
961
|
arrow_schema_bytes,
|
|
950
962
|
description,
|
|
951
963
|
):
|
|
964
|
+
logger.debug("started to fetch results")
|
|
952
965
|
assert op_handle is not None
|
|
953
966
|
|
|
954
967
|
req = ttypes.TFetchResultsReq(
|
{databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/src/databricks/sql/utils.py
RENAMED
|
@@ -5,6 +5,7 @@ from decimal import Decimal
|
|
|
5
5
|
import datetime
|
|
6
6
|
import decimal
|
|
7
7
|
from enum import Enum
|
|
8
|
+
import logging
|
|
8
9
|
import lz4.frame
|
|
9
10
|
from typing import Dict, List, Union, Any
|
|
10
11
|
import pyarrow
|
|
@@ -18,6 +19,7 @@ from databricks.sql.thrift_api.TCLIService.ttypes import (
|
|
|
18
19
|
)
|
|
19
20
|
|
|
20
21
|
BIT_MASKS = [1, 2, 4, 8, 16, 32, 64, 128]
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
21
23
|
|
|
22
24
|
|
|
23
25
|
class ResultSetQueue(ABC):
|
|
@@ -71,6 +73,9 @@ class ResultSetQueueFactory(ABC):
|
|
|
71
73
|
)
|
|
72
74
|
return ArrowQueue(converted_arrow_table, n_valid_rows)
|
|
73
75
|
elif row_set_type == TSparkRowSetType.URL_BASED_SET:
|
|
76
|
+
logger.debug(
|
|
77
|
+
f"built cloud fetch queue for {len(t_row_set.resultLinks)} links."
|
|
78
|
+
)
|
|
74
79
|
return CloudFetchQueue(
|
|
75
80
|
arrow_schema_bytes,
|
|
76
81
|
start_row_offset=t_row_set.startRowOffset,
|
|
@@ -146,6 +151,9 @@ class CloudFetchQueue(ResultSetQueue):
|
|
|
146
151
|
self.lz4_compressed = lz4_compressed
|
|
147
152
|
self.description = description
|
|
148
153
|
|
|
154
|
+
logger.debug(
|
|
155
|
+
f"creating cloud fetch queue for {len(result_links)} links and max_download_threads {self.max_download_threads}."
|
|
156
|
+
)
|
|
149
157
|
self.download_manager = ResultFileDownloadManager(
|
|
150
158
|
self.max_download_threads, self.lz4_compressed
|
|
151
159
|
)
|
|
File without changes
|
|
File without changes
|
{databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/src/databricks/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{databricks_sql_connector-2.9.2.dev1 → databricks_sql_connector-2.9.4b1}/src/databricks/sql/types.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|