databricks-sdk 0.68.0__py3-none-any.whl → 0.69.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of databricks-sdk has been flagged as potentially problematic.

@@ -181,11 +181,7 @@ def _make_dbutils(config: client.Config):
 
 
 def _make_files_client(apiClient: client.ApiClient, config: client.Config):
-    if config.enable_experimental_files_api_client:
-        _LOG.info("Experimental Files API client is enabled")
-        return FilesExt(apiClient, config)
-    else:
-        return FilesAPI(apiClient)
+    return FilesExt(apiClient, config)
 
 
 class WorkspaceClient:
@@ -603,11 +599,6 @@ class WorkspaceClient:
         """A feature store is a centralized repository that enables data scientists to find and share features."""
         return self._feature_store
 
-    @property
-    def files(self) -> pkg_files.FilesAPI:
-        """The Files API is a standard HTTP API that allows you to read, write, list, and delete files and directories by referring to their URI."""
-        return self._files
-
     @property
     def functions(self) -> pkg_catalog.FunctionsAPI:
         """Functions implement User-Defined Functions (UDFs) in Unity Catalog."""
@@ -1013,6 +1004,11 @@ class WorkspaceClient:
         """User identities recognized by Databricks and represented by email addresses."""
         return self._users
 
+    @property
+    def files(self) -> FilesExt:
+        """The Files API is a standard HTTP API that allows you to read, write, list, and delete files and directories by referring to their URI."""
+        return self._files
+
     def get_workspace_id(self) -> int:
         """Get the workspace ID of the workspace that this client is connected to."""
         response = self._api_client.do("GET", "/api/2.0/preview/scim/v2/Me", response_headers=["X-Databricks-Org-Id"])
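Taken together, these hunks make the extended Files client the default: `_make_files_client` now unconditionally returns `FilesExt`, and the `files` property is re-typed from `pkg_files.FilesAPI` to `FilesExt`. Below is a minimal usage sketch, assuming ambient Databricks authentication and that `FilesExt` keeps the `upload`/`download` surface of `FilesAPI`; the Volumes path is illustrative, and the opt-out flag at the end is the new config attribute introduced in `config.py` further down (how `FilesExt` honors it is not visible in this diff).

```python
import io

from databricks.sdk import WorkspaceClient
from databricks.sdk.core import Config

# With 0.69.0, w.files is the extended client (FilesExt) by default.
# Credentials are assumed to be resolved from the environment / .databrickscfg.
w = WorkspaceClient()

path = "/Volumes/main/default/vol/hello.txt"  # illustrative path
w.files.upload(path, io.BytesIO(b"hello"), overwrite=True)
resp = w.files.download(path)
print(resp.contents.read())

# The new opt-out flag (or DATABRICKS_DISABLE_EXPERIMENTAL_FILES_API_CLIENT=true)
# is presumably consulted inside FilesExt, since _make_files_client no longer
# branches on it; this fallback behavior is an assumption, not shown in the diff.
w_plain = WorkspaceClient(config=Config(disable_experimental_files_api_client=True))
```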
databricks/sdk/config.py CHANGED
@@ -6,7 +6,7 @@ import os
 import pathlib
 import sys
 import urllib.parse
-from typing import Dict, Iterable, Optional
+from typing import Dict, Iterable, List, Optional
 
 import requests
 
@@ -110,18 +110,27 @@ class Config:
 
     disable_async_token_refresh: bool = ConfigAttribute(env="DATABRICKS_DISABLE_ASYNC_TOKEN_REFRESH")
 
-    enable_experimental_files_api_client: bool = ConfigAttribute(env="DATABRICKS_ENABLE_EXPERIMENTAL_FILES_API_CLIENT")
-    files_api_client_download_max_total_recovers = None
-    files_api_client_download_max_total_recovers_without_progressing = 1
+    disable_experimental_files_api_client: bool = ConfigAttribute(
+        env="DATABRICKS_DISABLE_EXPERIMENTAL_FILES_API_CLIENT"
+    )
+
+    files_ext_client_download_streaming_chunk_size: int = 2 * 1024 * 1024  # 2 MiB
+
+    # When downloading a file, the maximum number of attempts to retry downloading the whole file. Default is no limit.
+    files_ext_client_download_max_total_recovers: Optional[int] = None
 
-    # File multipart upload parameters
+    # When downloading a file, the maximum number of attempts to retry downloading from the same offset without progressing.
+    # This is to avoid infinite retrying when the download is not making any progress. Default is 1.
+    files_ext_client_download_max_total_recovers_without_progressing = 1
+
+    # File multipart upload/download parameters
     # ----------------------
 
     # Minimal input stream size (bytes) to use multipart / resumable uploads.
     # For small files it's more efficient to make one single-shot upload request.
     # When uploading a file, SDK will initially buffer this many bytes from input stream.
     # This parameter can be less or bigger than multipart_upload_chunk_size.
-    multipart_upload_min_stream_size: int = 5 * 1024 * 1024
+    files_ext_multipart_upload_min_stream_size: int = 50 * 1024 * 1024
 
     # Maximum number of presigned URLs that can be requested at a time.
     #
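The two download-recovery knobs introduced above bound how often a streaming download may be resumed: `files_ext_client_download_max_total_recovers` caps total resumes (`None` meaning unlimited), while `files_ext_client_download_max_total_recovers_without_progressing` caps consecutive resumes from the same byte offset. The helper below is a hypothetical sketch of that decision logic, not the SDK's implementation, included only to make the semantics concrete.

```python
from typing import Optional


def should_recover(total_recovers: int,
                   recovers_without_progressing: int,
                   max_total: Optional[int],
                   max_without_progress: int) -> bool:
    """Decide whether another download resume attempt is allowed.

    max_total=None means "no limit", mirroring the default of
    files_ext_client_download_max_total_recovers.
    """
    if max_total is not None and total_recovers >= max_total:
        return False
    if recovers_without_progressing >= max_without_progress:
        return False
    return True


# With the defaults (None, 1): unlimited resumes overall, but at most one
# retry from the same offset before the download fails.
assert should_recover(5, 0, None, 1) is True
assert should_recover(5, 1, None, 1) is False
```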
@@ -131,23 +140,59 @@ class Config:
     # the stream back. In case of a non-seekable stream we cannot rewind, so we'll abort
     # the upload. To reduce the chance of this, we're requesting presigned URLs one by one
     # and using them immediately.
-    multipart_upload_batch_url_count: int = 1
+    files_ext_multipart_upload_batch_url_count: int = 1
 
-    # Size of the chunk to use for multipart uploads.
+    # Size of the chunk to use for multipart uploads & downloads.
     #
     # The smaller chunk is, the less chance for network errors (or URL get expired),
     # but the more requests we'll make.
     # For AWS, minimum is 5Mb: https://docs.aws.amazon.com/AmazonS3/latest/userguide/qfacts.html
     # For GCP, minimum is 256 KiB (and also recommended multiple is 256 KiB)
     # boto uses 8Mb: https://boto3.amazonaws.com/v1/documentation/api/latest/reference/customizations/s3.html#boto3.s3.transfer.TransferConfig
-    multipart_upload_chunk_size: int = 10 * 1024 * 1024
-
-    # use maximum duration of 1 hour
-    multipart_upload_url_expiration_duration: datetime.timedelta = datetime.timedelta(hours=1)
+    files_ext_multipart_upload_default_part_size: int = 10 * 1024 * 1024  # 10 MiB
+
+    # List of multipart upload part sizes that can be automatically selected
+    files_ext_multipart_upload_part_size_options: List[int] = [
+        10 * 1024 * 1024,  # 10 MiB
+        20 * 1024 * 1024,  # 20 MiB
+        50 * 1024 * 1024,  # 50 MiB
+        100 * 1024 * 1024,  # 100 MiB
+        200 * 1024 * 1024,  # 200 MiB
+        500 * 1024 * 1024,  # 500 MiB
+        1 * 1024 * 1024 * 1024,  # 1 GiB
+        2 * 1024 * 1024 * 1024,  # 2 GiB
+        4 * 1024 * 1024 * 1024,  # 4 GiB
+    ]
+
+    # Maximum size of a single part in multipart upload.
+    # For AWS, maximum is 5 GiB: https://docs.aws.amazon.com/AmazonS3/latest/userguide/qfacts.html
+    # For Azure, maximum is 4 GiB: https://learn.microsoft.com/en-us/rest/api/storageservices/put-block
+    # For CloudFlare R2, maximum is 5 GiB: https://developers.cloudflare.com/r2/objects/multipart-objects/
+    files_ext_multipart_upload_max_part_size: int = 4 * 1024 * 1024 * 1024  # 4 GiB
+
+    # Default parallel multipart upload concurrency. Set to 10 because of the experiment results show that it
+    # gives good performance result.
+    files_ext_multipart_upload_default_parallelism: int = 10
+
+    # The expiration duration for presigned URLs used in multipart uploads and downloads.
+    # The client will request new presigned URLs if the previous one is expired. The duration should be long enough
+    # to complete the upload or download of a single part.
+    files_ext_multipart_upload_url_expiration_duration: datetime.timedelta = datetime.timedelta(hours=1)
+    files_ext_presigned_download_url_expiration_duration: datetime.timedelta = datetime.timedelta(hours=1)
+
+    # When downloading a file in parallel, how many worker threads to use.
+    files_ext_parallel_download_default_parallelism: int = 10
+
+    # When downloading a file, if the file size is smaller than this threshold,
+    # We'll use a single-threaded download even if the parallel download is enabled.
+    files_ext_parallel_download_min_file_size: int = 50 * 1024 * 1024  # 50 MiB
+
+    # Default chunk size to use when downloading a file in parallel. Not effective for single threaded download.
+    files_ext_parallel_download_default_part_size: int = 10 * 1024 * 1024  # 10 MiB
 
     # This is not a "wall time" cutoff for the whole upload request,
     # but a maximum time between consecutive data reception events (even 1 byte) from the server
-    multipart_upload_single_chunk_upload_timeout_seconds: float = 60
+    files_ext_network_transfer_inactivity_timeout_seconds: float = 60
 
     # Cap on the number of custom retries during incremental uploads:
     # 1) multipart: upload part URL is expired, so new upload URLs must be requested to continue upload
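The part-size ladder above exists because object stores cap both the size of a single part and the number of parts per upload, so the client has to trade part size against part count as files grow. The snippet below is an illustrative selection strategy under an assumed 10,000-part limit (the S3 figure); the SDK's actual selection logic is not shown in this diff.

```python
from typing import List

# Same ladder as files_ext_multipart_upload_part_size_options in the diff above.
PART_SIZE_OPTIONS: List[int] = [
    10 * 1024 * 1024,
    20 * 1024 * 1024,
    50 * 1024 * 1024,
    100 * 1024 * 1024,
    200 * 1024 * 1024,
    500 * 1024 * 1024,
    1 * 1024 * 1024 * 1024,
    2 * 1024 * 1024 * 1024,
    4 * 1024 * 1024 * 1024,
]


def pick_part_size(file_size: int, max_parts: int = 10_000) -> int:
    """Hypothetical: choose the smallest part size that keeps the upload within max_parts."""
    for part_size in PART_SIZE_OPTIONS:
        parts_needed = -(-file_size // part_size)  # ceiling division
        if parts_needed <= max_parts:
            return part_size
    return PART_SIZE_OPTIONS[-1]


size_200_gib = 200 * 1024 * 1024 * 1024
print(pick_part_size(size_200_gib) // (1024 * 1024), "MiB")  # -> 50 MiB
```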
@@ -155,7 +200,10 @@ class Config:
     # retrieved to continue the upload.
     # In these two cases standard SDK retries (which are capped by the `retry_timeout_seconds` option) are not used.
     # Note that retry counter is reset when upload is successfully resumed.
-    multipart_upload_max_retries = 3
+    files_ext_multipart_upload_max_retries = 3
+
+    # Cap on the number of custom retries during parallel downloads.
+    files_ext_parallel_download_max_retries = 3
 
     def __init__(
         self,
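Since the renamed `files_ext_*` attributes are plain class-level defaults on `Config`, callers who need different transfer behavior can presumably override them on a config instance before constructing the client. A hedged sketch, using only attribute names that appear in this diff and assuming credentials are resolved from the environment:

```python
from databricks.sdk import WorkspaceClient
from databricks.sdk.core import Config

# Credentials assumed to come from the environment / .databrickscfg.
cfg = Config()

# Transfer tuning via the renamed files_ext_* knobs (names taken from the diff above).
cfg.files_ext_multipart_upload_default_parallelism = 4       # fewer upload workers
cfg.files_ext_parallel_download_default_parallelism = 4      # fewer download workers
cfg.files_ext_network_transfer_inactivity_timeout_seconds = 120.0
cfg.files_ext_multipart_upload_max_retries = 5
cfg.files_ext_parallel_download_max_retries = 5

w = WorkspaceClient(config=cfg)
```

Whether each override is honored at runtime depends on how `FilesExt` reads its configuration in 0.69.0; the attribute names themselves are taken directly from the diff.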