pixeltable 0.3.14__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220)
  1. pixeltable/__init__.py +42 -8
  2. pixeltable/{dataframe.py → _query.py} +470 -206
  3. pixeltable/_version.py +1 -0
  4. pixeltable/catalog/__init__.py +5 -4
  5. pixeltable/catalog/catalog.py +1785 -432
  6. pixeltable/catalog/column.py +190 -113
  7. pixeltable/catalog/dir.py +2 -4
  8. pixeltable/catalog/globals.py +19 -46
  9. pixeltable/catalog/insertable_table.py +191 -98
  10. pixeltable/catalog/path.py +63 -23
  11. pixeltable/catalog/schema_object.py +11 -15
  12. pixeltable/catalog/table.py +843 -436
  13. pixeltable/catalog/table_metadata.py +103 -0
  14. pixeltable/catalog/table_version.py +978 -657
  15. pixeltable/catalog/table_version_handle.py +72 -16
  16. pixeltable/catalog/table_version_path.py +112 -43
  17. pixeltable/catalog/tbl_ops.py +53 -0
  18. pixeltable/catalog/update_status.py +191 -0
  19. pixeltable/catalog/view.py +134 -90
  20. pixeltable/config.py +134 -22
  21. pixeltable/env.py +471 -157
  22. pixeltable/exceptions.py +6 -0
  23. pixeltable/exec/__init__.py +4 -1
  24. pixeltable/exec/aggregation_node.py +7 -8
  25. pixeltable/exec/cache_prefetch_node.py +83 -110
  26. pixeltable/exec/cell_materialization_node.py +268 -0
  27. pixeltable/exec/cell_reconstruction_node.py +168 -0
  28. pixeltable/exec/component_iteration_node.py +4 -3
  29. pixeltable/exec/data_row_batch.py +8 -65
  30. pixeltable/exec/exec_context.py +16 -4
  31. pixeltable/exec/exec_node.py +13 -36
  32. pixeltable/exec/expr_eval/evaluators.py +11 -7
  33. pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
  34. pixeltable/exec/expr_eval/globals.py +8 -5
  35. pixeltable/exec/expr_eval/row_buffer.py +1 -2
  36. pixeltable/exec/expr_eval/schedulers.py +106 -56
  37. pixeltable/exec/globals.py +35 -0
  38. pixeltable/exec/in_memory_data_node.py +19 -19
  39. pixeltable/exec/object_store_save_node.py +293 -0
  40. pixeltable/exec/row_update_node.py +16 -9
  41. pixeltable/exec/sql_node.py +351 -84
  42. pixeltable/exprs/__init__.py +1 -1
  43. pixeltable/exprs/arithmetic_expr.py +27 -22
  44. pixeltable/exprs/array_slice.py +3 -3
  45. pixeltable/exprs/column_property_ref.py +36 -23
  46. pixeltable/exprs/column_ref.py +213 -89
  47. pixeltable/exprs/comparison.py +5 -5
  48. pixeltable/exprs/compound_predicate.py +5 -4
  49. pixeltable/exprs/data_row.py +164 -54
  50. pixeltable/exprs/expr.py +70 -44
  51. pixeltable/exprs/expr_dict.py +3 -3
  52. pixeltable/exprs/expr_set.py +17 -10
  53. pixeltable/exprs/function_call.py +100 -40
  54. pixeltable/exprs/globals.py +2 -2
  55. pixeltable/exprs/in_predicate.py +4 -4
  56. pixeltable/exprs/inline_expr.py +18 -32
  57. pixeltable/exprs/is_null.py +7 -3
  58. pixeltable/exprs/json_mapper.py +8 -8
  59. pixeltable/exprs/json_path.py +56 -22
  60. pixeltable/exprs/literal.py +27 -5
  61. pixeltable/exprs/method_ref.py +2 -2
  62. pixeltable/exprs/object_ref.py +2 -2
  63. pixeltable/exprs/row_builder.py +167 -67
  64. pixeltable/exprs/rowid_ref.py +25 -10
  65. pixeltable/exprs/similarity_expr.py +58 -40
  66. pixeltable/exprs/sql_element_cache.py +4 -4
  67. pixeltable/exprs/string_op.py +5 -5
  68. pixeltable/exprs/type_cast.py +3 -5
  69. pixeltable/func/__init__.py +1 -0
  70. pixeltable/func/aggregate_function.py +8 -8
  71. pixeltable/func/callable_function.py +9 -9
  72. pixeltable/func/expr_template_function.py +17 -11
  73. pixeltable/func/function.py +18 -20
  74. pixeltable/func/function_registry.py +6 -7
  75. pixeltable/func/globals.py +2 -3
  76. pixeltable/func/mcp.py +74 -0
  77. pixeltable/func/query_template_function.py +29 -27
  78. pixeltable/func/signature.py +46 -19
  79. pixeltable/func/tools.py +31 -13
  80. pixeltable/func/udf.py +18 -20
  81. pixeltable/functions/__init__.py +16 -0
  82. pixeltable/functions/anthropic.py +123 -77
  83. pixeltable/functions/audio.py +147 -10
  84. pixeltable/functions/bedrock.py +13 -6
  85. pixeltable/functions/date.py +7 -4
  86. pixeltable/functions/deepseek.py +35 -43
  87. pixeltable/functions/document.py +81 -0
  88. pixeltable/functions/fal.py +76 -0
  89. pixeltable/functions/fireworks.py +11 -20
  90. pixeltable/functions/gemini.py +195 -39
  91. pixeltable/functions/globals.py +142 -14
  92. pixeltable/functions/groq.py +108 -0
  93. pixeltable/functions/huggingface.py +1056 -24
  94. pixeltable/functions/image.py +115 -57
  95. pixeltable/functions/json.py +1 -1
  96. pixeltable/functions/llama_cpp.py +28 -13
  97. pixeltable/functions/math.py +67 -5
  98. pixeltable/functions/mistralai.py +18 -55
  99. pixeltable/functions/net.py +70 -0
  100. pixeltable/functions/ollama.py +20 -13
  101. pixeltable/functions/openai.py +240 -226
  102. pixeltable/functions/openrouter.py +143 -0
  103. pixeltable/functions/replicate.py +4 -4
  104. pixeltable/functions/reve.py +250 -0
  105. pixeltable/functions/string.py +239 -69
  106. pixeltable/functions/timestamp.py +16 -16
  107. pixeltable/functions/together.py +24 -84
  108. pixeltable/functions/twelvelabs.py +188 -0
  109. pixeltable/functions/util.py +6 -1
  110. pixeltable/functions/uuid.py +30 -0
  111. pixeltable/functions/video.py +1515 -107
  112. pixeltable/functions/vision.py +8 -8
  113. pixeltable/functions/voyageai.py +289 -0
  114. pixeltable/functions/whisper.py +16 -8
  115. pixeltable/functions/whisperx.py +179 -0
  116. pixeltable/{ext/functions → functions}/yolox.py +2 -4
  117. pixeltable/globals.py +362 -115
  118. pixeltable/index/base.py +17 -21
  119. pixeltable/index/btree.py +28 -22
  120. pixeltable/index/embedding_index.py +100 -118
  121. pixeltable/io/__init__.py +4 -2
  122. pixeltable/io/datarows.py +8 -7
  123. pixeltable/io/external_store.py +56 -105
  124. pixeltable/io/fiftyone.py +13 -13
  125. pixeltable/io/globals.py +31 -30
  126. pixeltable/io/hf_datasets.py +61 -16
  127. pixeltable/io/label_studio.py +74 -70
  128. pixeltable/io/lancedb.py +3 -0
  129. pixeltable/io/pandas.py +21 -12
  130. pixeltable/io/parquet.py +25 -105
  131. pixeltable/io/table_data_conduit.py +250 -123
  132. pixeltable/io/utils.py +4 -4
  133. pixeltable/iterators/__init__.py +2 -1
  134. pixeltable/iterators/audio.py +26 -25
  135. pixeltable/iterators/base.py +9 -3
  136. pixeltable/iterators/document.py +112 -78
  137. pixeltable/iterators/image.py +12 -15
  138. pixeltable/iterators/string.py +11 -4
  139. pixeltable/iterators/video.py +523 -120
  140. pixeltable/metadata/__init__.py +14 -3
  141. pixeltable/metadata/converters/convert_13.py +2 -2
  142. pixeltable/metadata/converters/convert_18.py +2 -2
  143. pixeltable/metadata/converters/convert_19.py +2 -2
  144. pixeltable/metadata/converters/convert_20.py +2 -2
  145. pixeltable/metadata/converters/convert_21.py +2 -2
  146. pixeltable/metadata/converters/convert_22.py +2 -2
  147. pixeltable/metadata/converters/convert_24.py +2 -2
  148. pixeltable/metadata/converters/convert_25.py +2 -2
  149. pixeltable/metadata/converters/convert_26.py +2 -2
  150. pixeltable/metadata/converters/convert_29.py +4 -4
  151. pixeltable/metadata/converters/convert_30.py +34 -21
  152. pixeltable/metadata/converters/convert_34.py +2 -2
  153. pixeltable/metadata/converters/convert_35.py +9 -0
  154. pixeltable/metadata/converters/convert_36.py +38 -0
  155. pixeltable/metadata/converters/convert_37.py +15 -0
  156. pixeltable/metadata/converters/convert_38.py +39 -0
  157. pixeltable/metadata/converters/convert_39.py +124 -0
  158. pixeltable/metadata/converters/convert_40.py +73 -0
  159. pixeltable/metadata/converters/convert_41.py +12 -0
  160. pixeltable/metadata/converters/convert_42.py +9 -0
  161. pixeltable/metadata/converters/convert_43.py +44 -0
  162. pixeltable/metadata/converters/util.py +20 -31
  163. pixeltable/metadata/notes.py +9 -0
  164. pixeltable/metadata/schema.py +140 -53
  165. pixeltable/metadata/utils.py +74 -0
  166. pixeltable/mypy/__init__.py +3 -0
  167. pixeltable/mypy/mypy_plugin.py +123 -0
  168. pixeltable/plan.py +382 -115
  169. pixeltable/share/__init__.py +1 -1
  170. pixeltable/share/packager.py +547 -83
  171. pixeltable/share/protocol/__init__.py +33 -0
  172. pixeltable/share/protocol/common.py +165 -0
  173. pixeltable/share/protocol/operation_types.py +33 -0
  174. pixeltable/share/protocol/replica.py +119 -0
  175. pixeltable/share/publish.py +257 -59
  176. pixeltable/store.py +311 -194
  177. pixeltable/type_system.py +373 -211
  178. pixeltable/utils/__init__.py +2 -3
  179. pixeltable/utils/arrow.py +131 -17
  180. pixeltable/utils/av.py +298 -0
  181. pixeltable/utils/azure_store.py +346 -0
  182. pixeltable/utils/coco.py +6 -6
  183. pixeltable/utils/code.py +3 -3
  184. pixeltable/utils/console_output.py +4 -1
  185. pixeltable/utils/coroutine.py +6 -23
  186. pixeltable/utils/dbms.py +32 -6
  187. pixeltable/utils/description_helper.py +4 -5
  188. pixeltable/utils/documents.py +7 -18
  189. pixeltable/utils/exception_handler.py +7 -30
  190. pixeltable/utils/filecache.py +6 -6
  191. pixeltable/utils/formatter.py +86 -48
  192. pixeltable/utils/gcs_store.py +295 -0
  193. pixeltable/utils/http.py +133 -0
  194. pixeltable/utils/http_server.py +2 -3
  195. pixeltable/utils/iceberg.py +1 -2
  196. pixeltable/utils/image.py +17 -0
  197. pixeltable/utils/lancedb.py +90 -0
  198. pixeltable/utils/local_store.py +322 -0
  199. pixeltable/utils/misc.py +5 -0
  200. pixeltable/utils/object_stores.py +573 -0
  201. pixeltable/utils/pydantic.py +60 -0
  202. pixeltable/utils/pytorch.py +5 -6
  203. pixeltable/utils/s3_store.py +527 -0
  204. pixeltable/utils/sql.py +26 -0
  205. pixeltable/utils/system.py +30 -0
  206. pixeltable-0.5.7.dist-info/METADATA +579 -0
  207. pixeltable-0.5.7.dist-info/RECORD +227 -0
  208. {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
  209. pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
  210. pixeltable/__version__.py +0 -3
  211. pixeltable/catalog/named_function.py +0 -40
  212. pixeltable/ext/__init__.py +0 -17
  213. pixeltable/ext/functions/__init__.py +0 -11
  214. pixeltable/ext/functions/whisperx.py +0 -77
  215. pixeltable/utils/media_store.py +0 -77
  216. pixeltable/utils/s3.py +0 -17
  217. pixeltable-0.3.14.dist-info/METADATA +0 -434
  218. pixeltable-0.3.14.dist-info/RECORD +0 -186
  219. pixeltable-0.3.14.dist-info/entry_points.txt +0 -3
  220. {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
pixeltable/utils/azure_store.py ADDED
@@ -0,0 +1,346 @@
+ import datetime
+ import logging
+ import re
+ import threading
+ import uuid
+ from datetime import timezone
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Iterator
+ from urllib.parse import quote
+
+ from azure.core.exceptions import AzureError
+ from azure.storage.blob import BlobSasPermissions, generate_blob_sas
+
+ from pixeltable import env, exceptions as excs
+ from pixeltable.config import Config
+ from pixeltable.utils.object_stores import ObjectPath, ObjectStoreBase, StorageObjectAddress
+
+ if TYPE_CHECKING:
+     from azure.storage.blob import BlobProperties, BlobServiceClient
+
+     from pixeltable.catalog import Column
+
+
+ _logger = logging.getLogger('pixeltable')
+
+
+ client_lock = threading.Lock()
+
+
+ @env.register_client('azure_blob')
+ def _() -> dict[str, 'BlobServiceClient']:
+     return {}
+
+
+ class AzureBlobStore(ObjectStoreBase):
+     """Class to handle Azure Blob Storage operations."""
+
+     # TODO: This needs to be redesigned to use asyncio.
+
+     # URI of the Azure Blob Storage container
+     # Always ends with a slash
+     __base_uri: str
+
+     # Storage account name
+     __account_name: str
+
+     # Container name extracted from the URI
+     __container_name: str
+
+     # Prefix path within the container, either empty or ending with a slash
+     __prefix_name: str
+
+     # URI scheme (wasb, wasbs, abfs, abfss, https)
+     __scheme: str
+
+     soa: StorageObjectAddress
+
+     def __init__(self, soa: StorageObjectAddress):
+         self.soa = soa
+         self.__scheme = soa.scheme
+         self.__account_name = soa.account
+         self.__container_name = soa.container
+         self.__prefix_name = soa.prefix
+
+         # Reconstruct base URI in normalized format
+         self.__base_uri = self.soa.prefix_free_uri + self.__prefix_name
+         _logger.info(
+             f'Initialized AzureBlobStore with base URI: {self.__base_uri}, '
+             f'account: {self.__account_name}, container: {self.__container_name}, prefix: {self.__prefix_name}'
+         )
+
+     def client(self) -> 'BlobServiceClient':
+         """Return the Azure Blob Storage client."""
+         client_dict: dict[str, 'BlobServiceClient'] = env.Env.get().get_client('azure_blob')
+         with client_lock:
+             uri = self.soa.container_free_uri
+             if uri not in client_dict:
+                 storage_account_name = Config.get().get_string_value('storage_account_name', section='azure')
+                 storage_account_key = Config.get().get_string_value('storage_account_key', section='azure')
+                 if (storage_account_name is None) != (storage_account_key is None):
+                     raise excs.Error(
+                         "Azure 'storage_account_name' and 'storage_account_key' must be specified together."
+                     )
+                 if storage_account_name is None or storage_account_name != self.__account_name:
+                     # Attempt a connection to a public resource, with no account key
+                     client_dict[uri] = self.create_client(endpoint_url=uri)
+                 else:
+                     client_dict[uri] = self.create_client(
+                         endpoint_url=uri, account_name=self.__account_name, account_key=storage_account_key
+                     )
+             return client_dict[uri]
+
+     @property
+     def account_name(self) -> str:
+         """Return the storage account name."""
+         return self.__account_name
+
+     @property
+     def container_name(self) -> str:
+         """Return the container name from the base URI."""
+         return self.__container_name
+
+     @property
+     def prefix(self) -> str:
+         """Return the prefix from the base URI."""
+         return self.__prefix_name
+
+     def validate(self, error_col_name: str) -> str | None:
+         """
+         Checks if the URI exists and is accessible.
+
+         Returns:
+             str: The base URI if the container exists and is accessible, None otherwise.
+         """
+         try:
+             container_client = self.client().get_container_client(self.container_name)
+             # Check if container exists by trying to get its properties
+             container_client.get_container_properties()
+             return self.__base_uri
+         except AzureError as e:
+             self.handle_azure_error(e, self.container_name, f'validate container {error_col_name}')
+         return None
+
+     def copy_object_to_local_file(self, src_path: str, dest_path: Path) -> None:
+         """Copies a blob to a local file. Thread safe."""
+         try:
+             blob_client = self.client().get_blob_client(container=self.container_name, blob=self.prefix + src_path)
+             with open(dest_path, 'wb') as download_file:
+                 download_stream = blob_client.download_blob()
+                 download_file.write(download_stream.readall())
+         except AzureError as e:
+             self.handle_azure_error(e, self.container_name, f'download file {src_path}')
+             raise
+
+     # TODO: utils package should not include back-references to `Column`
+     def copy_local_file(self, col: 'Column', src_path: Path) -> str:
+         """Copy a local file to Azure Blob Storage, and return its new URL"""
+         prefix, filename = ObjectPath.create_prefix_raw(
+             col.get_tbl().id, col.id, col.get_tbl().version, ext=src_path.suffix
+         )
+         blob_name = f'{self.prefix}{prefix}/{filename}'
+         new_file_uri = f'{self.__base_uri}{prefix}/{filename}'
+
+         try:
+             blob_client = self.client().get_blob_client(container=self.container_name, blob=blob_name)
+             with open(src_path, 'rb') as data:
+                 blob_client.upload_blob(data, overwrite=True)
+             _logger.debug(f'Media Storage: copied {src_path} to {new_file_uri}')
+             return new_file_uri
+         except AzureError as e:
+             self.handle_azure_error(e, self.container_name, f'upload file {src_path}')
+             raise
+
+     def _get_filtered_blobs(
+         self, tbl_id: uuid.UUID | None, tbl_version: int | None = None
+     ) -> Iterator['BlobProperties']:
+         """Private method to get filtered blobs for a table, optionally filtered by version.
+
+         Args:
+             tbl_id: Table UUID to filter by
+             tbl_version: Optional table version to filter by
+
+         Returns:
+             Iterator over blob objects matching the criteria
+         """
+         # Use ObjectPath to construct the prefix for this table
+         if tbl_id is None:
+             prefix = self.prefix
+             assert tbl_version is None, 'tbl_version must be None if tbl_id is None'
+         else:
+             table_prefix = ObjectPath.table_prefix(tbl_id)
+             prefix = f'{self.prefix}{table_prefix}/'
+
+         try:
+             container_client = self.client().get_container_client(self.container_name)
+
+             blob_iterator: Iterator['BlobProperties']
+             if tbl_version is None:
+                 # Return all blobs with the table prefix
+                 blob_iterator = container_client.list_blobs(name_starts_with=prefix)
+             else:
+                 # Filter by both table_id and table_version using the ObjectPath pattern
+                 # Pattern: tbl_id_col_id_version_uuid
+                 version_pattern = re.compile(
+                     rf'{re.escape(table_prefix)}_\d+_{re.escape(str(tbl_version))}_[0-9a-fA-F]+.*'
+                 )
+                 # Get all blobs with the prefix and filter by version pattern
+                 all_blobs = container_client.list_blobs(name_starts_with=prefix)
+                 blob_iterator = (blob for blob in all_blobs if version_pattern.match(blob.name.split('/')[-1]))
+
+             return blob_iterator
+
+         except AzureError as e:
+             self.handle_azure_error(e, self.container_name, f'setup iterator {self.prefix}')
+             raise
+
+     def count(self, tbl_id: uuid.UUID | None, tbl_version: int | None = None) -> int:
+         """Count the number of files belonging to tbl_id. If tbl_version is not None,
+         count only those files belonging to the specified tbl_version.
+
+         Args:
+             tbl_id: Table UUID to count blobs for
+             tbl_version: Optional table version to filter by
+
+         Returns:
+             Number of blobs matching the criteria
+         """
+         blob_iterator = self._get_filtered_blobs(tbl_id, tbl_version)
+         return sum(1 for _ in blob_iterator)
+
+     def delete(self, tbl_id: uuid.UUID, tbl_version: int | None = None) -> int:
+         """Delete all files belonging to tbl_id. If tbl_version is not None, delete
+         only those files belonging to the specified tbl_version.
+
+         Args:
+             tbl_id: Table UUID to delete blobs for
+             tbl_version: Optional table version to filter by
+
+         Returns:
+             Number of blobs deleted
+         """
+         assert tbl_id is not None
+         blob_iterator = self._get_filtered_blobs(tbl_id, tbl_version)
+         total_deleted = 0
+
+         try:
+             container_client = self.client().get_container_client(self.container_name)
+
+             for blob in blob_iterator:
+                 # TODO: Figure out how to properly use batch method delete_blobs(), it doesn't seem to work properly
+                 container_client.delete_blob(blob.name)
+                 total_deleted += 1
+
+             # print(f"Deleted {total_deleted} blobs from container '{self.container_name}'.")
+             return total_deleted
+
+         except AzureError as e:
+             self.handle_azure_error(e, self.container_name, f'deleting with {self.prefix}')
+             raise
+
+     def list_objects(self, return_uri: bool, n_max: int = 10) -> list[str]:
+         """Return a list of objects found in the specified destination bucket.
+         Each returned object includes the full set of prefixes.
+         If return_uri is True, full URIs are returned; otherwise, just the object keys.
+         """
+         p = self.soa.prefix_free_uri if return_uri else ''
+         r: list[str] = []
+         try:
+             blob_iterator = self._get_filtered_blobs(tbl_id=None, tbl_version=None)
+             for blob in blob_iterator:
+                 r.append(f'{p}{blob.name}')
+                 if len(r) >= n_max:
+                     return r
+
+         except AzureError as e:
+             self.handle_azure_error(e, self.__container_name, f'list objects from {self.__base_uri}')
+         return r
+
+     @classmethod
+     def handle_azure_error(
+         cls, e: 'AzureError', container_name: str, operation: str = '', *, ignore_404: bool = False
+     ) -> None:
+         from azure.core.exceptions import ClientAuthenticationError, HttpResponseError, ResourceNotFoundError
+
+         if ignore_404 and isinstance(e, ResourceNotFoundError):
+             return
+
+         if isinstance(e, ResourceNotFoundError):
+             raise excs.Error(f'Container {container_name} or blob not found during {operation}: {str(e)!r}')
+         elif isinstance(e, ClientAuthenticationError):
+             raise excs.Error(f'Authentication failed for container {container_name} during {operation}: {str(e)!r}')
+         elif isinstance(e, HttpResponseError):
+             if e.status_code == 403:
+                 raise excs.Error(f'Access denied to container {container_name} during {operation}: {str(e)!r}')
+             elif e.status_code == 412:
+                 raise excs.Error(f'Precondition failed for container {container_name} during {operation}: {str(e)!r}')
+             else:
+                 raise excs.Error(
+                     f'HTTP error during {operation} in container {container_name}: {e.status_code} - {str(e)!r}'
+                 )
+         else:
+             raise excs.Error(f'Error during {operation} in container {container_name}: {str(e)!r}')
+
+     def create_presigned_url(self, soa: StorageObjectAddress, expiration_seconds: int) -> str:
+         """Create a presigned URL for downloading an object from Azure Blob Storage."""
+         if not soa.has_object:
+             raise excs.Error(f'StorageObjectAddress does not contain an object name: {soa}')
+
+         azure_client = self.client()
+         account_name = azure_client.account_name if azure_client.account_name else self.__account_name
+
+         # Account key cannot be extracted from client for security reasons, get from config
+         storage_account_key = Config.get().get_string_value('storage_account_key', section='azure')
+
+         if not account_name or not storage_account_key:
+             raise excs.Error(
+                 'Azure storage_account_name and storage_account_key must be configured '
+                 'to generate presigned URLs. Set them in the config under the [azure] section, '
+                 'or include the account name in the Azure URL.'
+             )
+
+         # Use datetime.now(timezone.utc) + timedelta like in pixeltable cloud
+         expiry_time = datetime.datetime.now(timezone.utc) + datetime.timedelta(seconds=expiration_seconds)
+
+         sas_token = generate_blob_sas(
+             account_name=account_name,
+             container_name=soa.container,
+             blob_name=soa.key,
+             account_key=storage_account_key,
+             permission=BlobSasPermissions(read=True),
+             expiry=expiry_time,
+             version='2022-11-02', # Specify API version to avoid version mismatch issues
+         )
+
+         # Build URL directly - URL encode the blob key to handle special characters
+         # Use safe='/' to preserve path separators in the blob key
+         encoded_key = quote(soa.key, safe='/')
+         blob_url = f'https://{account_name}.blob.core.windows.net/{soa.container}/{encoded_key}?{sas_token}'
+         return blob_url
+
+     @classmethod
+     def create_client(
+         cls, endpoint_url: str, account_name: str | None = None, account_key: str | None = None
+     ) -> 'BlobServiceClient':
+         from azure.core.credentials import AzureNamedKeyCredential
+         from azure.storage.blob import BlobServiceClient # TODO: Use azure.storage.blob.aio instead
+
+         assert (account_name is None) == (account_key is None)
+         try:
+             # e.g. endpoint_url: str = f'https://{account_name}.blob.core.windows.net'
+             assert endpoint_url is not None, 'No Azure Storage account information provided'
+
+             # Use empty SAS token for anonymous authentication
+             credential = None
+             if account_name is not None:
+                 credential = AzureNamedKeyCredential(name=account_name, key=account_key)
+             return BlobServiceClient(
+                 account_url=endpoint_url,
+                 credential=credential,
+                 max_single_get_size=(32 * 2**20),
+                 max_chunk_get_size=(4 * 2**20),
+                 connection_timeout=15,
+                 read_timeout=30,
+             )
+         except Exception as e:
+             raise excs.Error(f'Failed to create Azure Blob Storage client: {str(e)!r}') from e
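Note: the new AzureBlobStore.create_presigned_url above builds a read-only SAS download link. A minimal standalone sketch of the same flow, using only the azure-storage-blob calls that appear in this module (the account, key, container, and blob names below are placeholders, not values from the diff):

# Illustrative sketch only; mirrors create_presigned_url, all identifiers are hypothetical.
import datetime
from urllib.parse import quote
from azure.storage.blob import BlobSasPermissions, generate_blob_sas

account_name = 'myaccount'                 # placeholder storage account
account_key = '<storage-account-key>'      # placeholder key (Pixeltable reads it from config)
container = 'media'                        # placeholder container
blob_key = 'tbl_0001/some image.png'       # placeholder blob key

sas_token = generate_blob_sas(
    account_name=account_name,
    container_name=container,
    blob_name=blob_key,
    account_key=account_key,
    permission=BlobSasPermissions(read=True),
    expiry=datetime.datetime.now(datetime.timezone.utc) + datetime.timedelta(hours=1),
)
# quote() with safe='/' keeps the virtual-directory separators in the blob key intact
url = f'https://{account_name}.blob.core.windows.net/{container}/{quote(blob_key, safe="/")}?{sas_token}'
print(url)  # time-limited, read-only download URL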
pixeltable/utils/coco.py CHANGED
@@ -50,11 +50,11 @@ def _verify_input_dict(input_dict: dict[str, Any]) -> None:
          raise excs.Error(f'Value for "category" is not a str or int: {annotation}{format_msg}')
 
 
- def write_coco_dataset(df: pxt.DataFrame, dest_path: Path) -> Path:
-     """Export a DataFrame result set as a COCO dataset in dest_path and return the path of the data.json file."""
+ def write_coco_dataset(query: pxt.Query, dest_path: Path) -> Path:
+     """Export a ResultSet as a COCO dataset in dest_path and return the path of the data.json file."""
      # TODO: validate schema
-     if len(df._select_list_exprs) != 1 or not df._select_list_exprs[0].col_type.is_json_type():
-         raise excs.Error(f'Expected exactly one json-typed column in select list: {df._select_list_exprs}')
+     if len(query._select_list_exprs) != 1 or not query._select_list_exprs[0].col_type.is_json_type():
+         raise excs.Error(f'Expected exactly one json-typed column in select list: {query._select_list_exprs}')
      input_dict_slot_idx = -1 # df._select_list_exprs[0].slot_idx isn't valid until _exec()
 
      # create output dir
@@ -68,9 +68,9 @@ def write_coco_dataset(df: pxt.DataFrame, dest_path: Path) -> Path:
      annotations: list[dict[str, Any]] = []
      ann_id = -1
      categories: set[Any] = set()
-     for input_row in df._exec():
+     for input_row in query._exec():
          if input_dict_slot_idx == -1:
-             input_dict_expr = df._select_list_exprs[0]
+             input_dict_expr = query._select_list_exprs[0]
              input_dict_slot_idx = input_dict_expr.slot_idx
          input_dict = input_row[input_dict_slot_idx]
          _verify_input_dict(input_dict)
pixeltable/utils/code.py CHANGED
@@ -1,12 +1,11 @@
  import types
- from typing import Optional
 
  from pixeltable.func import Function
 
  # Utilities related to the organization of the Pixeltable codebase.
 
 
- def local_public_names(mod_name: str, exclude: Optional[list[str]] = None) -> list[str]:
+ def local_public_names(mod_name: str, exclude: list[str] | None = None) -> list[str]:
      """
      Returns a list of all functions and submodules that are local to the specified module and are
      publicly accessible. Intended to facilitate implementation of module __dir__() methods for
@@ -21,7 +20,8 @@ def local_public_names(mod_name: str, exclude: Optional[list[str]] = None) -> li
      for obj in mod.__dict__.values():
          if isinstance(obj, Function):
              # Pixeltable function
-             names.append(obj.name)
+             if not obj.name.startswith('_'):
+                 names.append(obj.name)
          elif isinstance(obj, types.FunctionType):
              # Python function
              if obj.__module__ == mod.__name__ and not obj.__name__.startswith('_'):
pixeltable/utils/console_output.py CHANGED
@@ -1,6 +1,8 @@
  import logging
  from typing import TextIO
 
+ from pixeltable import exceptions as excs
+
 
  def map_level(verbosity: int) -> int:
      """
@@ -19,7 +21,8 @@ def map_level(verbosity: int) -> int:
          return logging.INFO
      if verbosity == 2:
          return logging.DEBUG
-     return logging.INFO
+
+     raise excs.Error(f'Invalid verbosity level: {verbosity}')
 
 
  class ConsoleOutputHandler(logging.StreamHandler):
pixeltable/utils/coroutine.py CHANGED
@@ -1,10 +1,10 @@
  import asyncio
  import threading
- from concurrent.futures import ThreadPoolExecutor
  from typing import Any, Coroutine, TypeVar
 
- T = TypeVar('T')
+ from pixeltable.env import Env
 
+ T = TypeVar('T')
 
  # TODO This is a temporary hack to be able to run async UDFs in contexts that are not properly handled by the existing
  # scheduler logic (e.g., as an embedding function as part of a similarity lookup). Once the scheduler is fully
@@ -15,27 +15,10 @@ def run_coroutine_synchronously(coroutine: Coroutine[Any, Any, T], timeout: floa
      """
      Runs the given coroutine synchronously, even if called in the context of a running event loop.
      """
-
-     def run_in_new_loop() -> T:
-         new_loop = asyncio.new_event_loop()
-         asyncio.set_event_loop(new_loop)
-         try:
-             return new_loop.run_until_complete(coroutine)
-         finally:
-             new_loop.close()
-
-     try:
-         loop = asyncio.get_running_loop()
-     except RuntimeError:
-         # No event loop; just call `asyncio.run()`
-         return asyncio.run(coroutine)
+     loop = Env.get().event_loop
 
      if threading.current_thread() is threading.main_thread():
-         if not loop.is_running():
-             return loop.run_until_complete(coroutine)
-         else:
-             with ThreadPoolExecutor() as pool:
-                 future = pool.submit(run_in_new_loop)
-                 return future.result(timeout=timeout)
+         return loop.run_until_complete(coroutine)
      else:
-         return asyncio.run_coroutine_threadsafe(coroutine, loop).result()
+         # Not in main thread, use run_coroutine_threadsafe
+         return asyncio.run_coroutine_threadsafe(coroutine, loop).result(timeout)
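Note: run_coroutine_synchronously now delegates to a single long-lived event loop owned by Env. A minimal standalone sketch of the underlying pattern (the background-thread loop here is only a stand-in for whatever Env.get().event_loop provides; it is not Pixeltable's implementation):

# Illustrative sketch: one shared loop in a background thread; coroutines are submitted
# from other threads via asyncio.run_coroutine_threadsafe and awaited synchronously.
import asyncio
import threading

loop = asyncio.new_event_loop()                                   # stand-in for Env.get().event_loop
threading.Thread(target=loop.run_forever, daemon=True).start()    # keep the loop running

async def double(x: int) -> int:
    await asyncio.sleep(0.1)
    return x * 2

# Submit from any other thread and block until the result (or a timeout) arrives.
result = asyncio.run_coroutine_threadsafe(double(21), loop).result(timeout=5)
print(result)  # 42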
pixeltable/utils/dbms.py CHANGED
@@ -1,6 +1,6 @@
  import abc
 
- from sqlalchemy import URL
+ import sqlalchemy as sql
 
 
  class Dbms(abc.ABC):
@@ -11,9 +11,9 @@ class Dbms(abc.ABC):
      name: str
      transaction_isolation_level: str
      version_index_type: str
-     db_url: URL
+     db_url: sql.URL
 
-     def __init__(self, name: str, transaction_isolation_level: str, version_index_type: str, db_url: URL) -> None:
+     def __init__(self, name: str, transaction_isolation_level: str, version_index_type: str, db_url: sql.URL) -> None:
          self.name = name
          self.transaction_isolation_level = transaction_isolation_level
          self.version_index_type = version_index_type
@@ -28,14 +28,19 @@ class Dbms(abc.ABC):
      @abc.abstractmethod
      def default_system_db_url(self) -> str: ...
 
+     @abc.abstractmethod
+     def create_vector_index_stmt(
+         self, store_index_name: str, sa_value_col: sql.Column, metric: str
+     ) -> sql.Compiled: ...
+
 
  class PostgresqlDbms(Dbms):
      """
      Implements utilities to interact with Postgres database.
      """
 
-     def __init__(self, db_url: URL):
-         super().__init__('postgresql', 'REPEATABLE READ', 'brin', db_url)
+     def __init__(self, db_url: sql.URL):
+         super().__init__('postgresql', 'SERIALIZABLE', 'brin', db_url)
 
      def drop_db_stmt(self, database: str) -> str:
          return f'DROP DATABASE {database}'
@@ -47,13 +52,25 @@ class PostgresqlDbms(Dbms):
          a = self.db_url.set(database='postgres').render_as_string(hide_password=False)
          return a
 
+     def create_vector_index_stmt(self, store_index_name: str, sa_value_col: sql.Column, metric: str) -> sql.Compiled:
+         from sqlalchemy.dialects import postgresql
+
+         sa_idx = sql.Index(
+             store_index_name,
+             sa_value_col,
+             postgresql_using='hnsw',
+             postgresql_with={'m': 16, 'ef_construction': 64},
+             postgresql_ops={sa_value_col.name: metric},
+         )
+         return sql.schema.CreateIndex(sa_idx, if_not_exists=True).compile(dialect=postgresql.dialect())
+
 
  class CockroachDbms(Dbms):
      """
      Implements utilities to interact with CockroachDb database.
      """
 
-     def __init__(self, db_url: URL):
+     def __init__(self, db_url: sql.URL):
          super().__init__('cockroachdb', 'SERIALIZABLE', 'btree', db_url)
 
      def drop_db_stmt(self, database: str) -> str:
@@ -64,3 +81,12 @@ class CockroachDbms(Dbms):
 
      def default_system_db_url(self) -> str:
          return self.db_url.set(database='defaultdb').render_as_string(hide_password=False)
+
+     def sa_vector_index(self, store_index_name: str, sa_value_col: sql.schema.Column, metric: str) -> sql.Index | None:
+         return None
+
+     def create_vector_index_stmt(self, store_index_name: str, sa_value_col: sql.Column, metric: str) -> sql.Compiled:
+         return sql.text(
+             f'CREATE VECTOR INDEX IF NOT EXISTS {store_index_name} ON {sa_value_col.table.name}'
+             f'({sa_value_col.name} {metric})'
+         ).compile()
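Note: the new abstract create_vector_index_stmt lets each DBMS emit its own vector-index DDL. A hedged sketch of what the PostgresqlDbms variant above compiles to; the table, column, and the vector_cosine_ops operator class are illustrative, and the column type is a stand-in rather than a real pgvector type:

# Illustrative sketch only: compile an HNSW index definition the way
# PostgresqlDbms.create_vector_index_stmt does; all names here are made up.
import sqlalchemy as sql
from sqlalchemy.dialects import postgresql

md = sql.MetaData()
tbl = sql.Table('embeddings', md, sql.Column('value', sql.LargeBinary()))  # stand-in column type

idx = sql.Index(
    'idx_embeddings_value',
    tbl.c.value,
    postgresql_using='hnsw',
    postgresql_with={'m': 16, 'ef_construction': 64},
    postgresql_ops={'value': 'vector_cosine_ops'},
)
stmt = sql.schema.CreateIndex(idx, if_not_exists=True).compile(dialect=postgresql.dialect())
print(stmt)
# prints roughly:
#   CREATE INDEX IF NOT EXISTS idx_embeddings_value ON embeddings
#   USING hnsw (value vector_cosine_ops) WITH (m = 16, ef_construction = 64)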
pixeltable/utils/description_helper.py CHANGED
@@ -1,5 +1,4 @@
  import dataclasses
- from typing import Optional, Union
 
  import pandas as pd
  from pandas.io.formats.style import Styler
@@ -7,11 +6,11 @@ from pandas.io.formats.style import Styler
 
  @dataclasses.dataclass
  class _Descriptor:
-     body: Union[str, pd.DataFrame]
+     body: str | pd.DataFrame
      # The remaining fields only affect the behavior if `body` is a pd.DataFrame.
      show_index: bool
      show_header: bool
-     styler: Optional[Styler] = None
+     styler: Styler | None = None
 
 
  class DescriptionHelper:
@@ -33,10 +32,10 @@ class DescriptionHelper:
 
      def append(
          self,
-         descriptor: Union[str, pd.DataFrame],
+         descriptor: str | pd.DataFrame,
          show_index: bool = False,
          show_header: bool = True,
-         styler: Optional[Styler] = None,
+         styler: Styler | None = None,
      ) -> None:
          self.__descriptors.append(_Descriptor(descriptor, show_index, show_header, styler))
 
pixeltable/utils/documents.py CHANGED
@@ -1,10 +1,9 @@
  import dataclasses
  import os
- from typing import Optional
 
  import bs4
- import fitz # type: ignore[import-untyped]
  import puremagic
+ from pypdfium2 import PdfDocument # type: ignore[import-untyped]
 
  from pixeltable import exceptions as excs, type_system as ts
  from pixeltable.env import Env
@@ -13,10 +12,10 @@ from pixeltable.env import Env
  @dataclasses.dataclass
  class DocumentHandle:
      format: ts.DocumentType.DocumentFormat
-     bs_doc: Optional[bs4.BeautifulSoup] = None
-     md_ast: Optional[dict] = None
-     pdf_doc: Optional[fitz.Document] = None
-     txt_doc: Optional[str] = None
+     bs_doc: bs4.BeautifulSoup | None = None
+     md_ast: dict | None = None
+     pdf_doc: PdfDocument | None = None
+     txt_doc: str | None = None
 
 
  def get_document_handle(path: str) -> DocumentHandle:
@@ -34,7 +33,7 @@ def get_document_handle(path: str) -> DocumentHandle:
      raise excs.Error(f'Unrecognized document format: {path}')
 
 
- def get_handle_by_extension(path: str, extension: str) -> Optional[DocumentHandle]:
+ def get_handle_by_extension(path: str, extension: str) -> DocumentHandle | None:
      doc_format = ts.DocumentType.DocumentFormat.from_extension(extension)
 
      try:
@@ -43,7 +42,7 @@ def get_handle_by_extension(path: str, extension: str) -> Optional[DocumentHandl
          if doc_format == ts.DocumentType.DocumentFormat.MD:
              return DocumentHandle(doc_format, md_ast=get_markdown_handle(path))
          if doc_format == ts.DocumentType.DocumentFormat.PDF:
-             return DocumentHandle(doc_format, pdf_doc=get_pdf_handle(path))
+             return DocumentHandle(doc_format, pdf_doc=PdfDocument(path))
          if doc_format == ts.DocumentType.DocumentFormat.XML:
              return DocumentHandle(doc_format, bs_doc=get_xml_handle(path))
          if doc_format == ts.DocumentType.DocumentFormat.TXT:
@@ -72,16 +71,6 @@ def get_markdown_handle(path: str) -> dict:
      return md_ast(text)
 
 
- def get_pdf_handle(path: str) -> fitz.Document:
-     doc = fitz.open(path)
-     # check pdf (bc it will work for images)
-     if not doc.is_pdf:
-         raise excs.Error(f'Not a valid PDF document: {path}')
-     # try to read one page
-     next(page for page in doc)
-     return doc
-
-
  def get_xml_handle(path: str) -> bs4.BeautifulSoup:
      with open(path, 'r', encoding='utf8') as fp:
          doc = bs4.BeautifulSoup(fp, 'xml')
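Note: the PDF backend moves from PyMuPDF (fitz) to pypdfium2, so DocumentHandle.pdf_doc now holds a pypdfium2 PdfDocument. A brief usage sketch of that object; 'report.pdf' is a placeholder path, and the text-extraction calls come from the pypdfium2 library, not from this diff:

# Illustrative sketch only: open a PDF with pypdfium2 and read the first page's text.
from pypdfium2 import PdfDocument

pdf = PdfDocument('report.pdf')                # placeholder path
print(len(pdf))                                # number of pages
text = pdf[0].get_textpage().get_text_range()  # text of the first page
print(text[:200])
pdf.close()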