pixeltable 0.2.26__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245)
  1. pixeltable/__init__.py +83 -19
  2. pixeltable/_query.py +1444 -0
  3. pixeltable/_version.py +1 -0
  4. pixeltable/catalog/__init__.py +7 -4
  5. pixeltable/catalog/catalog.py +2394 -119
  6. pixeltable/catalog/column.py +225 -104
  7. pixeltable/catalog/dir.py +38 -9
  8. pixeltable/catalog/globals.py +53 -34
  9. pixeltable/catalog/insertable_table.py +265 -115
  10. pixeltable/catalog/path.py +80 -17
  11. pixeltable/catalog/schema_object.py +28 -43
  12. pixeltable/catalog/table.py +1270 -677
  13. pixeltable/catalog/table_metadata.py +103 -0
  14. pixeltable/catalog/table_version.py +1270 -751
  15. pixeltable/catalog/table_version_handle.py +109 -0
  16. pixeltable/catalog/table_version_path.py +137 -42
  17. pixeltable/catalog/tbl_ops.py +53 -0
  18. pixeltable/catalog/update_status.py +191 -0
  19. pixeltable/catalog/view.py +251 -134
  20. pixeltable/config.py +215 -0
  21. pixeltable/env.py +736 -285
  22. pixeltable/exceptions.py +26 -2
  23. pixeltable/exec/__init__.py +7 -2
  24. pixeltable/exec/aggregation_node.py +39 -21
  25. pixeltable/exec/cache_prefetch_node.py +87 -109
  26. pixeltable/exec/cell_materialization_node.py +268 -0
  27. pixeltable/exec/cell_reconstruction_node.py +168 -0
  28. pixeltable/exec/component_iteration_node.py +25 -28
  29. pixeltable/exec/data_row_batch.py +11 -46
  30. pixeltable/exec/exec_context.py +26 -11
  31. pixeltable/exec/exec_node.py +35 -27
  32. pixeltable/exec/expr_eval/__init__.py +3 -0
  33. pixeltable/exec/expr_eval/evaluators.py +365 -0
  34. pixeltable/exec/expr_eval/expr_eval_node.py +413 -0
  35. pixeltable/exec/expr_eval/globals.py +200 -0
  36. pixeltable/exec/expr_eval/row_buffer.py +74 -0
  37. pixeltable/exec/expr_eval/schedulers.py +413 -0
  38. pixeltable/exec/globals.py +35 -0
  39. pixeltable/exec/in_memory_data_node.py +35 -27
  40. pixeltable/exec/object_store_save_node.py +293 -0
  41. pixeltable/exec/row_update_node.py +44 -29
  42. pixeltable/exec/sql_node.py +414 -115
  43. pixeltable/exprs/__init__.py +8 -5
  44. pixeltable/exprs/arithmetic_expr.py +79 -45
  45. pixeltable/exprs/array_slice.py +5 -5
  46. pixeltable/exprs/column_property_ref.py +40 -26
  47. pixeltable/exprs/column_ref.py +254 -61
  48. pixeltable/exprs/comparison.py +14 -9
  49. pixeltable/exprs/compound_predicate.py +9 -10
  50. pixeltable/exprs/data_row.py +213 -72
  51. pixeltable/exprs/expr.py +270 -104
  52. pixeltable/exprs/expr_dict.py +6 -5
  53. pixeltable/exprs/expr_set.py +20 -11
  54. pixeltable/exprs/function_call.py +383 -284
  55. pixeltable/exprs/globals.py +18 -5
  56. pixeltable/exprs/in_predicate.py +7 -7
  57. pixeltable/exprs/inline_expr.py +37 -37
  58. pixeltable/exprs/is_null.py +8 -4
  59. pixeltable/exprs/json_mapper.py +120 -54
  60. pixeltable/exprs/json_path.py +90 -60
  61. pixeltable/exprs/literal.py +61 -16
  62. pixeltable/exprs/method_ref.py +7 -6
  63. pixeltable/exprs/object_ref.py +19 -8
  64. pixeltable/exprs/row_builder.py +238 -75
  65. pixeltable/exprs/rowid_ref.py +53 -15
  66. pixeltable/exprs/similarity_expr.py +65 -50
  67. pixeltable/exprs/sql_element_cache.py +5 -5
  68. pixeltable/exprs/string_op.py +107 -0
  69. pixeltable/exprs/type_cast.py +25 -13
  70. pixeltable/exprs/variable.py +2 -2
  71. pixeltable/func/__init__.py +9 -5
  72. pixeltable/func/aggregate_function.py +197 -92
  73. pixeltable/func/callable_function.py +119 -35
  74. pixeltable/func/expr_template_function.py +101 -48
  75. pixeltable/func/function.py +375 -62
  76. pixeltable/func/function_registry.py +20 -19
  77. pixeltable/func/globals.py +6 -5
  78. pixeltable/func/mcp.py +74 -0
  79. pixeltable/func/query_template_function.py +151 -35
  80. pixeltable/func/signature.py +178 -49
  81. pixeltable/func/tools.py +164 -0
  82. pixeltable/func/udf.py +176 -53
  83. pixeltable/functions/__init__.py +44 -4
  84. pixeltable/functions/anthropic.py +226 -47
  85. pixeltable/functions/audio.py +148 -11
  86. pixeltable/functions/bedrock.py +137 -0
  87. pixeltable/functions/date.py +188 -0
  88. pixeltable/functions/deepseek.py +113 -0
  89. pixeltable/functions/document.py +81 -0
  90. pixeltable/functions/fal.py +76 -0
  91. pixeltable/functions/fireworks.py +72 -20
  92. pixeltable/functions/gemini.py +249 -0
  93. pixeltable/functions/globals.py +208 -53
  94. pixeltable/functions/groq.py +108 -0
  95. pixeltable/functions/huggingface.py +1088 -95
  96. pixeltable/functions/image.py +155 -84
  97. pixeltable/functions/json.py +8 -11
  98. pixeltable/functions/llama_cpp.py +31 -19
  99. pixeltable/functions/math.py +169 -0
  100. pixeltable/functions/mistralai.py +50 -75
  101. pixeltable/functions/net.py +70 -0
  102. pixeltable/functions/ollama.py +29 -36
  103. pixeltable/functions/openai.py +548 -160
  104. pixeltable/functions/openrouter.py +143 -0
  105. pixeltable/functions/replicate.py +15 -14
  106. pixeltable/functions/reve.py +250 -0
  107. pixeltable/functions/string.py +310 -85
  108. pixeltable/functions/timestamp.py +37 -19
  109. pixeltable/functions/together.py +77 -120
  110. pixeltable/functions/twelvelabs.py +188 -0
  111. pixeltable/functions/util.py +7 -2
  112. pixeltable/functions/uuid.py +30 -0
  113. pixeltable/functions/video.py +1528 -117
  114. pixeltable/functions/vision.py +26 -26
  115. pixeltable/functions/voyageai.py +289 -0
  116. pixeltable/functions/whisper.py +19 -10
  117. pixeltable/functions/whisperx.py +179 -0
  118. pixeltable/functions/yolox.py +112 -0
  119. pixeltable/globals.py +716 -236
  120. pixeltable/index/__init__.py +3 -1
  121. pixeltable/index/base.py +17 -21
  122. pixeltable/index/btree.py +32 -22
  123. pixeltable/index/embedding_index.py +155 -92
  124. pixeltable/io/__init__.py +12 -7
  125. pixeltable/io/datarows.py +140 -0
  126. pixeltable/io/external_store.py +83 -125
  127. pixeltable/io/fiftyone.py +24 -33
  128. pixeltable/io/globals.py +47 -182
  129. pixeltable/io/hf_datasets.py +96 -127
  130. pixeltable/io/label_studio.py +171 -156
  131. pixeltable/io/lancedb.py +3 -0
  132. pixeltable/io/pandas.py +136 -115
  133. pixeltable/io/parquet.py +40 -153
  134. pixeltable/io/table_data_conduit.py +702 -0
  135. pixeltable/io/utils.py +100 -0
  136. pixeltable/iterators/__init__.py +8 -4
  137. pixeltable/iterators/audio.py +207 -0
  138. pixeltable/iterators/base.py +9 -3
  139. pixeltable/iterators/document.py +144 -87
  140. pixeltable/iterators/image.py +17 -38
  141. pixeltable/iterators/string.py +15 -12
  142. pixeltable/iterators/video.py +523 -127
  143. pixeltable/metadata/__init__.py +33 -8
  144. pixeltable/metadata/converters/convert_10.py +2 -3
  145. pixeltable/metadata/converters/convert_13.py +2 -2
  146. pixeltable/metadata/converters/convert_15.py +15 -11
  147. pixeltable/metadata/converters/convert_16.py +4 -5
  148. pixeltable/metadata/converters/convert_17.py +4 -5
  149. pixeltable/metadata/converters/convert_18.py +4 -6
  150. pixeltable/metadata/converters/convert_19.py +6 -9
  151. pixeltable/metadata/converters/convert_20.py +3 -6
  152. pixeltable/metadata/converters/convert_21.py +6 -8
  153. pixeltable/metadata/converters/convert_22.py +3 -2
  154. pixeltable/metadata/converters/convert_23.py +33 -0
  155. pixeltable/metadata/converters/convert_24.py +55 -0
  156. pixeltable/metadata/converters/convert_25.py +19 -0
  157. pixeltable/metadata/converters/convert_26.py +23 -0
  158. pixeltable/metadata/converters/convert_27.py +29 -0
  159. pixeltable/metadata/converters/convert_28.py +13 -0
  160. pixeltable/metadata/converters/convert_29.py +110 -0
  161. pixeltable/metadata/converters/convert_30.py +63 -0
  162. pixeltable/metadata/converters/convert_31.py +11 -0
  163. pixeltable/metadata/converters/convert_32.py +15 -0
  164. pixeltable/metadata/converters/convert_33.py +17 -0
  165. pixeltable/metadata/converters/convert_34.py +21 -0
  166. pixeltable/metadata/converters/convert_35.py +9 -0
  167. pixeltable/metadata/converters/convert_36.py +38 -0
  168. pixeltable/metadata/converters/convert_37.py +15 -0
  169. pixeltable/metadata/converters/convert_38.py +39 -0
  170. pixeltable/metadata/converters/convert_39.py +124 -0
  171. pixeltable/metadata/converters/convert_40.py +73 -0
  172. pixeltable/metadata/converters/convert_41.py +12 -0
  173. pixeltable/metadata/converters/convert_42.py +9 -0
  174. pixeltable/metadata/converters/convert_43.py +44 -0
  175. pixeltable/metadata/converters/util.py +44 -18
  176. pixeltable/metadata/notes.py +21 -0
  177. pixeltable/metadata/schema.py +185 -42
  178. pixeltable/metadata/utils.py +74 -0
  179. pixeltable/mypy/__init__.py +3 -0
  180. pixeltable/mypy/mypy_plugin.py +123 -0
  181. pixeltable/plan.py +616 -225
  182. pixeltable/share/__init__.py +3 -0
  183. pixeltable/share/packager.py +797 -0
  184. pixeltable/share/protocol/__init__.py +33 -0
  185. pixeltable/share/protocol/common.py +165 -0
  186. pixeltable/share/protocol/operation_types.py +33 -0
  187. pixeltable/share/protocol/replica.py +119 -0
  188. pixeltable/share/publish.py +349 -0
  189. pixeltable/store.py +398 -232
  190. pixeltable/type_system.py +730 -267
  191. pixeltable/utils/__init__.py +40 -0
  192. pixeltable/utils/arrow.py +201 -29
  193. pixeltable/utils/av.py +298 -0
  194. pixeltable/utils/azure_store.py +346 -0
  195. pixeltable/utils/coco.py +26 -27
  196. pixeltable/utils/code.py +4 -4
  197. pixeltable/utils/console_output.py +46 -0
  198. pixeltable/utils/coroutine.py +24 -0
  199. pixeltable/utils/dbms.py +92 -0
  200. pixeltable/utils/description_helper.py +11 -12
  201. pixeltable/utils/documents.py +60 -61
  202. pixeltable/utils/exception_handler.py +36 -0
  203. pixeltable/utils/filecache.py +38 -22
  204. pixeltable/utils/formatter.py +88 -51
  205. pixeltable/utils/gcs_store.py +295 -0
  206. pixeltable/utils/http.py +133 -0
  207. pixeltable/utils/http_server.py +14 -13
  208. pixeltable/utils/iceberg.py +13 -0
  209. pixeltable/utils/image.py +17 -0
  210. pixeltable/utils/lancedb.py +90 -0
  211. pixeltable/utils/local_store.py +322 -0
  212. pixeltable/utils/misc.py +5 -0
  213. pixeltable/utils/object_stores.py +573 -0
  214. pixeltable/utils/pydantic.py +60 -0
  215. pixeltable/utils/pytorch.py +20 -20
  216. pixeltable/utils/s3_store.py +527 -0
  217. pixeltable/utils/sql.py +32 -5
  218. pixeltable/utils/system.py +30 -0
  219. pixeltable/utils/transactional_directory.py +4 -3
  220. pixeltable-0.5.7.dist-info/METADATA +579 -0
  221. pixeltable-0.5.7.dist-info/RECORD +227 -0
  222. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
  223. pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
  224. pixeltable/__version__.py +0 -3
  225. pixeltable/catalog/named_function.py +0 -36
  226. pixeltable/catalog/path_dict.py +0 -141
  227. pixeltable/dataframe.py +0 -894
  228. pixeltable/exec/expr_eval_node.py +0 -232
  229. pixeltable/ext/__init__.py +0 -14
  230. pixeltable/ext/functions/__init__.py +0 -8
  231. pixeltable/ext/functions/whisperx.py +0 -77
  232. pixeltable/ext/functions/yolox.py +0 -157
  233. pixeltable/tool/create_test_db_dump.py +0 -311
  234. pixeltable/tool/create_test_video.py +0 -81
  235. pixeltable/tool/doc_plugins/griffe.py +0 -50
  236. pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
  237. pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
  238. pixeltable/tool/embed_udf.py +0 -9
  239. pixeltable/tool/mypy_plugin.py +0 -55
  240. pixeltable/utils/media_store.py +0 -76
  241. pixeltable/utils/s3.py +0 -16
  242. pixeltable-0.2.26.dist-info/METADATA +0 -400
  243. pixeltable-0.2.26.dist-info/RECORD +0 -156
  244. pixeltable-0.2.26.dist-info/entry_points.txt +0 -3
  245. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
pixeltable/utils/azure_store.py ADDED
@@ -0,0 +1,346 @@
+ import datetime
+ import logging
+ import re
+ import threading
+ import uuid
+ from datetime import timezone
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Iterator
+ from urllib.parse import quote
+
+ from azure.core.exceptions import AzureError
+ from azure.storage.blob import BlobSasPermissions, generate_blob_sas
+
+ from pixeltable import env, exceptions as excs
+ from pixeltable.config import Config
+ from pixeltable.utils.object_stores import ObjectPath, ObjectStoreBase, StorageObjectAddress
+
+ if TYPE_CHECKING:
+     from azure.storage.blob import BlobProperties, BlobServiceClient
+
+     from pixeltable.catalog import Column
+
+
+ _logger = logging.getLogger('pixeltable')
+
+
+ client_lock = threading.Lock()
+
+
+ @env.register_client('azure_blob')
+ def _() -> dict[str, 'BlobServiceClient']:
+     return {}
+
+
+ class AzureBlobStore(ObjectStoreBase):
+     """Class to handle Azure Blob Storage operations."""
+
+     # TODO: This needs to be redesigned to use asyncio.
+
+     # URI of the Azure Blob Storage container
+     # Always ends with a slash
+     __base_uri: str
+
+     # Storage account name
+     __account_name: str
+
+     # Container name extracted from the URI
+     __container_name: str
+
+     # Prefix path within the container, either empty or ending with a slash
+     __prefix_name: str
+
+     # URI scheme (wasb, wasbs, abfs, abfss, https)
+     __scheme: str
+
+     soa: StorageObjectAddress
+
+     def __init__(self, soa: StorageObjectAddress):
+         self.soa = soa
+         self.__scheme = soa.scheme
+         self.__account_name = soa.account
+         self.__container_name = soa.container
+         self.__prefix_name = soa.prefix
+
+         # Reconstruct base URI in normalized format
+         self.__base_uri = self.soa.prefix_free_uri + self.__prefix_name
+         _logger.info(
+             f'Initialized AzureBlobStore with base URI: {self.__base_uri}, '
+             f'account: {self.__account_name}, container: {self.__container_name}, prefix: {self.__prefix_name}'
+         )
+
+     def client(self) -> 'BlobServiceClient':
+         """Return the Azure Blob Storage client."""
+         client_dict: dict[str, 'BlobServiceClient'] = env.Env.get().get_client('azure_blob')
+         with client_lock:
+             uri = self.soa.container_free_uri
+             if uri not in client_dict:
+                 storage_account_name = Config.get().get_string_value('storage_account_name', section='azure')
+                 storage_account_key = Config.get().get_string_value('storage_account_key', section='azure')
+                 if (storage_account_name is None) != (storage_account_key is None):
+                     raise excs.Error(
+                         "Azure 'storage_account_name' and 'storage_account_key' must be specified together."
+                     )
+                 if storage_account_name is None or storage_account_name != self.__account_name:
+                     # Attempt a connection to a public resource, with no account key
+                     client_dict[uri] = self.create_client(endpoint_url=uri)
+                 else:
+                     client_dict[uri] = self.create_client(
+                         endpoint_url=uri, account_name=self.__account_name, account_key=storage_account_key
+                     )
+             return client_dict[uri]
+
+     @property
+     def account_name(self) -> str:
+         """Return the storage account name."""
+         return self.__account_name
+
+     @property
+     def container_name(self) -> str:
+         """Return the container name from the base URI."""
+         return self.__container_name
+
+     @property
+     def prefix(self) -> str:
+         """Return the prefix from the base URI."""
+         return self.__prefix_name
+
+     def validate(self, error_col_name: str) -> str | None:
+         """
+         Checks if the URI exists and is accessible.
+
+         Returns:
+             str: The base URI if the container exists and is accessible, None otherwise.
+         """
+         try:
+             container_client = self.client().get_container_client(self.container_name)
+             # Check if container exists by trying to get its properties
+             container_client.get_container_properties()
+             return self.__base_uri
+         except AzureError as e:
+             self.handle_azure_error(e, self.container_name, f'validate container {error_col_name}')
+             return None
+
+     def copy_object_to_local_file(self, src_path: str, dest_path: Path) -> None:
+         """Copies a blob to a local file. Thread safe."""
+         try:
+             blob_client = self.client().get_blob_client(container=self.container_name, blob=self.prefix + src_path)
+             with open(dest_path, 'wb') as download_file:
+                 download_stream = blob_client.download_blob()
+                 download_file.write(download_stream.readall())
+         except AzureError as e:
+             self.handle_azure_error(e, self.container_name, f'download file {src_path}')
+             raise
+
+     # TODO: utils package should not include back-references to `Column`
+     def copy_local_file(self, col: 'Column', src_path: Path) -> str:
+         """Copy a local file to Azure Blob Storage, and return its new URL"""
+         prefix, filename = ObjectPath.create_prefix_raw(
+             col.get_tbl().id, col.id, col.get_tbl().version, ext=src_path.suffix
+         )
+         blob_name = f'{self.prefix}{prefix}/{filename}'
+         new_file_uri = f'{self.__base_uri}{prefix}/{filename}'
+
+         try:
+             blob_client = self.client().get_blob_client(container=self.container_name, blob=blob_name)
+             with open(src_path, 'rb') as data:
+                 blob_client.upload_blob(data, overwrite=True)
+             _logger.debug(f'Media Storage: copied {src_path} to {new_file_uri}')
+             return new_file_uri
+         except AzureError as e:
+             self.handle_azure_error(e, self.container_name, f'upload file {src_path}')
+             raise
+
+     def _get_filtered_blobs(
+         self, tbl_id: uuid.UUID | None, tbl_version: int | None = None
+     ) -> Iterator['BlobProperties']:
+         """Private method to get filtered blobs for a table, optionally filtered by version.
+
+         Args:
+             tbl_id: Table UUID to filter by
+             tbl_version: Optional table version to filter by
+
+         Returns:
+             Iterator over blob objects matching the criteria
+         """
+         # Use ObjectPath to construct the prefix for this table
+         if tbl_id is None:
+             prefix = self.prefix
+             assert tbl_version is None, 'tbl_version must be None if tbl_id is None'
+         else:
+             table_prefix = ObjectPath.table_prefix(tbl_id)
+             prefix = f'{self.prefix}{table_prefix}/'
+
+         try:
+             container_client = self.client().get_container_client(self.container_name)
+
+             blob_iterator: Iterator['BlobProperties']
+             if tbl_version is None:
+                 # Return all blobs with the table prefix
+                 blob_iterator = container_client.list_blobs(name_starts_with=prefix)
+             else:
+                 # Filter by both table_id and table_version using the ObjectPath pattern
+                 # Pattern: tbl_id_col_id_version_uuid
+                 version_pattern = re.compile(
+                     rf'{re.escape(table_prefix)}_\d+_{re.escape(str(tbl_version))}_[0-9a-fA-F]+.*'
+                 )
+                 # Get all blobs with the prefix and filter by version pattern
+                 all_blobs = container_client.list_blobs(name_starts_with=prefix)
+                 blob_iterator = (blob for blob in all_blobs if version_pattern.match(blob.name.split('/')[-1]))
+
+             return blob_iterator
+
+         except AzureError as e:
+             self.handle_azure_error(e, self.container_name, f'setup iterator {self.prefix}')
+             raise
+
+     def count(self, tbl_id: uuid.UUID | None, tbl_version: int | None = None) -> int:
+         """Count the number of files belonging to tbl_id. If tbl_version is not None,
+         count only those files belonging to the specified tbl_version.
+
+         Args:
+             tbl_id: Table UUID to count blobs for
+             tbl_version: Optional table version to filter by
+
+         Returns:
+             Number of blobs matching the criteria
+         """
+         blob_iterator = self._get_filtered_blobs(tbl_id, tbl_version)
+         return sum(1 for _ in blob_iterator)
+
+     def delete(self, tbl_id: uuid.UUID, tbl_version: int | None = None) -> int:
+         """Delete all files belonging to tbl_id. If tbl_version is not None, delete
+         only those files belonging to the specified tbl_version.
+
+         Args:
+             tbl_id: Table UUID to delete blobs for
+             tbl_version: Optional table version to filter by
+
+         Returns:
+             Number of blobs deleted
+         """
+         assert tbl_id is not None
+         blob_iterator = self._get_filtered_blobs(tbl_id, tbl_version)
+         total_deleted = 0
+
+         try:
+             container_client = self.client().get_container_client(self.container_name)
+
+             for blob in blob_iterator:
+                 # TODO: Figure out how to properly use batch method delete_blobs(), it doesn't seem to work properly
+                 container_client.delete_blob(blob.name)
+                 total_deleted += 1
+
+             # print(f"Deleted {total_deleted} blobs from container '{self.container_name}'.")
+             return total_deleted
+
+         except AzureError as e:
+             self.handle_azure_error(e, self.container_name, f'deleting with {self.prefix}')
+             raise
+
+     def list_objects(self, return_uri: bool, n_max: int = 10) -> list[str]:
+         """Return a list of objects found in the specified destination bucket.
+         Each returned object includes the full set of prefixes.
+         If return_uri is True, full URIs are returned; otherwise, just the object keys.
+         """
+         p = self.soa.prefix_free_uri if return_uri else ''
+         r: list[str] = []
+         try:
+             blob_iterator = self._get_filtered_blobs(tbl_id=None, tbl_version=None)
+             for blob in blob_iterator:
+                 r.append(f'{p}{blob.name}')
+                 if len(r) >= n_max:
+                     return r
+
+         except AzureError as e:
+             self.handle_azure_error(e, self.__container_name, f'list objects from {self.__base_uri}')
+         return r
+
+     @classmethod
+     def handle_azure_error(
+         cls, e: 'AzureError', container_name: str, operation: str = '', *, ignore_404: bool = False
+     ) -> None:
+         from azure.core.exceptions import ClientAuthenticationError, HttpResponseError, ResourceNotFoundError
+
+         if ignore_404 and isinstance(e, ResourceNotFoundError):
+             return
+
+         if isinstance(e, ResourceNotFoundError):
+             raise excs.Error(f'Container {container_name} or blob not found during {operation}: {str(e)!r}')
+         elif isinstance(e, ClientAuthenticationError):
+             raise excs.Error(f'Authentication failed for container {container_name} during {operation}: {str(e)!r}')
+         elif isinstance(e, HttpResponseError):
+             if e.status_code == 403:
+                 raise excs.Error(f'Access denied to container {container_name} during {operation}: {str(e)!r}')
+             elif e.status_code == 412:
+                 raise excs.Error(f'Precondition failed for container {container_name} during {operation}: {str(e)!r}')
+             else:
+                 raise excs.Error(
+                     f'HTTP error during {operation} in container {container_name}: {e.status_code} - {str(e)!r}'
+                 )
+         else:
+             raise excs.Error(f'Error during {operation} in container {container_name}: {str(e)!r}')
+
+     def create_presigned_url(self, soa: StorageObjectAddress, expiration_seconds: int) -> str:
+         """Create a presigned URL for downloading an object from Azure Blob Storage."""
+         if not soa.has_object:
+             raise excs.Error(f'StorageObjectAddress does not contain an object name: {soa}')
+
+         azure_client = self.client()
+         account_name = azure_client.account_name if azure_client.account_name else self.__account_name
+
+         # Account key cannot be extracted from client for security reasons, get from config
+         storage_account_key = Config.get().get_string_value('storage_account_key', section='azure')
+
+         if not account_name or not storage_account_key:
+             raise excs.Error(
+                 'Azure storage_account_name and storage_account_key must be configured '
+                 'to generate presigned URLs. Set them in the config under the [azure] section, '
+                 'or include the account name in the Azure URL.'
+             )
+
+         # Use datetime.now(timezone.utc) + timedelta like in pixeltable cloud
+         expiry_time = datetime.datetime.now(timezone.utc) + datetime.timedelta(seconds=expiration_seconds)
+
+         sas_token = generate_blob_sas(
+             account_name=account_name,
+             container_name=soa.container,
+             blob_name=soa.key,
+             account_key=storage_account_key,
+             permission=BlobSasPermissions(read=True),
+             expiry=expiry_time,
+             version='2022-11-02',  # Specify API version to avoid version mismatch issues
+         )
+
+         # Build URL directly - URL encode the blob key to handle special characters
+         # Use safe='/' to preserve path separators in the blob key
+         encoded_key = quote(soa.key, safe='/')
+         blob_url = f'https://{account_name}.blob.core.windows.net/{soa.container}/{encoded_key}?{sas_token}'
+         return blob_url
+
+     @classmethod
+     def create_client(
+         cls, endpoint_url: str, account_name: str | None = None, account_key: str | None = None
+     ) -> 'BlobServiceClient':
+         from azure.core.credentials import AzureNamedKeyCredential
+         from azure.storage.blob import BlobServiceClient  # TODO: Use azure.storage.blob.aio instead
+
+         assert (account_name is None) == (account_key is None)
+         try:
+             # e.g. endpoint_url: str = f'https://{account_name}.blob.core.windows.net'
+             assert endpoint_url is not None, 'No Azure Storage account information provided'
+
+             # Use empty SAS token for anonymous authentication
+             credential = None
+             if account_name is not None:
+                 credential = AzureNamedKeyCredential(name=account_name, key=account_key)
+             return BlobServiceClient(
+                 account_url=endpoint_url,
+                 credential=credential,
+                 max_single_get_size=(32 * 2**20),
+                 max_chunk_get_size=(4 * 2**20),
+                 connection_timeout=15,
+                 read_timeout=30,
+             )
+         except Exception as e:
+             raise excs.Error(f'Failed to create Azure Blob Storage client: {str(e)!r}') from e
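The new AzureBlobStore resolves credentials from the [azure] config section ('storage_account_name' / 'storage_account_key') and falls back to anonymous access for public containers; create_presigned_url() additionally requires an address that names a specific blob. A rough usage sketch follows; how a StorageObjectAddress is built from a URI is not shown in this diff, so the from_uri() helper below is hypothetical:

from pixeltable.utils.azure_store import AzureBlobStore
from pixeltable.utils.object_stores import StorageObjectAddress

# hypothetical: parse an Azure container URI into a StorageObjectAddress
soa = StorageObjectAddress.from_uri('wasbs://media@myaccount.blob.core.windows.net/pxt/')
store = AzureBlobStore(soa)

if store.validate(error_col_name='img') is not None:        # container reachable?
    print(store.list_objects(return_uri=True, n_max=5))     # first few object URIs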
pixeltable/utils/coco.py CHANGED
@@ -22,6 +22,7 @@ Required format:
  }
  """
 
+
  def _verify_input_dict(input_dict: dict[str, Any]) -> None:
      """Verify that input_dict is a valid input dict for write_coco_dataset()"""
      if not isinstance(input_dict, dict):
@@ -30,7 +31,7 @@ def _verify_input_dict(input_dict: dict[str, Any]) -> None:
          raise excs.Error(f'Missing key "image" in input dict: {input_dict}{format_msg}')
      if not isinstance(input_dict['image'], PIL.Image.Image):
          raise excs.Error(f'Value for "image" is not a PIL.Image.Image: {input_dict}{format_msg}')
-     if 'annotations' not in input_dict:
+     if 'annotations' not in input_dict:
          raise excs.Error(f'Missing key "annotations" in input dict: {input_dict}{format_msg}')
      if not isinstance(input_dict['annotations'], list):
          raise excs.Error(f'Value for "annotations" is not a list: {input_dict}{format_msg}')
@@ -48,11 +49,12 @@ def _verify_input_dict(input_dict: dict[str, Any]) -> None:
          if not isinstance(annotation['category'], (str, int)):
              raise excs.Error(f'Value for "category" is not a str or int: {annotation}{format_msg}')
 
- def write_coco_dataset(df: pxt.DataFrame, dest_path: Path) -> Path:
-     """Export a DataFrame result set as a COCO dataset in dest_path and return the path of the data.json file."""
+
+ def write_coco_dataset(query: pxt.Query, dest_path: Path) -> Path:
+     """Export a ResultSet as a COCO dataset in dest_path and return the path of the data.json file."""
      # TODO: validate schema
-     if len(df._select_list_exprs) != 1 or not df._select_list_exprs[0].col_type.is_json_type():
-         raise excs.Error(f'Expected exactly one json-typed column in select list: {df._select_list_exprs}')
+     if len(query._select_list_exprs) != 1 or not query._select_list_exprs[0].col_type.is_json_type():
+         raise excs.Error(f'Expected exactly one json-typed column in select list: {query._select_list_exprs}')
      input_dict_slot_idx = -1  # df._select_list_exprs[0].slot_idx isn't valid until _exec()
 
      # create output dir
@@ -66,9 +68,9 @@ def write_coco_dataset(df: pxt.DataFrame, dest_path: Path) -> Path:
      annotations: list[dict[str, Any]] = []
      ann_id = -1
      categories: set[Any] = set()
-     for input_row in df._exec():
+     for input_row in query._exec():
          if input_dict_slot_idx == -1:
-             input_dict_expr = df._select_list_exprs[0]
+             input_dict_expr = query._select_list_exprs[0]
              input_dict_slot_idx = input_dict_expr.slot_idx
          input_dict = input_row[input_dict_slot_idx]
          _verify_input_dict(input_dict)
@@ -96,31 +98,28 @@ def write_coco_dataset(df: pxt.DataFrame, dest_path: Path) -> Path:
          img_path = images_dir / f'{img_id}.jpg'
          img.save(img_path)
 
-         images.append({
-             'id': img_id,
-             'file_name': str(img_path),
-             'width': img.width,
-             'height': img.height,
-         })
+         images.append({'id': img_id, 'file_name': str(img_path), 'width': img.width, 'height': img.height})
 
          # create annotation records for this image
          for annotation in input_dict['annotations']:
              ann_id += 1
-             x, y, w, h = annotation['bbox']
+             _, _, w, h = annotation['bbox']
              category = annotation['category']
              categories.add(category)
-             annotations.append({
-                 'id': ann_id,
-                 'image_id': img_id,
-                 # we use the category name here and fix it up at the end, when we have assigned category ids
-                 'category_id': category,
-                 'bbox': annotation['bbox'],
-                 'area': w * h,
-                 'iscrowd': 0,
-             })
+             annotations.append(
+                 {
+                     'id': ann_id,
+                     'image_id': img_id,
+                     # we use the category name here and fix it up at the end, when we have assigned category ids
+                     'category_id': category,
+                     'bbox': annotation['bbox'],
+                     'area': w * h,
+                     'iscrowd': 0,
+                 }
+             )
 
      # replace category names with ids
-     category_ids = {category: id for id, category in enumerate(sorted(list(categories)))}
+     category_ids = {category: id for id, category in enumerate(sorted(categories))}
      for annotation in annotations:
          annotation['category_id'] = category_ids[annotation['category_id']]
 
@@ -130,8 +129,8 @@ def write_coco_dataset(df: pxt.DataFrame, dest_path: Path) -> Path:
          'categories': [{'id': id, 'name': category} for category, id in category_ids.items()],
      }
      output_path = dest_path / 'data.json'
-     with open(output_path, 'w') as f:
-         json.dump(result, f)
+     with open(output_path, 'w', encoding='utf-8') as fp:
+         json.dump(result, fp)
      return output_path
 
 
@@ -226,5 +225,5 @@ COCO_2017_CATEGORIES = {
      87: 'scissors',
      88: 'teddy bear',
      89: 'hair drier',
-     90: 'toothbrush'
+     90: 'toothbrush',
  }
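For reference, the dict shape that _verify_input_dict() accepts, and that each row of the single JSON-typed column passed to write_coco_dataset() must produce, looks roughly like this (concrete values are illustrative):

input_dict = {
    'image': PIL.Image.open('frame_0001.jpg'),  # must be a PIL.Image.Image
    'annotations': [
        # bbox is [x, y, w, h]; category may be a str or an int
        {'bbox': [10, 20, 100, 80], 'category': 'person'},
    ],
}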
pixeltable/utils/code.py CHANGED
@@ -1,12 +1,11 @@
  import types
- from typing import Optional
 
  from pixeltable.func import Function
 
-
  # Utilities related to the organization of the Pixeltable codebase.
 
- def local_public_names(mod_name: str, exclude: Optional[list[str]] = None) -> list[str]:
+
+ def local_public_names(mod_name: str, exclude: list[str] | None = None) -> list[str]:
      """
      Returns a list of all functions and submodules that are local to the specified module and are
      publicly accessible. Intended to facilitate implementation of module __dir__() methods for
@@ -21,7 +20,8 @@ def local_public_names(mod_name: str, exclude: Optional[list[str]] = None) -> li
      for obj in mod.__dict__.values():
          if isinstance(obj, Function):
              # Pixeltable function
-             names.append(obj.name)
+             if not obj.name.startswith('_'):
+                 names.append(obj.name)
          elif isinstance(obj, types.FunctionType):
              # Python function
              if obj.__module__ == mod.__name__ and not obj.__name__.startswith('_'):
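As the docstring notes, local_public_names() is intended to back a module's __dir__(). A minimal sketch of that pattern (the excluded name is a placeholder, not taken from this diff):

from pixeltable.utils.code import local_public_names

__all__ = local_public_names(__name__, exclude=['util'])  # 'util' is a placeholder


def __dir__() -> list[str]:
    return __all__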
pixeltable/utils/console_output.py ADDED
@@ -0,0 +1,46 @@
+ import logging
+ from typing import TextIO
+
+ from pixeltable import exceptions as excs
+
+
+ def map_level(verbosity: int) -> int:
+     """
+     Map verbosity level to logging level.
+     0 - minimum logging - warn and above
+     1 - default logging - info and above
+     2 - more logging - debug and above
+     Args:
+
+     Returns:
+         Logging level as integer
+     """
+     if verbosity == 0:
+         return logging.WARN
+     if verbosity == 1:
+         return logging.INFO
+     if verbosity == 2:
+         return logging.DEBUG
+
+     raise excs.Error(f'Invalid verbosity level: {verbosity}')
+
+
+ class ConsoleOutputHandler(logging.StreamHandler):
+     def __init__(self, stream: TextIO):
+         super().__init__(stream)
+
+     def emit(self, record: logging.LogRecord) -> None:
+         if record.msg.endswith('\n'):
+             self.stream.write(record.msg)
+         else:
+             self.stream.write(record.msg + '\n')
+
+
+ class ConsoleMessageFilter(logging.Filter):
+     def filter(self, record: logging.LogRecord) -> bool:
+         return getattr(record, 'user_visible', False)
+
+
+ class ConsoleLogger(logging.LoggerAdapter):
+     def __init__(self, logger: logging.Logger):
+         super().__init__(logger, extra={'user_visible': True})
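A sketch of how these pieces might be wired together; the handler/filter attachment shown here is an assumption for illustration (the actual wiring lives elsewhere in the package):

import logging
import sys

from pixeltable.utils.console_output import ConsoleLogger, ConsoleMessageFilter, ConsoleOutputHandler, map_level

logger = logging.getLogger('pixeltable')
logger.setLevel(map_level(1))               # verbosity 1 -> logging.INFO

handler = ConsoleOutputHandler(sys.stdout)
handler.addFilter(ConsoleMessageFilter())   # only records marked user_visible reach the console
logger.addHandler(handler)

console = ConsoleLogger(logger)             # adapter tags every record with user_visible=True
console.info('Inserted 10 rows')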
pixeltable/utils/coroutine.py ADDED
@@ -0,0 +1,24 @@
+ import asyncio
+ import threading
+ from typing import Any, Coroutine, TypeVar
+
+ from pixeltable.env import Env
+
+ T = TypeVar('T')
+
+ # TODO This is a temporary hack to be able to run async UDFs in contexts that are not properly handled by the existing
+ # scheduler logic (e.g., as an embedding function as part of a similarity lookup). Once the scheduler is fully
+ # general, it can be removed.
+
+
+ def run_coroutine_synchronously(coroutine: Coroutine[Any, Any, T], timeout: float = 30) -> T:
+     """
+     Runs the given coroutine synchronously, even if called in the context of a running event loop.
+     """
+     loop = Env.get().event_loop
+
+     if threading.current_thread() is threading.main_thread():
+         return loop.run_until_complete(coroutine)
+     else:
+         # Not in main thread, use run_coroutine_threadsafe
+         return asyncio.run_coroutine_threadsafe(coroutine, loop).result(timeout)
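Intended use is roughly as follows; this assumes an initialized Pixeltable environment, since the helper pulls the event loop from Env.get():

from pixeltable.utils.coroutine import run_coroutine_synchronously


async def embed(text: str) -> list[float]:   # stand-in for an async UDF
    return [0.0, 0.0, 0.0]


vector = run_coroutine_synchronously(embed('hello'), timeout=30)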
pixeltable/utils/dbms.py ADDED
@@ -0,0 +1,92 @@
+ import abc
+
+ import sqlalchemy as sql
+
+
+ class Dbms(abc.ABC):
+     """
+     Provides abstractions for utilities to interact with a database system.
+     """
+
+     name: str
+     transaction_isolation_level: str
+     version_index_type: str
+     db_url: sql.URL
+
+     def __init__(self, name: str, transaction_isolation_level: str, version_index_type: str, db_url: sql.URL) -> None:
+         self.name = name
+         self.transaction_isolation_level = transaction_isolation_level
+         self.version_index_type = version_index_type
+         self.db_url = db_url
+
+     @abc.abstractmethod
+     def drop_db_stmt(self, database: str) -> str: ...
+
+     @abc.abstractmethod
+     def create_db_stmt(self, database: str) -> str: ...
+
+     @abc.abstractmethod
+     def default_system_db_url(self) -> str: ...
+
+     @abc.abstractmethod
+     def create_vector_index_stmt(
+         self, store_index_name: str, sa_value_col: sql.Column, metric: str
+     ) -> sql.Compiled: ...
+
+
+ class PostgresqlDbms(Dbms):
+     """
+     Implements utilities to interact with Postgres database.
+     """
+
+     def __init__(self, db_url: sql.URL):
+         super().__init__('postgresql', 'SERIALIZABLE', 'brin', db_url)
+
+     def drop_db_stmt(self, database: str) -> str:
+         return f'DROP DATABASE {database}'
+
+     def create_db_stmt(self, database: str) -> str:
+         return f"CREATE DATABASE {database} ENCODING 'utf-8' LC_COLLATE 'C' LC_CTYPE 'C' TEMPLATE template0"
+
+     def default_system_db_url(self) -> str:
+         a = self.db_url.set(database='postgres').render_as_string(hide_password=False)
+         return a
+
+     def create_vector_index_stmt(self, store_index_name: str, sa_value_col: sql.Column, metric: str) -> sql.Compiled:
+         from sqlalchemy.dialects import postgresql
+
+         sa_idx = sql.Index(
+             store_index_name,
+             sa_value_col,
+             postgresql_using='hnsw',
+             postgresql_with={'m': 16, 'ef_construction': 64},
+             postgresql_ops={sa_value_col.name: metric},
+         )
+         return sql.schema.CreateIndex(sa_idx, if_not_exists=True).compile(dialect=postgresql.dialect())
+
+
+ class CockroachDbms(Dbms):
+     """
+     Implements utilities to interact with CockroachDb database.
+     """
+
+     def __init__(self, db_url: sql.URL):
+         super().__init__('cockroachdb', 'SERIALIZABLE', 'btree', db_url)
+
+     def drop_db_stmt(self, database: str) -> str:
+         return f'DROP DATABASE {database} CASCADE'
+
+     def create_db_stmt(self, database: str) -> str:
+         return f"CREATE DATABASE {database} TEMPLATE template0 ENCODING 'utf-8' LC_COLLATE 'C' LC_CTYPE 'C'"
+
+     def default_system_db_url(self) -> str:
+         return self.db_url.set(database='defaultdb').render_as_string(hide_password=False)
+
+     def sa_vector_index(self, store_index_name: str, sa_value_col: sql.schema.Column, metric: str) -> sql.Index | None:
+         return None
+
+     def create_vector_index_stmt(self, store_index_name: str, sa_value_col: sql.Column, metric: str) -> sql.Compiled:
+         return sql.text(
+             f'CREATE VECTOR INDEX IF NOT EXISTS {store_index_name} ON {sa_value_col.table.name}'
+             f'({sa_value_col.name} {metric})'
+         ).compile()
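A sketch of how a Dbms instance might be used; the connection URL and driver name below are assumptions for illustration only:

import sqlalchemy as sql

from pixeltable.utils.dbms import PostgresqlDbms

url = sql.URL.create('postgresql+psycopg', username='postgres', host='localhost', database='pixeltable')
dbms = PostgresqlDbms(url)

print(dbms.create_db_stmt('pixeltable'))    # CREATE DATABASE pixeltable ENCODING 'utf-8' ...
print(dbms.default_system_db_url())         # same URL, pointed at the 'postgres' system database
# create_vector_index_stmt() additionally needs a sqlalchemy Column bound to a table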