pixeltable 0.2.26__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245) hide show
  1. pixeltable/__init__.py +83 -19
  2. pixeltable/_query.py +1444 -0
  3. pixeltable/_version.py +1 -0
  4. pixeltable/catalog/__init__.py +7 -4
  5. pixeltable/catalog/catalog.py +2394 -119
  6. pixeltable/catalog/column.py +225 -104
  7. pixeltable/catalog/dir.py +38 -9
  8. pixeltable/catalog/globals.py +53 -34
  9. pixeltable/catalog/insertable_table.py +265 -115
  10. pixeltable/catalog/path.py +80 -17
  11. pixeltable/catalog/schema_object.py +28 -43
  12. pixeltable/catalog/table.py +1270 -677
  13. pixeltable/catalog/table_metadata.py +103 -0
  14. pixeltable/catalog/table_version.py +1270 -751
  15. pixeltable/catalog/table_version_handle.py +109 -0
  16. pixeltable/catalog/table_version_path.py +137 -42
  17. pixeltable/catalog/tbl_ops.py +53 -0
  18. pixeltable/catalog/update_status.py +191 -0
  19. pixeltable/catalog/view.py +251 -134
  20. pixeltable/config.py +215 -0
  21. pixeltable/env.py +736 -285
  22. pixeltable/exceptions.py +26 -2
  23. pixeltable/exec/__init__.py +7 -2
  24. pixeltable/exec/aggregation_node.py +39 -21
  25. pixeltable/exec/cache_prefetch_node.py +87 -109
  26. pixeltable/exec/cell_materialization_node.py +268 -0
  27. pixeltable/exec/cell_reconstruction_node.py +168 -0
  28. pixeltable/exec/component_iteration_node.py +25 -28
  29. pixeltable/exec/data_row_batch.py +11 -46
  30. pixeltable/exec/exec_context.py +26 -11
  31. pixeltable/exec/exec_node.py +35 -27
  32. pixeltable/exec/expr_eval/__init__.py +3 -0
  33. pixeltable/exec/expr_eval/evaluators.py +365 -0
  34. pixeltable/exec/expr_eval/expr_eval_node.py +413 -0
  35. pixeltable/exec/expr_eval/globals.py +200 -0
  36. pixeltable/exec/expr_eval/row_buffer.py +74 -0
  37. pixeltable/exec/expr_eval/schedulers.py +413 -0
  38. pixeltable/exec/globals.py +35 -0
  39. pixeltable/exec/in_memory_data_node.py +35 -27
  40. pixeltable/exec/object_store_save_node.py +293 -0
  41. pixeltable/exec/row_update_node.py +44 -29
  42. pixeltable/exec/sql_node.py +414 -115
  43. pixeltable/exprs/__init__.py +8 -5
  44. pixeltable/exprs/arithmetic_expr.py +79 -45
  45. pixeltable/exprs/array_slice.py +5 -5
  46. pixeltable/exprs/column_property_ref.py +40 -26
  47. pixeltable/exprs/column_ref.py +254 -61
  48. pixeltable/exprs/comparison.py +14 -9
  49. pixeltable/exprs/compound_predicate.py +9 -10
  50. pixeltable/exprs/data_row.py +213 -72
  51. pixeltable/exprs/expr.py +270 -104
  52. pixeltable/exprs/expr_dict.py +6 -5
  53. pixeltable/exprs/expr_set.py +20 -11
  54. pixeltable/exprs/function_call.py +383 -284
  55. pixeltable/exprs/globals.py +18 -5
  56. pixeltable/exprs/in_predicate.py +7 -7
  57. pixeltable/exprs/inline_expr.py +37 -37
  58. pixeltable/exprs/is_null.py +8 -4
  59. pixeltable/exprs/json_mapper.py +120 -54
  60. pixeltable/exprs/json_path.py +90 -60
  61. pixeltable/exprs/literal.py +61 -16
  62. pixeltable/exprs/method_ref.py +7 -6
  63. pixeltable/exprs/object_ref.py +19 -8
  64. pixeltable/exprs/row_builder.py +238 -75
  65. pixeltable/exprs/rowid_ref.py +53 -15
  66. pixeltable/exprs/similarity_expr.py +65 -50
  67. pixeltable/exprs/sql_element_cache.py +5 -5
  68. pixeltable/exprs/string_op.py +107 -0
  69. pixeltable/exprs/type_cast.py +25 -13
  70. pixeltable/exprs/variable.py +2 -2
  71. pixeltable/func/__init__.py +9 -5
  72. pixeltable/func/aggregate_function.py +197 -92
  73. pixeltable/func/callable_function.py +119 -35
  74. pixeltable/func/expr_template_function.py +101 -48
  75. pixeltable/func/function.py +375 -62
  76. pixeltable/func/function_registry.py +20 -19
  77. pixeltable/func/globals.py +6 -5
  78. pixeltable/func/mcp.py +74 -0
  79. pixeltable/func/query_template_function.py +151 -35
  80. pixeltable/func/signature.py +178 -49
  81. pixeltable/func/tools.py +164 -0
  82. pixeltable/func/udf.py +176 -53
  83. pixeltable/functions/__init__.py +44 -4
  84. pixeltable/functions/anthropic.py +226 -47
  85. pixeltable/functions/audio.py +148 -11
  86. pixeltable/functions/bedrock.py +137 -0
  87. pixeltable/functions/date.py +188 -0
  88. pixeltable/functions/deepseek.py +113 -0
  89. pixeltable/functions/document.py +81 -0
  90. pixeltable/functions/fal.py +76 -0
  91. pixeltable/functions/fireworks.py +72 -20
  92. pixeltable/functions/gemini.py +249 -0
  93. pixeltable/functions/globals.py +208 -53
  94. pixeltable/functions/groq.py +108 -0
  95. pixeltable/functions/huggingface.py +1088 -95
  96. pixeltable/functions/image.py +155 -84
  97. pixeltable/functions/json.py +8 -11
  98. pixeltable/functions/llama_cpp.py +31 -19
  99. pixeltable/functions/math.py +169 -0
  100. pixeltable/functions/mistralai.py +50 -75
  101. pixeltable/functions/net.py +70 -0
  102. pixeltable/functions/ollama.py +29 -36
  103. pixeltable/functions/openai.py +548 -160
  104. pixeltable/functions/openrouter.py +143 -0
  105. pixeltable/functions/replicate.py +15 -14
  106. pixeltable/functions/reve.py +250 -0
  107. pixeltable/functions/string.py +310 -85
  108. pixeltable/functions/timestamp.py +37 -19
  109. pixeltable/functions/together.py +77 -120
  110. pixeltable/functions/twelvelabs.py +188 -0
  111. pixeltable/functions/util.py +7 -2
  112. pixeltable/functions/uuid.py +30 -0
  113. pixeltable/functions/video.py +1528 -117
  114. pixeltable/functions/vision.py +26 -26
  115. pixeltable/functions/voyageai.py +289 -0
  116. pixeltable/functions/whisper.py +19 -10
  117. pixeltable/functions/whisperx.py +179 -0
  118. pixeltable/functions/yolox.py +112 -0
  119. pixeltable/globals.py +716 -236
  120. pixeltable/index/__init__.py +3 -1
  121. pixeltable/index/base.py +17 -21
  122. pixeltable/index/btree.py +32 -22
  123. pixeltable/index/embedding_index.py +155 -92
  124. pixeltable/io/__init__.py +12 -7
  125. pixeltable/io/datarows.py +140 -0
  126. pixeltable/io/external_store.py +83 -125
  127. pixeltable/io/fiftyone.py +24 -33
  128. pixeltable/io/globals.py +47 -182
  129. pixeltable/io/hf_datasets.py +96 -127
  130. pixeltable/io/label_studio.py +171 -156
  131. pixeltable/io/lancedb.py +3 -0
  132. pixeltable/io/pandas.py +136 -115
  133. pixeltable/io/parquet.py +40 -153
  134. pixeltable/io/table_data_conduit.py +702 -0
  135. pixeltable/io/utils.py +100 -0
  136. pixeltable/iterators/__init__.py +8 -4
  137. pixeltable/iterators/audio.py +207 -0
  138. pixeltable/iterators/base.py +9 -3
  139. pixeltable/iterators/document.py +144 -87
  140. pixeltable/iterators/image.py +17 -38
  141. pixeltable/iterators/string.py +15 -12
  142. pixeltable/iterators/video.py +523 -127
  143. pixeltable/metadata/__init__.py +33 -8
  144. pixeltable/metadata/converters/convert_10.py +2 -3
  145. pixeltable/metadata/converters/convert_13.py +2 -2
  146. pixeltable/metadata/converters/convert_15.py +15 -11
  147. pixeltable/metadata/converters/convert_16.py +4 -5
  148. pixeltable/metadata/converters/convert_17.py +4 -5
  149. pixeltable/metadata/converters/convert_18.py +4 -6
  150. pixeltable/metadata/converters/convert_19.py +6 -9
  151. pixeltable/metadata/converters/convert_20.py +3 -6
  152. pixeltable/metadata/converters/convert_21.py +6 -8
  153. pixeltable/metadata/converters/convert_22.py +3 -2
  154. pixeltable/metadata/converters/convert_23.py +33 -0
  155. pixeltable/metadata/converters/convert_24.py +55 -0
  156. pixeltable/metadata/converters/convert_25.py +19 -0
  157. pixeltable/metadata/converters/convert_26.py +23 -0
  158. pixeltable/metadata/converters/convert_27.py +29 -0
  159. pixeltable/metadata/converters/convert_28.py +13 -0
  160. pixeltable/metadata/converters/convert_29.py +110 -0
  161. pixeltable/metadata/converters/convert_30.py +63 -0
  162. pixeltable/metadata/converters/convert_31.py +11 -0
  163. pixeltable/metadata/converters/convert_32.py +15 -0
  164. pixeltable/metadata/converters/convert_33.py +17 -0
  165. pixeltable/metadata/converters/convert_34.py +21 -0
  166. pixeltable/metadata/converters/convert_35.py +9 -0
  167. pixeltable/metadata/converters/convert_36.py +38 -0
  168. pixeltable/metadata/converters/convert_37.py +15 -0
  169. pixeltable/metadata/converters/convert_38.py +39 -0
  170. pixeltable/metadata/converters/convert_39.py +124 -0
  171. pixeltable/metadata/converters/convert_40.py +73 -0
  172. pixeltable/metadata/converters/convert_41.py +12 -0
  173. pixeltable/metadata/converters/convert_42.py +9 -0
  174. pixeltable/metadata/converters/convert_43.py +44 -0
  175. pixeltable/metadata/converters/util.py +44 -18
  176. pixeltable/metadata/notes.py +21 -0
  177. pixeltable/metadata/schema.py +185 -42
  178. pixeltable/metadata/utils.py +74 -0
  179. pixeltable/mypy/__init__.py +3 -0
  180. pixeltable/mypy/mypy_plugin.py +123 -0
  181. pixeltable/plan.py +616 -225
  182. pixeltable/share/__init__.py +3 -0
  183. pixeltable/share/packager.py +797 -0
  184. pixeltable/share/protocol/__init__.py +33 -0
  185. pixeltable/share/protocol/common.py +165 -0
  186. pixeltable/share/protocol/operation_types.py +33 -0
  187. pixeltable/share/protocol/replica.py +119 -0
  188. pixeltable/share/publish.py +349 -0
  189. pixeltable/store.py +398 -232
  190. pixeltable/type_system.py +730 -267
  191. pixeltable/utils/__init__.py +40 -0
  192. pixeltable/utils/arrow.py +201 -29
  193. pixeltable/utils/av.py +298 -0
  194. pixeltable/utils/azure_store.py +346 -0
  195. pixeltable/utils/coco.py +26 -27
  196. pixeltable/utils/code.py +4 -4
  197. pixeltable/utils/console_output.py +46 -0
  198. pixeltable/utils/coroutine.py +24 -0
  199. pixeltable/utils/dbms.py +92 -0
  200. pixeltable/utils/description_helper.py +11 -12
  201. pixeltable/utils/documents.py +60 -61
  202. pixeltable/utils/exception_handler.py +36 -0
  203. pixeltable/utils/filecache.py +38 -22
  204. pixeltable/utils/formatter.py +88 -51
  205. pixeltable/utils/gcs_store.py +295 -0
  206. pixeltable/utils/http.py +133 -0
  207. pixeltable/utils/http_server.py +14 -13
  208. pixeltable/utils/iceberg.py +13 -0
  209. pixeltable/utils/image.py +17 -0
  210. pixeltable/utils/lancedb.py +90 -0
  211. pixeltable/utils/local_store.py +322 -0
  212. pixeltable/utils/misc.py +5 -0
  213. pixeltable/utils/object_stores.py +573 -0
  214. pixeltable/utils/pydantic.py +60 -0
  215. pixeltable/utils/pytorch.py +20 -20
  216. pixeltable/utils/s3_store.py +527 -0
  217. pixeltable/utils/sql.py +32 -5
  218. pixeltable/utils/system.py +30 -0
  219. pixeltable/utils/transactional_directory.py +4 -3
  220. pixeltable-0.5.7.dist-info/METADATA +579 -0
  221. pixeltable-0.5.7.dist-info/RECORD +227 -0
  222. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
  223. pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
  224. pixeltable/__version__.py +0 -3
  225. pixeltable/catalog/named_function.py +0 -36
  226. pixeltable/catalog/path_dict.py +0 -141
  227. pixeltable/dataframe.py +0 -894
  228. pixeltable/exec/expr_eval_node.py +0 -232
  229. pixeltable/ext/__init__.py +0 -14
  230. pixeltable/ext/functions/__init__.py +0 -8
  231. pixeltable/ext/functions/whisperx.py +0 -77
  232. pixeltable/ext/functions/yolox.py +0 -157
  233. pixeltable/tool/create_test_db_dump.py +0 -311
  234. pixeltable/tool/create_test_video.py +0 -81
  235. pixeltable/tool/doc_plugins/griffe.py +0 -50
  236. pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
  237. pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
  238. pixeltable/tool/embed_udf.py +0 -9
  239. pixeltable/tool/mypy_plugin.py +0 -55
  240. pixeltable/utils/media_store.py +0 -76
  241. pixeltable/utils/s3.py +0 -16
  242. pixeltable-0.2.26.dist-info/METADATA +0 -400
  243. pixeltable-0.2.26.dist-info/RECORD +0 -156
  244. pixeltable-0.2.26.dist-info/entry_points.txt +0 -3
  245. {pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
@@ -0,0 +1,573 @@
1
+ from __future__ import annotations
2
+
3
+ import enum
4
+ import os
5
+ import re
6
+ import urllib.parse
7
+ import urllib.request
8
+ import uuid
9
+ from pathlib import Path
10
+ from typing import TYPE_CHECKING, NamedTuple
11
+ from uuid import UUID
12
+
13
+ from pixeltable import env, exceptions as excs
14
+
15
+ if TYPE_CHECKING:
16
+ from pixeltable.catalog import Column
17
+
18
+
19
class StorageTarget(enum.Enum):
    """Kinds of storage that an object address can refer to.

    Each member's value is a short string tag; `str(member)` yields that tag.
    """

    # Local file system
    LOCAL_STORE = 'os'
    # Amazon S3
    S3_STORE = 's3'
    # Cloudflare R2
    R2_STORE = 'r2'
    # Backblaze B2
    B2_STORE = 'b2'
    # Tigris
    TIGRIS_STORE = 'tigris'
    # Google Cloud Storage
    GCS_STORE = 'gs'
    # Azure Blob Storage
    AZURE_STORE = 'az'
    # HTTP/HTTPS
    HTTP_STORE = 'http'

    def __str__(self) -> str:
        # Render the member as its bare tag rather than 'StorageTarget.NAME'.
        return self.value
33
+
34
+
35
class StorageObjectAddress(NamedTuple):
    """Parsed components of an object address.

    Components that do not apply to a particular address are left as empty strings.
    """

    # The kind of storage referenced. This is NOT the same as the scheme.
    storage_target: StorageTarget
    # The scheme parsed from the source
    scheme: str
    # Account number parsed from the source when applicable
    account: str = ''
    # Account extension parsed from the source when applicable
    account_extension: str = ''
    # Container / bucket name parsed from the source
    container: str = ''
    # Key parsed from the source (prefix + object_name)
    key: str = ''
    # Prefix (within the bucket) parsed from the source
    prefix: str = ''
    # Object name parsed from the source (if requested and applicable)
    object_name: str = ''
    # Local filesystem path; only asserted to be set for LOCAL_STORE addresses (see to_path)
    path: Path | None = None

    @property
    def has_object(self) -> bool:
        """True when the address carries a non-empty object name."""
        return bool(self.object_name)

    @property
    def is_http_readable(self) -> bool:
        """True when the scheme starts with 'http' and an object name is present."""
        if not self.scheme.startswith('http'):
            return False
        return self.has_object

    @property
    def is_azure_scheme(self) -> bool:
        """True when the scheme is one of the Azure-specific URI schemes."""
        return self.scheme in {'wasb', 'wasbs', 'abfs', 'abfss'}

    @property
    def has_valid_storage_target(self) -> bool:
        """True when `storage_target` is one of the recognized StorageTarget members."""
        recognized = (
            StorageTarget.LOCAL_STORE,
            StorageTarget.S3_STORE,
            StorageTarget.R2_STORE,
            StorageTarget.B2_STORE,
            StorageTarget.TIGRIS_STORE,
            StorageTarget.GCS_STORE,
            StorageTarget.AZURE_STORE,
            StorageTarget.HTTP_STORE,
        )
        return self.storage_target in recognized

    @property
    def prefix_free_uri(self) -> str:
        """Return the URI up to and including the container, without prefix or object name."""
        if self.is_azure_scheme:
            return f'{self.scheme}://{self.container}@{self.account}.{self.account_extension}/'
        if not self.account_extension:
            return f'{self.scheme}://{self.container}/'
        # The account is only rendered when both account and extension are present.
        host = f'{self.account}.{self.account_extension}' if self.account else self.account_extension
        return f'{self.scheme}://{host}/{self.container}/'

    @property
    def container_free_uri(self) -> str:
        """Return the URI up to the host part only, without container, prefix or object name.

        Must not be used with Azure schemes, where the container is part of the authority.
        """
        assert not self.is_azure_scheme, 'Azure storage requires a container name'
        if not self.account_extension:
            return f'{self.scheme}://'
        host = f'{self.account}.{self.account_extension}' if self.account else self.account_extension
        return f'{self.scheme}://{host}/'

    @property
    def to_path(self) -> Path:
        """Return the local path of a LOCAL_STORE address."""
        assert self.storage_target == StorageTarget.LOCAL_STORE
        assert self.path is not None
        return self.path

    def __str__(self) -> str:
        """A debug aid to override default str representation. Not to be used for any purpose."""
        return (
            f'{self.storage_target}..{self.scheme}://{self.account}.{self.account_extension}'
            f'/{self.container}/{self.prefix}{self.object_name}'
        )

    def __repr__(self) -> str:
        """A debug aid to override default repr representation. Not to be used for any purpose."""
        head = f'SObjectAddress(client: {self.storage_target!r}, s: {self.scheme!r}, a: {self.account!r}, '
        middle = f'ae: {self.account_extension!r}, c: {self.container!r}, '
        tail = f'p: {self.prefix!r}, o: {self.object_name!r})'
        return head + middle + tail
113
+
114
+
115
class ObjectPath:
    """Helpers for building object prefixes/filenames and for parsing storage addresses."""

    PATTERN = re.compile(r'([0-9a-fA-F]+)_(\d+)_(\d+)_([0-9a-fA-F]+)')  # tbl_id, col_id, version, uuid

    @classmethod
    def table_prefix(cls, tbl_id: UUID) -> str:
        """Construct a unique unix-style prefix for objects in a table (without leading/trailing slashes)."""
        assert isinstance(tbl_id, uuid.UUID)
        return tbl_id.hex

    @classmethod
    def create_prefix_raw(cls, tbl_id: UUID, col_id: int, tbl_version: int, ext: str | None = None) -> tuple[str, str]:
        """Construct a unique unix-style prefix and filename for a persisted file.

        The results are derived from table, col, and version specs plus a freshly generated UUID.

        Args:
            tbl_id: ID of the table the file belongs to
            col_id: ID of the column the file belongs to
            tbl_version: table version the file belongs to
            ext: optional suffix appended verbatim to the filename (nothing is appended if None or empty)

        Returns:
            prefix: a unix-style prefix for the file without leading/trailing slashes
            filename: a unique filename for the file without leading slashes
        """
        table_prefix = cls.table_prefix(tbl_id)
        id_hex = uuid.uuid4().hex
        # Two directory levels taken from the first 2 and first 4 hex digits of the random id.
        prefix = f'{table_prefix}/{id_hex[:2]}/{id_hex[:4]}'
        filename = f'{table_prefix}_{col_id}_{tbl_version}_{id_hex}{ext or ""}'
        return prefix, filename

    @classmethod
    def separate_prefix_object(cls, path_and_object: str, may_contain_object_name: bool) -> tuple[str, str]:
        """Split a key into a prefix and an object name.

        Args:
            path_and_object: the key to split
            may_contain_object_name: if False, the whole key is treated as a prefix

        Returns:
            prefix: '' or a prefix that ends with exactly one '/'
            object_name: the last path component, or '' when the key is a pure prefix
        """
        path = path_and_object
        object_name = ''
        if not may_contain_object_name or path.endswith('/'):
            prefix = path.rstrip('/')
        elif '/' in path:
            # If there are slashes in the path, separate into prefix and object
            prefix, object_name = path.rsplit('/', 1)
            prefix = prefix.rstrip('/')
        else:
            # If no slashes, the entire path is the object name
            prefix = ''
            object_name = path
        # prefix was stripped of trailing slashes above, so a non-empty prefix always needs one added back
        if prefix:
            prefix += '/'
        return prefix, object_name

    @classmethod
    def parse_object_storage_addr1(cls, src_addr: str) -> StorageObjectAddress:
        """
        Parses a storage address into its storage target, scheme, account, container, and key.

        The returned address has empty `prefix` and `object_name`; see `parse_object_storage_addr`.

        Args:
            src_addr: The address to parse (e.g., "gs://my-bucket/path/to/object.txt" or a local path).

        Returns:
            StorageObjectAddress: A NamedTuple containing components of the address.

        Raises:
            ValueError: if the scheme is not supported, or if the host part of an Azure or
                cloud-provider HTTP(S) address does not have the expected form.

        Formats:
            s3://container/<optional prefix>/<optional object>
            gs://container/<optional prefix>/<optional object>
            wasb[s]://container@account.blob.core.windows.net/<optional prefix>/<optional object>
            abfs[s]://container@account.dfs.core.windows.net/<optional prefix>/<optional object>
            https://account.blob.core.windows.net/container/<optional prefix>/<optional object>
            https://account.r2.cloudflarestorage.com/container/<optional prefix>/<optional object>
            https://raw.github.com/pixeltable/pixeltable/main/docs/resources/images/000000000030.jpg
        """
        parsed = urllib.parse.urlparse(src_addr)
        scheme = parsed.scheme.lower()
        account_name = ''
        account_extension = ''
        container = ''
        key = ''
        path = None

        # len(parsed.scheme) == 1 occurs for Windows drive letters like C:\
        if not parsed.scheme or len(parsed.scheme) == 1:
            # If no scheme, treat as local file path; this will be further validated before use
            storage_target = StorageTarget.LOCAL_STORE
            scheme = 'file'
            path = Path(src_addr)

        elif scheme == 'file':
            storage_target = StorageTarget.LOCAL_STORE
            pth = parsed.path
            if parsed.netloc:
                # This is a UNC path, ie, file://host/share/path/to/file
                pth = f'\\\\{parsed.netloc}{pth}'
            path = Path(urllib.parse.unquote(urllib.request.url2pathname(pth)))
            key = str(parsed.path).lstrip('/')

        elif scheme in ('s3', 'gs'):
            storage_target = StorageTarget.S3_STORE if scheme == 's3' else StorageTarget.GCS_STORE
            container = parsed.netloc
            key = parsed.path.lstrip('/')

        elif scheme in ('wasb', 'wasbs', 'abfs', 'abfss'):
            # Azure-specific URI schemes
            # wasb[s]://container@account.blob.core.windows.net/<optional prefix>/<optional object>
            # abfs[s]://container@account.dfs.core.windows.net/<optional prefix>/<optional object>
            container, at_sign, account_host = parsed.netloc.partition('@')
            account_name, dot, account_extension = account_host.partition('.')
            # Both the '@' and a dotted account host are required; report either omission as a
            # malformed URI instead of letting an IndexError escape.
            if not at_sign or not dot:
                raise ValueError(f'Invalid Azure URI format: {src_addr}')
            storage_target = StorageTarget.AZURE_STORE
            key = parsed.path.lstrip('/')

        elif scheme in ('http', 'https'):
            # Standard HTTP(S) URL format
            # https://account.blob.core.windows.net/container/<optional path>/<optional object>
            # https://account.r2.cloudflarestorage.com/container/<optional path>/<optional object>
            # https://s3.us-west-004.backblazeb2.com/container/<optional path>/<optional object>
            # https://t3.storage.dev/container/<optional path>/<optional object> (Tigris)
            # and possibly others
            host = parsed.netloc
            if 'cloudflare' in host:
                storage_target = StorageTarget.R2_STORE
            elif 'backblazeb2' in host:
                storage_target = StorageTarget.B2_STORE
            elif 'windows' in host:
                storage_target = StorageTarget.AZURE_STORE
            elif 't3.storage.dev' in host:
                storage_target = StorageTarget.TIGRIS_STORE
            else:
                storage_target = StorageTarget.HTTP_STORE

            if storage_target is StorageTarget.HTTP_STORE:
                # Plain HTTP: the whole host is kept as the account extension, the whole path is the key
                account_extension = host
                key = parsed.path.lstrip('/')
            else:
                # Provider endpoint: <account>.<extension>/<container>/<key>
                account_name, dot, account_extension = host.partition('.')
                if not dot:
                    raise ValueError(f'Invalid storage URL format: {src_addr}')
                container, _, key = parsed.path.lstrip('/').partition('/')
        else:
            raise ValueError(f'Unsupported URI scheme: {parsed.scheme}')

        r = StorageObjectAddress(storage_target, scheme, account_name, account_extension, container, key, '', '', path)
        assert r.has_valid_storage_target
        return r

    @classmethod
    def parse_object_storage_addr(cls, src_addr: str, allow_obj_name: bool) -> StorageObjectAddress:
        """
        Parses a storage address into its scheme, bucket, prefix, and object name.

        Args:
            src_addr: The address to parse (e.g., "gs://my-bucket/path/to/object.txt").
            allow_obj_name: If True, the last component of the key is treated as an object name
                (unless the key ends with '/'); if False, the whole key is treated as a prefix.

        Returns:
            StorageObjectAddress: A NamedTuple containing components of the address.

        Raises:
            ValueError: propagated from `parse_object_storage_addr1` for unsupported or malformed addresses.

        Formats:
            s3://container/<optional prefix>/<optional object>
            gs://container/<optional prefix>/<optional object>
            wasb[s]://container@account.blob.core.windows.net/<optional prefix>/<optional object>
            abfs[s]://container@account.dfs.core.windows.net/<optional prefix>/<optional object>
            https://account.blob.core.windows.net/container/<optional prefix>/<optional object>
            https://account.r2.cloudflarestorage.com/container/<optional prefix>/<optional object>
            https://raw.github.com/pixeltable/pixeltable/main/docs/resources/images/000000000030.jpg
        """
        soa = cls.parse_object_storage_addr1(src_addr)
        prefix, object_name = cls.separate_prefix_object(soa.key, allow_obj_name)
        assert not object_name.endswith('/')
        return soa._replace(prefix=prefix, object_name=object_name)
284
+
285
+
286
class ObjectStoreBase:
    """Base class defining the operations an object store may provide.

    Every method except `move_local_file` raises AssertionError here; `move_local_file` returns None.
    """

    def validate(self, error_prefix: str) -> str | None:
        """Check the store configuration. Returns base URI if store is accessible.

        Args:
            error_prefix: a string identifying the column, used when raising errors

        Returns:
            Base URI for the store. This value is stored in any Column attached to the store.
        """
        raise AssertionError

    def copy_local_file(self, col: Column, src_path: Path) -> str:
        """Copy a local file that belongs to a Column into the store.

        Args:
            col: The Column to which the file belongs, used to generate the URI of the stored object.
            src_path: The Path to the local file

        Returns:
            The URI of the object in the store
        """
        raise AssertionError

    def move_local_file(self, col: Column, src_path: Path) -> str | None:
        """Move a local file that belongs to a Column into the store.

        The base implementation moves nothing and reports that by returning None.

        Args:
            col: The Column to which the file belongs, used to generate the URI of the stored object.
            src_path: The Path to the local file

        Returns:
            The URI of the object in the store, None if the object cannot be moved to the store
        """
        return None

    def copy_object_to_local_file(self, src_path: str, dest_path: Path) -> None:
        """Copy an object from the store to a local media file.

        Args:
            src_path: The URI of the object in the store
            dest_path: The desired Path to the local file
        """
        raise AssertionError

    def count(self, tbl_id: UUID, tbl_version: int | None = None) -> int:
        """Return the number of objects in the store associated with the given tbl_id.

        Args:
            tbl_id: Only count objects associated with a given table
            tbl_version: Only count objects associated with a specific table version

        Returns:
            Number of objects found with the specified criteria
        """
        raise AssertionError

    def delete(self, tbl_id: UUID, tbl_version: int | None = None) -> int | None:
        """Delete objects in the destination for a given table ID, table version.

        Args:
            tbl_id: Only delete objects associated with a given table
            tbl_version: Only delete objects associated with a specific table version

        Returns:
            Number of objects deleted or None if the store does not count deletions.
        """
        raise AssertionError

    def list_objects(self, return_uri: bool, n_max: int = 10) -> list[str]:
        """Return a list of objects in the store.

        Args:
            return_uri: If True, returns a full URI for each object, otherwise just the path to the object.
            n_max: Maximum number of objects to list
        """
        raise AssertionError

    def create_presigned_url(self, soa: StorageObjectAddress, expiration_seconds: int) -> str:
        """Create a presigned URL for downloading an object from the store.

        Args:
            soa: StorageObjectAddress containing the object location
            expiration_seconds: Time in seconds for the URL to remain valid

        Returns:
            A presigned HTTP URL that can be used to access the object
        """
        raise AssertionError
375
+
376
+
377
class ObjectOps:
    """Entry points that resolve a destination to a concrete store and delegate to it."""

    @classmethod
    def get_store(
        cls, dest: str | StorageObjectAddress | None, allow_obj_name: bool, col_name: str | None = None
    ) -> ObjectStoreBase:
        """Return the store object responsible for `dest`.

        Args:
            dest: destination string or already-parsed address; a falsy value selects the local media dir
            allow_obj_name: passed to the address parser when `dest` is a string
            col_name: if given, used to prefix the error message

        Raises:
            excs.Error: if `dest` does not resolve to a supported destination
        """
        from pixeltable.env import Env
        from pixeltable.utils.local_store import LocalStore

        dest = dest or str(Env.get().media_dir)  # Use local media dir as fallback
        if isinstance(dest, StorageObjectAddress):
            soa = dest
        else:
            soa = ObjectPath.parse_object_storage_addr(dest, allow_obj_name=allow_obj_name)

        if soa.storage_target == StorageTarget.LOCAL_STORE:
            return LocalStore(soa)
        if soa.storage_target in (
            StorageTarget.S3_STORE,
            StorageTarget.R2_STORE,
            StorageTarget.B2_STORE,
            StorageTarget.TIGRIS_STORE,
        ):
            # These targets are all served by S3Store; the backend import is deferred until needed
            Env.get().require_package('boto3')
            from pixeltable.utils.s3_store import S3Store

            return S3Store(soa)
        if soa.storage_target == StorageTarget.GCS_STORE and soa.scheme == 'gs':
            Env.get().require_package('google.cloud.storage')
            from pixeltable.utils.gcs_store import GCSStore

            return GCSStore(soa)
        if soa.storage_target == StorageTarget.AZURE_STORE:
            Env.get().require_package('azure.storage.blob')
            from pixeltable.utils.azure_store import AzureBlobStore

            return AzureBlobStore(soa)
        if soa.storage_target == StorageTarget.HTTP_STORE and soa.is_http_readable:
            return HTTPStore(soa)
        error_col_name = f'Column {col_name!r}: ' if col_name is not None else ''
        raise excs.Error(
            f'{error_col_name}`destination` must be a valid reference to a supported destination, got {dest!r}'
        )

    @classmethod
    def validate_destination(cls, dest: str | Path | None, col_name: str | None = None) -> str:
        """Convert a Column destination parameter to a URI, else raise errors.

        Args:
            dest: The requested destination
            col_name: Used to raise error messages

        Returns:
            URI of destination, or raises an error

        Raises:
            excs.Error: if `dest` has the wrong type or is not a supported destination
        """
        error_col_str = f'column {col_name!r}' if col_name is not None else ''
        # Only emit the 'column ...: ' lead-in when there is a column to name, so that messages
        # without a column do not start with a dangling ': '.
        msg_prefix = f'{error_col_str}: ' if error_col_str else ''

        # General checks on any destination
        if isinstance(dest, Path):
            dest = str(dest)
        if dest is not None and not isinstance(dest, str):
            raise excs.Error(f'{msg_prefix}`destination` must be a string or path; got {dest!r}')

        # Specific checks for storage backends
        store = cls.get_store(dest, False, col_name)
        dest2 = store.validate(error_col_str)
        if dest2 is None:
            raise excs.Error(f'{msg_prefix}`destination` must be a supported destination; got {dest!r}')
        return dest2

    @classmethod
    def copy_object_to_local_file(cls, src_uri: str, dest_path: Path) -> None:
        """Copy an object from a URL to a local Path.

        Raises an exception if the download fails or the scheme is not supported.

        NOTE(review): the previous docstring described this as thread safe; nothing in this method
        establishes that on its own, so confirm against the store implementations.
        """
        soa = ObjectPath.parse_object_storage_addr(src_uri, allow_obj_name=True)
        store = cls.get_store(src_uri, True)
        store.copy_object_to_local_file(soa.object_name, dest_path)

    @classmethod
    def put_file(cls, col: Column, src_path: Path, relocate_or_delete: bool) -> str:
        """Move or copy a file to the destination, returning the file's URL within the destination.

        If relocate_or_delete is True and the file is in the TempStore, the file will be deleted after the operation.
        """
        from pixeltable.utils.local_store import TempStore

        if relocate_or_delete:
            # File is temporary, used only once, so we can delete it after copy if it can't be moved
            assert TempStore.contains_path(src_path)
        store = cls.get_store(col.destination, False, col.name)
        if relocate_or_delete:
            # Attempt to move; a store that cannot move returns None and we fall through to a copy
            moved_file_url = store.move_local_file(col, src_path)
            if moved_file_url is not None:
                return moved_file_url
        new_file_url = store.copy_local_file(col, src_path)
        if relocate_or_delete:
            TempStore.delete_media_file(src_path)
        return new_file_url

    @classmethod
    def move_local_file(cls, col: Column, src_path: Path) -> str | None:
        """Move a file to the destination specified by the Column.

        Returns:
            The file's URL within the destination, or None if the store cannot move the file.
        """
        store = cls.get_store(col.destination, False, col.name)
        return store.move_local_file(col, src_path)

    @classmethod
    def copy_local_file(cls, col: Column, src_path: Path) -> str:
        """Copy a file to the destination specified by the Column, returning the file's URL within the destination."""
        store = cls.get_store(col.destination, False, col.name)
        return store.copy_local_file(col, src_path)

    @classmethod
    def delete(cls, dest: str | None, tbl_id: UUID, tbl_version: int | None = None) -> int | None:
        """Delete objects in the destination for a given table ID, table version.

        Returns:
            Number of objects deleted or None
        """
        store = cls.get_store(dest, False)
        return store.delete(tbl_id, tbl_version)

    @classmethod
    def count(
        cls,
        tbl_id: UUID,
        tbl_version: int | None = None,
        dest: str | None = None,
        default_input_dest: bool = False,
        default_output_dest: bool = False,
    ) -> int:
        """
        Return the count of objects in the destination for a given table ID.

        At most one of dest, default_input, default_output may be specified. If none are specified, the fallback is the
        local media directory.

        Args:
            tbl_id: Table ID for which to count objects
            tbl_version: If specified, only counts objects for a specific table version
            dest: The destination to count objects in
            default_input_dest: If `True`, use the default input media destination
            default_output_dest: If `True`, use the default output media destination
        """
        assert sum((dest is not None, default_input_dest, default_output_dest)) <= 1, (
            'At most one of dest, default_input, default_output may be specified'
        )
        if default_input_dest:
            dest = env.Env.get().default_input_media_dest
        if default_output_dest:
            dest = env.Env.get().default_output_media_dest
        store = cls.get_store(dest, False)
        return store.count(tbl_id, tbl_version)

    @classmethod
    def list_objects(cls, dest: str | None, return_uri: bool, n_max: int = 10) -> list[str]:
        """Return a list of objects found in the specified destination bucket.

        The dest specification string must not contain an object name.
        Each returned object includes the full set of prefixes.
        If return_uri is True, full URIs are returned; otherwise, just the object keys.
        """
        store = cls.get_store(dest, False)
        return store.list_objects(return_uri, n_max)

    @classmethod
    def list_uris(cls, source_uri: str, n_max: int = 10) -> list[str]:
        """Return a list of URIs found within the specified uri"""
        return cls.list_objects(source_uri, True, n_max)
542
+
543
+
544
class HTTPStore(ObjectStoreBase):
    """Store for objects addressed by plain HTTP(S) URLs.

    Only downloading an object and building its URL are overridden here.
    """

    base_url: str  # '<scheme>://<host>/<prefix>', always ending with '/'

    def __init__(self, soa: StorageObjectAddress):
        self.base_url = f'{soa.scheme}://{soa.account_extension}/{soa.prefix}'
        if not self.base_url.endswith('/'):
            self.base_url += '/'

    def copy_object_to_local_file(self, src_path: str, dest_path: Path) -> None:
        """Download `base_url + src_path` into the local file `dest_path`.

        The response body is streamed to disk in chunks rather than being read into memory in one piece.

        Args:
            src_path: object name, appended to this store's base URL
            dest_path: the desired Path to the local file
        """
        import shutil

        with urllib.request.urlopen(self.base_url + src_path) as resp, open(dest_path, 'wb') as f:
            shutil.copyfileobj(resp, f)
            f.flush()  # Ensures Python buffers are written to OS
            os.fsync(f.fileno())  # Forces OS to write to physical storage

    def create_presigned_url(self, soa: StorageObjectAddress, expiration_seconds: int) -> str:
        """Create a presigned URL for HTTP storage (returns the HTTP URL as-is).

        Args:
            soa: StorageObjectAddress containing the object location
            expiration_seconds: Time in seconds for the URL to remain valid (ignored for HTTP)

        Returns:
            The HTTP URL as-is since it's already servable

        Raises:
            excs.Error: if `soa` does not contain an object name
        """
        if not soa.has_object:
            raise excs.Error(f'StorageObjectAddress does not contain an object name: {soa}')

        # Construct the full HTTP URL from the StorageObjectAddress
        return f'{soa.scheme}://{soa.account_extension}/{soa.key}'
@@ -0,0 +1,60 @@
1
+ import typing
2
+ from datetime import datetime
3
+ from enum import Enum
4
+ from types import UnionType
5
+ from typing import Any, Union
6
+
7
+ import pydantic
8
+
9
+
10
def is_json_convertible(model: type[pydantic.BaseModel]) -> bool:
    """
    Report whether instances of a Pydantic model can be converted to valid JSON.

    The decision is made purely from the model's type hints: the model qualifies only
    if every hint returned by `typing.get_type_hints()` is a JSON-compatible type.
    """
    for hint in typing.get_type_hints(model).values():
        if not _type_is_json_convertible(hint):
            # a single incompatible hint disqualifies the whole model
            return False
    return True
17
+
18
+
19
+ def _type_is_json_convertible(type_hint: Any) -> bool:
20
+ """
21
+ Recursively check if a type hint represents a JSON-compatible type.
22
+
23
+ TODO: also allow ndarrays and PIL.Image.Image, once we support those within json structures.
24
+ """
25
+ if type_hint is type(None):
26
+ return True
27
+ if type_hint is Any:
28
+ return False
29
+
30
+ if type_hint in (str, int, float, bool, datetime):
31
+ return True
32
+
33
+ if isinstance(type_hint, type) and issubclass(type_hint, Enum):
34
+ return all(isinstance(member.value, (str, int, float, bool, type(None))) for member in type_hint)
35
+
36
+ if isinstance(type_hint, type) and issubclass(type_hint, pydantic.BaseModel):
37
+ return is_json_convertible(type_hint)
38
+
39
+ origin = typing.get_origin(type_hint)
40
+ args = typing.get_args(type_hint)
41
+
42
+ if origin in (Union, UnionType):
43
+ return all(_type_is_json_convertible(arg) for arg in args)
44
+
45
+ if origin in (list, tuple):
46
+ return all(_type_is_json_convertible(arg) for arg in args) if len(args) > 0 else False
47
+
48
+ if origin is dict:
49
+ if len(args) != 2:
50
+ # we can't tell what this is
51
+ return False
52
+ key_type, value_type = args
53
+ # keys must be strings, values must be json-convertible
54
+ return key_type is str and _type_is_json_convertible(value_type)
55
+
56
+ # Literal types are json-convertible if their values are
57
+ if origin is typing.Literal:
58
+ return all(isinstance(val, (str, int, float, bool, type(None))) for val in args)
59
+
60
+ return False