pixeltable 0.4.0rc3__py3-none-any.whl → 0.4.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (202) hide show
  1. pixeltable/__init__.py +23 -5
  2. pixeltable/_version.py +1 -0
  3. pixeltable/catalog/__init__.py +5 -3
  4. pixeltable/catalog/catalog.py +1318 -404
  5. pixeltable/catalog/column.py +186 -115
  6. pixeltable/catalog/dir.py +1 -2
  7. pixeltable/catalog/globals.py +11 -43
  8. pixeltable/catalog/insertable_table.py +167 -79
  9. pixeltable/catalog/path.py +61 -23
  10. pixeltable/catalog/schema_object.py +9 -10
  11. pixeltable/catalog/table.py +626 -308
  12. pixeltable/catalog/table_metadata.py +101 -0
  13. pixeltable/catalog/table_version.py +713 -569
  14. pixeltable/catalog/table_version_handle.py +37 -6
  15. pixeltable/catalog/table_version_path.py +42 -29
  16. pixeltable/catalog/tbl_ops.py +50 -0
  17. pixeltable/catalog/update_status.py +191 -0
  18. pixeltable/catalog/view.py +108 -94
  19. pixeltable/config.py +128 -22
  20. pixeltable/dataframe.py +188 -100
  21. pixeltable/env.py +407 -136
  22. pixeltable/exceptions.py +6 -0
  23. pixeltable/exec/__init__.py +3 -0
  24. pixeltable/exec/aggregation_node.py +7 -8
  25. pixeltable/exec/cache_prefetch_node.py +83 -110
  26. pixeltable/exec/cell_materialization_node.py +231 -0
  27. pixeltable/exec/cell_reconstruction_node.py +135 -0
  28. pixeltable/exec/component_iteration_node.py +4 -3
  29. pixeltable/exec/data_row_batch.py +8 -65
  30. pixeltable/exec/exec_context.py +16 -4
  31. pixeltable/exec/exec_node.py +13 -36
  32. pixeltable/exec/expr_eval/evaluators.py +7 -6
  33. pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
  34. pixeltable/exec/expr_eval/globals.py +8 -5
  35. pixeltable/exec/expr_eval/row_buffer.py +1 -2
  36. pixeltable/exec/expr_eval/schedulers.py +190 -30
  37. pixeltable/exec/globals.py +32 -0
  38. pixeltable/exec/in_memory_data_node.py +18 -18
  39. pixeltable/exec/object_store_save_node.py +293 -0
  40. pixeltable/exec/row_update_node.py +16 -9
  41. pixeltable/exec/sql_node.py +206 -101
  42. pixeltable/exprs/__init__.py +1 -1
  43. pixeltable/exprs/arithmetic_expr.py +27 -22
  44. pixeltable/exprs/array_slice.py +3 -3
  45. pixeltable/exprs/column_property_ref.py +34 -30
  46. pixeltable/exprs/column_ref.py +92 -96
  47. pixeltable/exprs/comparison.py +5 -5
  48. pixeltable/exprs/compound_predicate.py +5 -4
  49. pixeltable/exprs/data_row.py +152 -55
  50. pixeltable/exprs/expr.py +62 -43
  51. pixeltable/exprs/expr_dict.py +3 -3
  52. pixeltable/exprs/expr_set.py +17 -10
  53. pixeltable/exprs/function_call.py +75 -37
  54. pixeltable/exprs/globals.py +1 -2
  55. pixeltable/exprs/in_predicate.py +4 -4
  56. pixeltable/exprs/inline_expr.py +10 -27
  57. pixeltable/exprs/is_null.py +1 -3
  58. pixeltable/exprs/json_mapper.py +8 -8
  59. pixeltable/exprs/json_path.py +56 -22
  60. pixeltable/exprs/literal.py +5 -5
  61. pixeltable/exprs/method_ref.py +2 -2
  62. pixeltable/exprs/object_ref.py +2 -2
  63. pixeltable/exprs/row_builder.py +127 -53
  64. pixeltable/exprs/rowid_ref.py +8 -12
  65. pixeltable/exprs/similarity_expr.py +50 -25
  66. pixeltable/exprs/sql_element_cache.py +4 -4
  67. pixeltable/exprs/string_op.py +5 -5
  68. pixeltable/exprs/type_cast.py +3 -5
  69. pixeltable/func/__init__.py +1 -0
  70. pixeltable/func/aggregate_function.py +8 -8
  71. pixeltable/func/callable_function.py +9 -9
  72. pixeltable/func/expr_template_function.py +10 -10
  73. pixeltable/func/function.py +18 -20
  74. pixeltable/func/function_registry.py +6 -7
  75. pixeltable/func/globals.py +2 -3
  76. pixeltable/func/mcp.py +74 -0
  77. pixeltable/func/query_template_function.py +20 -18
  78. pixeltable/func/signature.py +43 -16
  79. pixeltable/func/tools.py +23 -13
  80. pixeltable/func/udf.py +18 -20
  81. pixeltable/functions/__init__.py +6 -0
  82. pixeltable/functions/anthropic.py +93 -33
  83. pixeltable/functions/audio.py +114 -10
  84. pixeltable/functions/bedrock.py +13 -6
  85. pixeltable/functions/date.py +1 -1
  86. pixeltable/functions/deepseek.py +20 -9
  87. pixeltable/functions/fireworks.py +2 -2
  88. pixeltable/functions/gemini.py +28 -11
  89. pixeltable/functions/globals.py +13 -13
  90. pixeltable/functions/groq.py +108 -0
  91. pixeltable/functions/huggingface.py +1046 -23
  92. pixeltable/functions/image.py +9 -18
  93. pixeltable/functions/llama_cpp.py +23 -8
  94. pixeltable/functions/math.py +3 -4
  95. pixeltable/functions/mistralai.py +4 -15
  96. pixeltable/functions/ollama.py +16 -9
  97. pixeltable/functions/openai.py +104 -82
  98. pixeltable/functions/openrouter.py +143 -0
  99. pixeltable/functions/replicate.py +2 -2
  100. pixeltable/functions/reve.py +250 -0
  101. pixeltable/functions/string.py +21 -28
  102. pixeltable/functions/timestamp.py +13 -14
  103. pixeltable/functions/together.py +4 -6
  104. pixeltable/functions/twelvelabs.py +92 -0
  105. pixeltable/functions/util.py +6 -1
  106. pixeltable/functions/video.py +1388 -106
  107. pixeltable/functions/vision.py +7 -7
  108. pixeltable/functions/whisper.py +15 -7
  109. pixeltable/functions/whisperx.py +179 -0
  110. pixeltable/{ext/functions → functions}/yolox.py +2 -4
  111. pixeltable/globals.py +332 -105
  112. pixeltable/index/base.py +13 -22
  113. pixeltable/index/btree.py +23 -22
  114. pixeltable/index/embedding_index.py +32 -44
  115. pixeltable/io/__init__.py +4 -2
  116. pixeltable/io/datarows.py +7 -6
  117. pixeltable/io/external_store.py +49 -77
  118. pixeltable/io/fiftyone.py +11 -11
  119. pixeltable/io/globals.py +29 -28
  120. pixeltable/io/hf_datasets.py +17 -9
  121. pixeltable/io/label_studio.py +70 -66
  122. pixeltable/io/lancedb.py +3 -0
  123. pixeltable/io/pandas.py +12 -11
  124. pixeltable/io/parquet.py +13 -93
  125. pixeltable/io/table_data_conduit.py +71 -47
  126. pixeltable/io/utils.py +3 -3
  127. pixeltable/iterators/__init__.py +2 -1
  128. pixeltable/iterators/audio.py +21 -11
  129. pixeltable/iterators/document.py +116 -55
  130. pixeltable/iterators/image.py +5 -2
  131. pixeltable/iterators/video.py +293 -13
  132. pixeltable/metadata/__init__.py +4 -2
  133. pixeltable/metadata/converters/convert_18.py +2 -2
  134. pixeltable/metadata/converters/convert_19.py +2 -2
  135. pixeltable/metadata/converters/convert_20.py +2 -2
  136. pixeltable/metadata/converters/convert_21.py +2 -2
  137. pixeltable/metadata/converters/convert_22.py +2 -2
  138. pixeltable/metadata/converters/convert_24.py +2 -2
  139. pixeltable/metadata/converters/convert_25.py +2 -2
  140. pixeltable/metadata/converters/convert_26.py +2 -2
  141. pixeltable/metadata/converters/convert_29.py +4 -4
  142. pixeltable/metadata/converters/convert_34.py +2 -2
  143. pixeltable/metadata/converters/convert_36.py +2 -2
  144. pixeltable/metadata/converters/convert_37.py +15 -0
  145. pixeltable/metadata/converters/convert_38.py +39 -0
  146. pixeltable/metadata/converters/convert_39.py +124 -0
  147. pixeltable/metadata/converters/convert_40.py +73 -0
  148. pixeltable/metadata/converters/util.py +13 -12
  149. pixeltable/metadata/notes.py +4 -0
  150. pixeltable/metadata/schema.py +79 -42
  151. pixeltable/metadata/utils.py +74 -0
  152. pixeltable/mypy/__init__.py +3 -0
  153. pixeltable/mypy/mypy_plugin.py +123 -0
  154. pixeltable/plan.py +274 -223
  155. pixeltable/share/__init__.py +1 -1
  156. pixeltable/share/packager.py +259 -129
  157. pixeltable/share/protocol/__init__.py +34 -0
  158. pixeltable/share/protocol/common.py +170 -0
  159. pixeltable/share/protocol/operation_types.py +33 -0
  160. pixeltable/share/protocol/replica.py +109 -0
  161. pixeltable/share/publish.py +213 -57
  162. pixeltable/store.py +238 -175
  163. pixeltable/type_system.py +104 -63
  164. pixeltable/utils/__init__.py +2 -3
  165. pixeltable/utils/arrow.py +108 -13
  166. pixeltable/utils/av.py +298 -0
  167. pixeltable/utils/azure_store.py +305 -0
  168. pixeltable/utils/code.py +3 -3
  169. pixeltable/utils/console_output.py +4 -1
  170. pixeltable/utils/coroutine.py +6 -23
  171. pixeltable/utils/dbms.py +31 -5
  172. pixeltable/utils/description_helper.py +4 -5
  173. pixeltable/utils/documents.py +5 -6
  174. pixeltable/utils/exception_handler.py +7 -30
  175. pixeltable/utils/filecache.py +6 -6
  176. pixeltable/utils/formatter.py +4 -6
  177. pixeltable/utils/gcs_store.py +283 -0
  178. pixeltable/utils/http_server.py +2 -3
  179. pixeltable/utils/iceberg.py +1 -2
  180. pixeltable/utils/image.py +17 -0
  181. pixeltable/utils/lancedb.py +88 -0
  182. pixeltable/utils/local_store.py +316 -0
  183. pixeltable/utils/misc.py +5 -0
  184. pixeltable/utils/object_stores.py +528 -0
  185. pixeltable/utils/pydantic.py +60 -0
  186. pixeltable/utils/pytorch.py +5 -6
  187. pixeltable/utils/s3_store.py +392 -0
  188. pixeltable-0.4.20.dist-info/METADATA +587 -0
  189. pixeltable-0.4.20.dist-info/RECORD +218 -0
  190. {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info}/WHEEL +1 -1
  191. pixeltable-0.4.20.dist-info/entry_points.txt +2 -0
  192. pixeltable/__version__.py +0 -3
  193. pixeltable/ext/__init__.py +0 -17
  194. pixeltable/ext/functions/__init__.py +0 -11
  195. pixeltable/ext/functions/whisperx.py +0 -77
  196. pixeltable/utils/media_store.py +0 -77
  197. pixeltable/utils/s3.py +0 -17
  198. pixeltable/utils/sample.py +0 -25
  199. pixeltable-0.4.0rc3.dist-info/METADATA +0 -435
  200. pixeltable-0.4.0rc3.dist-info/RECORD +0 -189
  201. pixeltable-0.4.0rc3.dist-info/entry_points.txt +0 -3
  202. {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info/licenses}/LICENSE +0 -0
@@ -0,0 +1,528 @@
1
+ from __future__ import annotations
2
+
3
+ import enum
4
+ import os
5
+ import re
6
+ import urllib.parse
7
+ import urllib.request
8
+ import uuid
9
+ from pathlib import Path
10
+ from typing import TYPE_CHECKING, NamedTuple
11
+ from uuid import UUID
12
+
13
+ from pixeltable import env, exceptions as excs
14
+
15
+ if TYPE_CHECKING:
16
+ from pixeltable.catalog import Column
17
+
18
+
19
class StorageTarget(enum.Enum):
    """Kinds of storage backends that an object address can refer to.

    Each member's value is a short string tag; `str(member)` renders as that tag.
    """

    LOCAL_STORE = 'os'  # Local file system
    S3_STORE = 's3'  # Amazon S3
    R2_STORE = 'r2'  # Cloudflare R2
    B2_STORE = 'b2'  # Backblaze B2
    GCS_STORE = 'gs'  # Google Cloud Storage
    AZURE_STORE = 'az'  # Azure Blob Storage
    HTTP_STORE = 'http'  # HTTP/HTTPS

    def __str__(self) -> str:
        # Render as the bare tag instead of the default 'StorageTarget.NAME'
        tag: str = self.value
        return tag
32
+
33
+
34
class StorageObjectAddress(NamedTuple):
    """Components of a parsed object address.

    Components that do not apply to a given address are left as empty strings
    (or None for `path`).
    """

    storage_target: StorageTarget  # The kind of storage referenced. This is NOT the same as the scheme.
    scheme: str  # The scheme parsed from the source
    account: str = ''  # Account number parsed from the source when applicable
    account_extension: str = ''  # Account extension parsed from the source when applicable
    container: str = ''  # Container / bucket name parsed from the source
    key: str = ''  # Key parsed from the source (prefix + object_name)
    prefix: str = ''  # Prefix (within the bucket) parsed from the source
    object_name: str = ''  # Object name parsed from the source (if requested and applicable)
    path: Path | None = None  # Local filesystem path, when one was parsed

    @property
    def has_object(self) -> bool:
        """True when a non-empty object name was parsed."""
        return len(self.object_name) != 0

    @property
    def is_http_readable(self) -> bool:
        """True when the scheme starts with 'http' and an object name is present."""
        if not self.scheme.startswith('http'):
            return False
        return self.has_object

    @property
    def is_azure_scheme(self) -> bool:
        """True for the Azure-specific schemes wasb, wasbs, abfs and abfss."""
        azure_schemes = ('wasb', 'wasbs', 'abfs', 'abfss')
        return self.scheme in azure_schemes

    @property
    def has_valid_storage_target(self) -> bool:
        """True when `storage_target` is one of the known StorageTarget members."""
        known_targets = (
            StorageTarget.LOCAL_STORE,
            StorageTarget.S3_STORE,
            StorageTarget.R2_STORE,
            StorageTarget.B2_STORE,
            StorageTarget.GCS_STORE,
            StorageTarget.AZURE_STORE,
            StorageTarget.HTTP_STORE,
        )
        return self.storage_target in known_targets

    @property
    def prefix_free_uri(self) -> str:
        """Return the URI up to and including the container, without any prefix or object name."""
        if self.is_azure_scheme:
            # Azure schemes carry the container in front of the host: scheme://container@account.ext/
            return f'{self.scheme}://{self.container}@{self.account}.{self.account_extension}/'
        if not self.account_extension:
            # No host part at all: the container directly follows the scheme
            return f'{self.scheme}://{self.container}/'
        if self.account:
            return f'{self.scheme}://{self.account}.{self.account_extension}/{self.container}/'
        return f'{self.scheme}://{self.account_extension}/{self.container}/'

    @property
    def container_free_uri(self) -> str:
        """Return the URI up to the host part, without container, prefix or object name.

        Must not be used with Azure schemes (asserted), since those embed the container
        in the host part.
        """
        assert not self.is_azure_scheme, 'Azure storage requires a container name'
        if not self.account_extension:
            return f'{self.scheme}://'
        if self.account:
            return f'{self.scheme}://{self.account}.{self.account_extension}/'
        return f'{self.scheme}://{self.account_extension}/'

    @property
    def to_path(self) -> Path:
        """Return the local filesystem path; only valid for LOCAL_STORE addresses with a path set."""
        assert self.storage_target == StorageTarget.LOCAL_STORE
        assert self.path is not None
        return self.path

    def __str__(self) -> str:
        """A debug aid to override default str representation. Not to be used for any purpose."""
        location = f'{self.scheme}://{self.account}.{self.account_extension}/{self.container}/'
        return f'{self.storage_target}..' + location + f'{self.prefix}{self.object_name}'

    def __repr__(self) -> str:
        """A debug aid to override default repr representation. Not to be used for any purpose."""
        fields = (
            f'client: {self.storage_target!r}',
            f's: {self.scheme!r}',
            f'a: {self.account!r}',
            f'ae: {self.account_extension!r}',
            f'c: {self.container!r}',
            f'p: {self.prefix!r}',
            f'o: {self.object_name!r}',
        )
        return 'SObjectAddress(' + ', '.join(fields) + ')'
111
+
112
+
113
class ObjectPath:
    """Helpers for building object prefixes/filenames and for parsing storage addresses."""

    PATTERN = re.compile(r'([0-9a-fA-F]+)_(\d+)_(\d+)_([0-9a-fA-F]+)')  # tbl_id, col_id, version, uuid

    @classmethod
    def table_prefix(cls, tbl_id: UUID) -> str:
        """Construct a unique unix-style prefix for objects in a table (without leading/trailing slashes)."""
        assert isinstance(tbl_id, uuid.UUID)
        return tbl_id.hex

    @classmethod
    def create_prefix_raw(cls, tbl_id: UUID, col_id: int, tbl_version: int, ext: str | None = None) -> tuple[str, str]:
        """Construct a unique unix-style prefix and filename for a persisted file.

        The results are derived from table, col, and version specs plus a freshly generated UUID.

        Args:
            tbl_id: id of the table the file belongs to
            col_id: id of the column the file belongs to
            tbl_version: table version recorded in the filename
            ext: optional suffix appended verbatim to the filename

        Returns:
            prefix: a unix-style prefix for the file without leading/trailing slashes
            filename: a unique filename for the file without leading slashes
        """
        table_prefix = cls.table_prefix(tbl_id)
        id_hex = uuid.uuid4().hex
        prefix = f'{table_prefix}/{id_hex[:2]}/{id_hex[:4]}'
        filename = f'{table_prefix}_{col_id}_{tbl_version}_{id_hex}{ext or ""}'
        return prefix, filename

    @classmethod
    def separate_prefix_object(cls, path_and_object: str, may_contain_object_name: bool) -> tuple[str, str]:
        """Split a key into a prefix and an object name.

        Args:
            path_and_object: the key to split
            may_contain_object_name: if False, the whole key is treated as a prefix

        Returns:
            prefix: empty, or a prefix ending in exactly one '/'
            object_name: the last path segment, or '' when the key is a pure prefix
        """
        object_name = ''
        if not may_contain_object_name or path_and_object.endswith('/'):
            prefix = path_and_object.rstrip('/')
        elif '/' in path_and_object:
            # If there are slashes in the path, separate into prefix and object
            prefix, object_name = path_and_object.rsplit('/', 1)
            prefix = prefix.rstrip('/')
        else:
            # If no slashes, the entire path is the object name
            prefix = ''
            object_name = path_and_object
        # prefix never ends in '/' at this point; normalize non-empty prefixes to end in one
        if prefix:
            prefix += '/'
        return prefix, object_name

    @classmethod
    def parse_object_storage_addr1(cls, src_addr: str) -> StorageObjectAddress:
        """
        Parses a storage address into its scheme, account, container, and key.

        The returned address has empty `prefix` and `object_name`; see `parse_object_storage_addr`
        for the variant that splits the key.

        Args:
            src_addr (str): The address to parse (e.g., "gs://my-bucket/path/to/object.txt"),
                or a local file path.

        Returns:
            StorageObjectAddress: A NamedTuple containing components of the address.

        Raises:
            ValueError: if the scheme is not supported, or an Azure-scheme URI is not of the form
                container@account.extension

        Formats:
            s3://container/<optional prefix>/<optional object>
            gs://container/<optional prefix>/<optional object>
            wasb[s]://container@account.blob.core.windows.net/<optional prefix>/<optional object>
            abfs[s]://container@account.dfs.core.windows.net/<optional prefix>/<optional object>
            https://account.blob.core.windows.net/container/<optional prefix>/<optional object>
            https://account.r2.cloudflarestorage.com/container/<optional prefix>/<optional object>
            https://raw.github.com/pixeltable/pixeltable/main/docs/resources/images/000000000030.jpg
        """
        parsed = urllib.parse.urlparse(src_addr)
        scheme = parsed.scheme.lower()
        account_name = ''
        account_extension = ''
        container = ''
        key = ''
        path = None

        # len(parsed.scheme) == 1 occurs for Windows drive letters like C:\
        if not parsed.scheme or len(parsed.scheme) == 1:
            # If no scheme, treat as local file path; this will be further validated before use
            storage_target = StorageTarget.LOCAL_STORE
            scheme = 'file'
            path = Path(src_addr)

        elif scheme == 'file':
            storage_target = StorageTarget.LOCAL_STORE
            pth = parsed.path
            if parsed.netloc:
                # This is a UNC path, ie, file://host/share/path/to/file
                pth = f'\\\\{parsed.netloc}{pth}'
            path = Path(urllib.parse.unquote(urllib.request.url2pathname(pth)))
            key = str(parsed.path).lstrip('/')

        elif scheme in ('s3', 'gs'):
            storage_target = StorageTarget.S3_STORE if scheme == 's3' else StorageTarget.GCS_STORE
            container = parsed.netloc
            key = parsed.path.lstrip('/')

        elif scheme in ('wasb', 'wasbs', 'abfs', 'abfss'):
            # Azure-specific URI schemes
            # wasb[s]://container@account.blob.core.windows.net/<optional prefix>/<optional object>
            # abfs[s]://container@account.dfs.core.windows.net/<optional prefix>/<optional object>
            container, at_sign, account_host = parsed.netloc.partition('@')
            account_name, dot, account_extension = account_host.partition('.')
            # Both the '@' and an 'account.extension' host are required; report either omission
            # as a ValueError rather than letting an IndexError escape
            if not at_sign or not dot:
                raise ValueError(f'Invalid Azure URI format: {src_addr}')
            storage_target = StorageTarget.AZURE_STORE
            key = parsed.path.lstrip('/')

        elif scheme in ('http', 'https'):
            # Standard HTTP(S) URL format
            # https://account.blob.core.windows.net/container/<optional path>/<optional object>
            # https://account.r2.cloudflarestorage.com/container/<optional path>/<optional object>
            # https://s3.us-west-004.backblazeb2.com/container/<optional path>/<optional object>
            # and possibly others
            netloc = parsed.netloc
            host_head, dot, host_tail = netloc.partition('.')
            if 'cloudflare' in netloc:
                storage_target = StorageTarget.R2_STORE
            elif 'backblazeb2' in netloc:
                storage_target = StorageTarget.B2_STORE
            elif 'windows' in netloc:
                storage_target = StorageTarget.AZURE_STORE
            else:
                storage_target = StorageTarget.HTTP_STORE
            if not dot:
                # A host without a dot cannot be split into account + extension, so it is
                # handled as a plain HTTP address even if it happens to contain a provider keyword
                storage_target = StorageTarget.HTTP_STORE
            if storage_target is StorageTarget.HTTP_STORE:
                account_extension = netloc
                key = parsed.path.lstrip('/')
            else:
                # Provider-hosted URL: first host label is the account, first path segment the container
                account_name, account_extension = host_head, host_tail
                container, _, key = parsed.path.lstrip('/').partition('/')
        else:
            raise ValueError(f'Unsupported URI scheme: {parsed.scheme}')

        r = StorageObjectAddress(storage_target, scheme, account_name, account_extension, container, key, '', '', path)
        assert r.has_valid_storage_target
        return r

    @classmethod
    def parse_object_storage_addr(cls, src_addr: str, allow_obj_name: bool) -> StorageObjectAddress:
        """
        Parses a storage address into its scheme, bucket, prefix, and object name.

        Args:
            src_addr (str): The address to parse (e.g., "gs://my-bucket/path/to/object.txt").
            allow_obj_name (bool): If True, the last segment of the key may be returned as the
                object name; if False, the whole key is treated as a prefix.

        Returns:
            StorageObjectAddress: A NamedTuple containing components of the address.

        Formats:
            s3://container/<optional prefix>/<optional object>
            gs://container/<optional prefix>/<optional object>
            wasb[s]://container@account.blob.core.windows.net/<optional prefix>/<optional object>
            abfs[s]://container@account.dfs.core.windows.net/<optional prefix>/<optional object>
            https://account.blob.core.windows.net/container/<optional prefix>/<optional object>
            https://account.r2.cloudflarestorage.com/container/<optional prefix>/<optional object>
            https://raw.github.com/pixeltable/pixeltable/main/docs/resources/images/000000000030.jpg
        """
        soa = cls.parse_object_storage_addr1(src_addr)
        prefix, object_name = cls.separate_prefix_object(soa.key, allow_obj_name)
        assert not object_name.endswith('/')
        return soa._replace(prefix=prefix, object_name=object_name)
278
+
279
+
280
class ObjectStoreBase:
    """Base interface for object stores.

    Every operation except `move_local_file` raises AssertionError here; `move_local_file`
    returns None.
    """

    def validate(self, error_prefix: str) -> str | None:
        """Check the store configuration. Returns base URI if store is accessible.

        Args:
            error_prefix: a string identifying the column, used when raising errors

        Returns:
            Base URI for the store. This value is stored in any Column attached to the store.
        """
        raise AssertionError

    def copy_local_file(self, col: Column, src_path: Path) -> str:
        """Copy a file associated with a Column to the store, returning the file's URL within the destination.

        Args:
            col: The Column to which the file belongs, used to generate the URI of the stored object.
            src_path: The Path to the local file

        Returns:
            The URI of the object in the store
        """
        raise AssertionError

    def move_local_file(self, col: Column, src_path: Path) -> str | None:
        """Move a file associated with a Column to the store, returning the file's URL within the destination.

        The base implementation moves nothing and returns None.

        Args:
            col: The Column to which the file belongs, used to generate the URI of the stored object.
            src_path: The Path to the local file

        Returns:
            The URI of the object in the store, None if the object cannot be moved to the store
        """
        return None

    def copy_object_to_local_file(self, src_path: str, dest_path: Path) -> None:
        """Copies an object from the store to a local media file.

        Args:
            src_path: The URI of the object in the store
            dest_path: The desired Path to the local file
        """
        raise AssertionError

    def count(self, tbl_id: UUID, tbl_version: int | None = None) -> int:
        """Return the number of objects in the store associated with the given tbl_id

        Args:
            tbl_id: Only count objects associated with a given table
            tbl_version: Only count objects associated with a specific table version

        Returns:
            Number of objects found with the specified criteria
        """
        raise AssertionError

    def delete(self, tbl_id: UUID, tbl_version: int | None = None) -> int | None:
        """Delete objects in the destination for a given table ID, table version.

        Args:
            tbl_id: Only delete objects associated with a given table
            tbl_version: Only delete objects associated with a specific table version

        Returns:
            Number of objects deleted or None if the store does not count deletions.
        """
        raise AssertionError

    def list_objects(self, return_uri: bool, n_max: int = 10) -> list[str]:
        """Return a list of objects in the store.

        Args:
            return_uri: If True, returns a full URI for each object, otherwise just the path to the object.
            n_max: Maximum number of objects to list
        """
        raise AssertionError
357
+
358
+
359
class ObjectOps:
    """Class-level entry points that resolve a destination to a store object and delegate to it."""

    @classmethod
    def get_store(cls, dest: str | None, allow_obj_name: bool, col_name: str | None = None) -> ObjectStoreBase:
        """Return the store object that handles `dest`.

        Args:
            dest: destination address; when None or empty, the Env's media_dir is used
            allow_obj_name: passed to the address parser; whether `dest` may end in an object name
            col_name: if given, used to prefix the error message

        Returns:
            A store instance matching the parsed storage target.

        Raises:
            excs.Error: if no store handles the parsed address
        """
        from pixeltable.env import Env
        from pixeltable.utils.local_store import LocalStore

        dest = dest or str(Env.get().media_dir)  # Use local media dir as fallback
        soa = ObjectPath.parse_object_storage_addr(dest, allow_obj_name=allow_obj_name)
        if soa.storage_target == StorageTarget.LOCAL_STORE:
            return LocalStore(soa)
        if soa.storage_target in (StorageTarget.S3_STORE, StorageTarget.R2_STORE, StorageTarget.B2_STORE):
            # S3, R2 and B2 addresses are all handled by S3Store
            Env.get().require_package('boto3')
            from pixeltable.utils.s3_store import S3Store

            return S3Store(soa)
        if soa.storage_target == StorageTarget.GCS_STORE and soa.scheme == 'gs':
            Env.get().require_package('google.cloud.storage')
            from pixeltable.utils.gcs_store import GCSStore

            return GCSStore(soa)
        if soa.storage_target == StorageTarget.AZURE_STORE:
            Env.get().require_package('azure.storage.blob')
            from pixeltable.utils.azure_store import AzureBlobStore

            return AzureBlobStore(soa)
        if soa.storage_target == StorageTarget.HTTP_STORE and soa.is_http_readable:
            return HTTPStore(soa)
        error_col_name = f'Column {col_name!r}: ' if col_name is not None else ''
        raise excs.Error(
            f'{error_col_name}`destination` must be a valid reference to a supported destination, got {dest!r}'
        )

    @classmethod
    def validate_destination(cls, dest: str | Path | None, col_name: str | None = None) -> str:
        """Convert a Column destination parameter to a URI, else raise errors.

        Args:
            dest: The requested destination
            col_name: Used to raise error messages

        Returns:
            URI of destination, or raises an error

        Raises:
            excs.Error: if `dest` is neither a string, a Path, nor None, or if the store's
                validation returns None
        """
        error_col_str = f'column {col_name!r}' if col_name is not None else ''
        # Only emit the ': ' separator when there is a column name in front of it
        msg_prefix = f'{error_col_str}: ' if error_col_str else ''

        # General checks on any destination
        if isinstance(dest, Path):
            dest = str(dest)
        if dest is not None and not isinstance(dest, str):
            raise excs.Error(f'{msg_prefix}`destination` must be a string or path; got {dest!r}')

        # Specific checks for storage backends
        store = cls.get_store(dest, False, col_name)
        dest2 = store.validate(error_col_str)
        if dest2 is None:
            raise excs.Error(f'{msg_prefix}`destination` must be a supported destination; got {dest!r}')
        return dest2

    @classmethod
    def copy_object_to_local_file(cls, src_uri: str, dest_path: Path) -> None:
        """Copy an object from a URL to a local Path. Thread safe.
        Raises an exception if the download fails or the scheme is not supported
        """
        soa = ObjectPath.parse_object_storage_addr(src_uri, allow_obj_name=True)
        store = cls.get_store(src_uri, True)
        store.copy_object_to_local_file(soa.object_name, dest_path)

    @classmethod
    def put_file(cls, col: Column, src_path: Path, relocate_or_delete: bool) -> str:
        """Move or copy a file to the destination, returning the file's URL within the destination.
        If relocate_or_delete is True and the file is in the TempStore, the file will be deleted after the operation.
        """
        from pixeltable.utils.local_store import TempStore

        if relocate_or_delete:
            # File is temporary, used only once, so we can delete it after copy if it can't be moved
            assert TempStore.contains_path(src_path)
        store = cls.get_store(col.destination, False, col.name)
        # Attempt to move
        if relocate_or_delete:
            moved_file_url = store.move_local_file(col, src_path)
            if moved_file_url is not None:
                return moved_file_url
        # The move was not requested or not possible: copy instead
        new_file_url = store.copy_local_file(col, src_path)
        if relocate_or_delete:
            TempStore.delete_media_file(src_path)
        return new_file_url

    @classmethod
    def move_local_file(cls, col: Column, src_path: Path) -> str | None:
        """Move a file to the destination specified by the Column, returning the file's URL within the destination.

        Returns whatever the store's `move_local_file` returns, which is None when the store
        cannot move the file.
        """
        store = cls.get_store(col.destination, False, col.name)
        return store.move_local_file(col, src_path)

    @classmethod
    def copy_local_file(cls, col: Column, src_path: Path) -> str:
        """Copy a file to the destination specified by the Column, returning the file's URL within the destination."""
        store = cls.get_store(col.destination, False, col.name)
        return store.copy_local_file(col, src_path)

    @classmethod
    def delete(cls, dest: str | None, tbl_id: UUID, tbl_version: int | None = None) -> int | None:
        """Delete objects in the destination for a given table ID, table version.

        Returns:
            Number of objects deleted or None
        """
        store = cls.get_store(dest, False)
        return store.delete(tbl_id, tbl_version)

    @classmethod
    def count(
        cls,
        tbl_id: UUID,
        tbl_version: int | None = None,
        dest: str | None = None,
        default_input_dest: bool = False,
        default_output_dest: bool = False,
    ) -> int:
        """
        Return the count of objects in the destination for a given table ID.

        At most one of dest, default_input, default_output may be specified. If none are specified, the fallback is the
        local media directory.

        Args:
            tbl_id: Table ID for which to count objects
            tbl_version: If specified, only counts objects for a specific table version
            dest: The destination to count objects in
            default_input_dest: If `True`, use the default input media destination
            default_output_dest: If `True`, use the default output media destination
        """
        assert sum((dest is not None, default_input_dest, default_output_dest)) <= 1, (
            'At most one of dest, default_input, default_output may be specified'
        )
        if default_input_dest:
            dest = env.Env.get().default_input_media_dest
        if default_output_dest:
            dest = env.Env.get().default_output_media_dest
        store = cls.get_store(dest, False)
        return store.count(tbl_id, tbl_version)

    @classmethod
    def list_objects(cls, dest: str | None, return_uri: bool, n_max: int = 10) -> list[str]:
        """Return a list of objects found in the specified destination bucket.
        The dest specification string must not contain an object name.
        Each returned object includes the full set of prefixes.
        if return_uri is True, full URI's are returned; otherwise, just the object keys.
        """
        store = cls.get_store(dest, False)
        return store.list_objects(return_uri, n_max)

    @classmethod
    def list_uris(cls, source_uri: str, n_max: int = 10) -> list[str]:
        """Return a list of URIs found within the specified uri"""
        return cls.list_objects(source_uri, True, n_max)
513
+
514
+
515
class HTTPStore(ObjectStoreBase):
    """Read-only store that downloads objects over HTTP(S).

    Only `copy_object_to_local_file` is overridden here.
    """

    base_url: str  # '<scheme>://<host>/<prefix>', always ending in '/'

    def __init__(self, soa: StorageObjectAddress):
        """Build the base URL from the address's scheme, account_extension and prefix."""
        self.base_url = f'{soa.scheme}://{soa.account_extension}/{soa.prefix}'
        if not self.base_url.endswith('/'):
            self.base_url += '/'

    def copy_object_to_local_file(self, src_path: str, dest_path: Path) -> None:
        """Download `base_url + src_path` into the local file `dest_path`.

        Args:
            src_path: object name, appended to the store's base URL
            dest_path: local file to (over)write with the response body
        """
        import shutil

        with urllib.request.urlopen(self.base_url + src_path) as resp, open(dest_path, 'wb') as f:
            # Stream the body in chunks rather than holding the whole response in memory
            shutil.copyfileobj(resp, f)
            f.flush()  # Ensures Python buffers are written to OS
            os.fsync(f.fileno())  # Forces OS to write to physical storage
@@ -0,0 +1,60 @@
1
+ import typing
2
+ from datetime import datetime
3
+ from enum import Enum
4
+ from types import UnionType
5
+ from typing import Any, Union
6
+
7
+ import pydantic
8
+
9
+
10
def is_json_convertible(model: type[pydantic.BaseModel]) -> bool:
    """
    Report whether every type hint of a Pydantic model is JSON-compatible.

    The decision is based solely on the model's resolved type hints; a model without
    any type hints is considered convertible.
    """
    for field_type in typing.get_type_hints(model).values():
        if not _type_is_json_convertible(field_type):
            return False
    return True
17
+
18
+
19
+ def _type_is_json_convertible(type_hint: Any) -> bool:
20
+ """
21
+ Recursively check if a type hint represents a JSON-compatible type.
22
+
23
+ TODO: also allow ndarrays and PIL.Image.Image, once we support those within json structures.
24
+ """
25
+ if type_hint is type(None):
26
+ return True
27
+ if type_hint is Any:
28
+ return False
29
+
30
+ if type_hint in (str, int, float, bool, datetime):
31
+ return True
32
+
33
+ if isinstance(type_hint, type) and issubclass(type_hint, Enum):
34
+ return all(isinstance(member.value, (str, int, float, bool, type(None))) for member in type_hint)
35
+
36
+ if isinstance(type_hint, type) and issubclass(type_hint, pydantic.BaseModel):
37
+ return is_json_convertible(type_hint)
38
+
39
+ origin = typing.get_origin(type_hint)
40
+ args = typing.get_args(type_hint)
41
+
42
+ if origin in (Union, UnionType):
43
+ return all(_type_is_json_convertible(arg) for arg in args)
44
+
45
+ if origin in (list, tuple):
46
+ return all(_type_is_json_convertible(arg) for arg in args) if len(args) > 0 else False
47
+
48
+ if origin is dict:
49
+ if len(args) != 2:
50
+ # we can't tell what this is
51
+ return False
52
+ key_type, value_type = args
53
+ # keys must be strings, values must be json-convertible
54
+ return key_type is str and _type_is_json_convertible(value_type)
55
+
56
+ # Literal types are json-convertible if their values are
57
+ if origin is typing.Literal:
58
+ return all(isinstance(val, (str, int, float, bool, type(None))) for val in args)
59
+
60
+ return False
@@ -19,15 +19,14 @@ class PixeltablePytorchDataset(torch.utils.data.IterableDataset):
19
19
  PyTorch dataset interface for pixeltable data.
20
20
  NB. This class must inherit from torch.utils.data.IterableDataset for it
21
21
  to work with torch.utils.data.DataLoader.
22
+
23
+ Args:
24
+ path: path to directory containing parquet files
25
+ image_format: 'np' or 'pt'. 'np' is RGB uint8 array,
26
+ 'pt' is result of torchvision.transforms.ToTensor()
22
27
  """
23
28
 
24
29
  def __init__(self, path: Path, image_format: str):
25
- """
26
- Args:
27
- path: path to directory containing parquet files
28
- image_format: 'np' or 'pt'. 'np' is RGB uint8 array,
29
- 'pt' is result of torchvision.transforms.ToTensor()
30
- """
31
30
  super().__init__()
32
31
 
33
32
  self.path = path