pixeltable 0.4.0rc3__py3-none-any.whl → 0.4.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic; see the registry's advisory listing for more details.
- pixeltable/__init__.py +23 -5
- pixeltable/_version.py +1 -0
- pixeltable/catalog/__init__.py +5 -3
- pixeltable/catalog/catalog.py +1318 -404
- pixeltable/catalog/column.py +186 -115
- pixeltable/catalog/dir.py +1 -2
- pixeltable/catalog/globals.py +11 -43
- pixeltable/catalog/insertable_table.py +167 -79
- pixeltable/catalog/path.py +61 -23
- pixeltable/catalog/schema_object.py +9 -10
- pixeltable/catalog/table.py +626 -308
- pixeltable/catalog/table_metadata.py +101 -0
- pixeltable/catalog/table_version.py +713 -569
- pixeltable/catalog/table_version_handle.py +37 -6
- pixeltable/catalog/table_version_path.py +42 -29
- pixeltable/catalog/tbl_ops.py +50 -0
- pixeltable/catalog/update_status.py +191 -0
- pixeltable/catalog/view.py +108 -94
- pixeltable/config.py +128 -22
- pixeltable/dataframe.py +188 -100
- pixeltable/env.py +407 -136
- pixeltable/exceptions.py +6 -0
- pixeltable/exec/__init__.py +3 -0
- pixeltable/exec/aggregation_node.py +7 -8
- pixeltable/exec/cache_prefetch_node.py +83 -110
- pixeltable/exec/cell_materialization_node.py +231 -0
- pixeltable/exec/cell_reconstruction_node.py +135 -0
- pixeltable/exec/component_iteration_node.py +4 -3
- pixeltable/exec/data_row_batch.py +8 -65
- pixeltable/exec/exec_context.py +16 -4
- pixeltable/exec/exec_node.py +13 -36
- pixeltable/exec/expr_eval/evaluators.py +7 -6
- pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
- pixeltable/exec/expr_eval/globals.py +8 -5
- pixeltable/exec/expr_eval/row_buffer.py +1 -2
- pixeltable/exec/expr_eval/schedulers.py +190 -30
- pixeltable/exec/globals.py +32 -0
- pixeltable/exec/in_memory_data_node.py +18 -18
- pixeltable/exec/object_store_save_node.py +293 -0
- pixeltable/exec/row_update_node.py +16 -9
- pixeltable/exec/sql_node.py +206 -101
- pixeltable/exprs/__init__.py +1 -1
- pixeltable/exprs/arithmetic_expr.py +27 -22
- pixeltable/exprs/array_slice.py +3 -3
- pixeltable/exprs/column_property_ref.py +34 -30
- pixeltable/exprs/column_ref.py +92 -96
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +5 -4
- pixeltable/exprs/data_row.py +152 -55
- pixeltable/exprs/expr.py +62 -43
- pixeltable/exprs/expr_dict.py +3 -3
- pixeltable/exprs/expr_set.py +17 -10
- pixeltable/exprs/function_call.py +75 -37
- pixeltable/exprs/globals.py +1 -2
- pixeltable/exprs/in_predicate.py +4 -4
- pixeltable/exprs/inline_expr.py +10 -27
- pixeltable/exprs/is_null.py +1 -3
- pixeltable/exprs/json_mapper.py +8 -8
- pixeltable/exprs/json_path.py +56 -22
- pixeltable/exprs/literal.py +5 -5
- pixeltable/exprs/method_ref.py +2 -2
- pixeltable/exprs/object_ref.py +2 -2
- pixeltable/exprs/row_builder.py +127 -53
- pixeltable/exprs/rowid_ref.py +8 -12
- pixeltable/exprs/similarity_expr.py +50 -25
- pixeltable/exprs/sql_element_cache.py +4 -4
- pixeltable/exprs/string_op.py +5 -5
- pixeltable/exprs/type_cast.py +3 -5
- pixeltable/func/__init__.py +1 -0
- pixeltable/func/aggregate_function.py +8 -8
- pixeltable/func/callable_function.py +9 -9
- pixeltable/func/expr_template_function.py +10 -10
- pixeltable/func/function.py +18 -20
- pixeltable/func/function_registry.py +6 -7
- pixeltable/func/globals.py +2 -3
- pixeltable/func/mcp.py +74 -0
- pixeltable/func/query_template_function.py +20 -18
- pixeltable/func/signature.py +43 -16
- pixeltable/func/tools.py +23 -13
- pixeltable/func/udf.py +18 -20
- pixeltable/functions/__init__.py +6 -0
- pixeltable/functions/anthropic.py +93 -33
- pixeltable/functions/audio.py +114 -10
- pixeltable/functions/bedrock.py +13 -6
- pixeltable/functions/date.py +1 -1
- pixeltable/functions/deepseek.py +20 -9
- pixeltable/functions/fireworks.py +2 -2
- pixeltable/functions/gemini.py +28 -11
- pixeltable/functions/globals.py +13 -13
- pixeltable/functions/groq.py +108 -0
- pixeltable/functions/huggingface.py +1046 -23
- pixeltable/functions/image.py +9 -18
- pixeltable/functions/llama_cpp.py +23 -8
- pixeltable/functions/math.py +3 -4
- pixeltable/functions/mistralai.py +4 -15
- pixeltable/functions/ollama.py +16 -9
- pixeltable/functions/openai.py +104 -82
- pixeltable/functions/openrouter.py +143 -0
- pixeltable/functions/replicate.py +2 -2
- pixeltable/functions/reve.py +250 -0
- pixeltable/functions/string.py +21 -28
- pixeltable/functions/timestamp.py +13 -14
- pixeltable/functions/together.py +4 -6
- pixeltable/functions/twelvelabs.py +92 -0
- pixeltable/functions/util.py +6 -1
- pixeltable/functions/video.py +1388 -106
- pixeltable/functions/vision.py +7 -7
- pixeltable/functions/whisper.py +15 -7
- pixeltable/functions/whisperx.py +179 -0
- pixeltable/{ext/functions → functions}/yolox.py +2 -4
- pixeltable/globals.py +332 -105
- pixeltable/index/base.py +13 -22
- pixeltable/index/btree.py +23 -22
- pixeltable/index/embedding_index.py +32 -44
- pixeltable/io/__init__.py +4 -2
- pixeltable/io/datarows.py +7 -6
- pixeltable/io/external_store.py +49 -77
- pixeltable/io/fiftyone.py +11 -11
- pixeltable/io/globals.py +29 -28
- pixeltable/io/hf_datasets.py +17 -9
- pixeltable/io/label_studio.py +70 -66
- pixeltable/io/lancedb.py +3 -0
- pixeltable/io/pandas.py +12 -11
- pixeltable/io/parquet.py +13 -93
- pixeltable/io/table_data_conduit.py +71 -47
- pixeltable/io/utils.py +3 -3
- pixeltable/iterators/__init__.py +2 -1
- pixeltable/iterators/audio.py +21 -11
- pixeltable/iterators/document.py +116 -55
- pixeltable/iterators/image.py +5 -2
- pixeltable/iterators/video.py +293 -13
- pixeltable/metadata/__init__.py +4 -2
- pixeltable/metadata/converters/convert_18.py +2 -2
- pixeltable/metadata/converters/convert_19.py +2 -2
- pixeltable/metadata/converters/convert_20.py +2 -2
- pixeltable/metadata/converters/convert_21.py +2 -2
- pixeltable/metadata/converters/convert_22.py +2 -2
- pixeltable/metadata/converters/convert_24.py +2 -2
- pixeltable/metadata/converters/convert_25.py +2 -2
- pixeltable/metadata/converters/convert_26.py +2 -2
- pixeltable/metadata/converters/convert_29.py +4 -4
- pixeltable/metadata/converters/convert_34.py +2 -2
- pixeltable/metadata/converters/convert_36.py +2 -2
- pixeltable/metadata/converters/convert_37.py +15 -0
- pixeltable/metadata/converters/convert_38.py +39 -0
- pixeltable/metadata/converters/convert_39.py +124 -0
- pixeltable/metadata/converters/convert_40.py +73 -0
- pixeltable/metadata/converters/util.py +13 -12
- pixeltable/metadata/notes.py +4 -0
- pixeltable/metadata/schema.py +79 -42
- pixeltable/metadata/utils.py +74 -0
- pixeltable/mypy/__init__.py +3 -0
- pixeltable/mypy/mypy_plugin.py +123 -0
- pixeltable/plan.py +274 -223
- pixeltable/share/__init__.py +1 -1
- pixeltable/share/packager.py +259 -129
- pixeltable/share/protocol/__init__.py +34 -0
- pixeltable/share/protocol/common.py +170 -0
- pixeltable/share/protocol/operation_types.py +33 -0
- pixeltable/share/protocol/replica.py +109 -0
- pixeltable/share/publish.py +213 -57
- pixeltable/store.py +238 -175
- pixeltable/type_system.py +104 -63
- pixeltable/utils/__init__.py +2 -3
- pixeltable/utils/arrow.py +108 -13
- pixeltable/utils/av.py +298 -0
- pixeltable/utils/azure_store.py +305 -0
- pixeltable/utils/code.py +3 -3
- pixeltable/utils/console_output.py +4 -1
- pixeltable/utils/coroutine.py +6 -23
- pixeltable/utils/dbms.py +31 -5
- pixeltable/utils/description_helper.py +4 -5
- pixeltable/utils/documents.py +5 -6
- pixeltable/utils/exception_handler.py +7 -30
- pixeltable/utils/filecache.py +6 -6
- pixeltable/utils/formatter.py +4 -6
- pixeltable/utils/gcs_store.py +283 -0
- pixeltable/utils/http_server.py +2 -3
- pixeltable/utils/iceberg.py +1 -2
- pixeltable/utils/image.py +17 -0
- pixeltable/utils/lancedb.py +88 -0
- pixeltable/utils/local_store.py +316 -0
- pixeltable/utils/misc.py +5 -0
- pixeltable/utils/object_stores.py +528 -0
- pixeltable/utils/pydantic.py +60 -0
- pixeltable/utils/pytorch.py +5 -6
- pixeltable/utils/s3_store.py +392 -0
- pixeltable-0.4.20.dist-info/METADATA +587 -0
- pixeltable-0.4.20.dist-info/RECORD +218 -0
- {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info}/WHEEL +1 -1
- pixeltable-0.4.20.dist-info/entry_points.txt +2 -0
- pixeltable/__version__.py +0 -3
- pixeltable/ext/__init__.py +0 -17
- pixeltable/ext/functions/__init__.py +0 -11
- pixeltable/ext/functions/whisperx.py +0 -77
- pixeltable/utils/media_store.py +0 -77
- pixeltable/utils/s3.py +0 -17
- pixeltable/utils/sample.py +0 -25
- pixeltable-0.4.0rc3.dist-info/METADATA +0 -435
- pixeltable-0.4.0rc3.dist-info/RECORD +0 -189
- pixeltable-0.4.0rc3.dist-info/entry_points.txt +0 -3
- {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info/licenses}/LICENSE +0 -0
|
@@ -0,0 +1,528 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import enum
|
|
4
|
+
import os
|
|
5
|
+
import re
|
|
6
|
+
import urllib.parse
|
|
7
|
+
import urllib.request
|
|
8
|
+
import uuid
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import TYPE_CHECKING, NamedTuple
|
|
11
|
+
from uuid import UUID
|
|
12
|
+
|
|
13
|
+
from pixeltable import env, exceptions as excs
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from pixeltable.catalog import Column
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class StorageTarget(enum.Enum):
    """The family of object store that a parsed address resolves to.

    The enum value is a short tag, not necessarily the URI scheme used to reach
    the store (e.g. R2/B2 addresses typically arrive as https:// URLs).
    """

    LOCAL_STORE = 'os'  # local file system
    S3_STORE = 's3'  # Amazon S3
    R2_STORE = 'r2'  # Cloudflare R2
    B2_STORE = 'b2'  # Backblaze B2
    GCS_STORE = 'gs'  # Google Cloud Storage
    AZURE_STORE = 'az'  # Azure Blob Storage
    HTTP_STORE = 'http'  # generic HTTP/HTTPS endpoint

    def __str__(self) -> str:
        """Render as the short tag (the enum value)."""
        tag: str = self.value
        return tag
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class StorageObjectAddress(NamedTuple):
    """Decomposed form of an object-store address.

    Components that do not apply to a given address are left as empty strings.
    """

    storage_target: StorageTarget  # The kind of storage referenced. This is NOT the same as the scheme.
    scheme: str  # The scheme parsed from the source
    account: str = ''  # Account number parsed from the source when applicable
    account_extension: str = ''  # Account extension parsed from the source when applicable
    container: str = ''  # Container / bucket name parsed from the source
    key: str = ''  # Key parsed from the source (prefix + object_name)
    prefix: str = ''  # Prefix (within the bucket) parsed from the source
    object_name: str = ''  # Object name parsed from the source (if requested and applicable)
    path: Path | None = None

    @property
    def has_object(self) -> bool:
        """True if an object name was parsed out of the address."""
        return bool(self.object_name)

    @property
    def is_http_readable(self) -> bool:
        """True if the address names a concrete object reachable via HTTP(S)."""
        return self.has_object and self.scheme.startswith('http')

    @property
    def is_azure_scheme(self) -> bool:
        """True for the Azure-specific URI schemes (wasb/wasbs/abfs/abfss)."""
        return self.scheme in ('wasb', 'wasbs', 'abfs', 'abfss')

    @property
    def has_valid_storage_target(self) -> bool:
        """True if storage_target is one of the supported store kinds."""
        supported = (
            StorageTarget.LOCAL_STORE,
            StorageTarget.S3_STORE,
            StorageTarget.R2_STORE,
            StorageTarget.B2_STORE,
            StorageTarget.GCS_STORE,
            StorageTarget.AZURE_STORE,
            StorageTarget.HTTP_STORE,
        )
        return self.storage_target in supported

    @property
    def prefix_free_uri(self) -> str:
        """Return the URI of the container root, i.e. the address with prefix and object name dropped."""
        if self.is_azure_scheme:
            return f'{self.scheme}://{self.container}@{self.account}.{self.account_extension}/'
        if self.account and self.account_extension:
            host = f'{self.account}.{self.account_extension}'
        elif self.account_extension:
            host = self.account_extension
        else:
            # no host component at all: just scheme + container
            return f'{self.scheme}://{self.container}/'
        return f'{self.scheme}://{host}/{self.container}/'

    @property
    def container_free_uri(self) -> str:
        """Return the URI truncated before the container component (not defined for Azure schemes)."""
        assert not self.is_azure_scheme, 'Azure storage requires a container name'
        if self.account and self.account_extension:
            return f'{self.scheme}://{self.account}.{self.account_extension}/'
        if self.account_extension:
            return f'{self.scheme}://{self.account_extension}/'
        return f'{self.scheme}://'

    @property
    def to_path(self) -> Path:
        """The local-filesystem Path; only valid for LOCAL_STORE addresses."""
        assert self.storage_target == StorageTarget.LOCAL_STORE
        assert self.path is not None
        return self.path

    def __str__(self) -> str:
        """A debug aid to override default str representation. Not to be used for any purpose."""
        return f'{self.storage_target}..{self.scheme}://{self.account}.{self.account_extension}/{self.container}/{self.prefix}{self.object_name}'

    def __repr__(self) -> str:
        """A debug aid to override default repr representation. Not to be used for any purpose."""
        return (
            f'SObjectAddress(client: {self.storage_target!r}, s: {self.scheme!r}, a: {self.account!r}, '
            f'ae: {self.account_extension!r}, c: {self.container!r}, '
            f'p: {self.prefix!r}, o: {self.object_name!r})'
        )
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
class ObjectPath:
    """Helpers for constructing and parsing the paths/URIs under which objects are persisted."""

    # Filename layout for persisted objects (see create_prefix_raw)
    PATTERN = re.compile(r'([0-9a-fA-F]+)_(\d+)_(\d+)_([0-9a-fA-F]+)')  # tbl_id, col_id, version, uuid

    @classmethod
    def table_prefix(cls, tbl_id: UUID) -> str:
        """Construct a unique unix-style prefix for objects in a table (without leading/trailing slashes)."""
        assert isinstance(tbl_id, uuid.UUID)
        return tbl_id.hex

    @classmethod
    def create_prefix_raw(cls, tbl_id: UUID, col_id: int, tbl_version: int, ext: str | None = None) -> tuple[str, str]:
        """Construct a unique unix-style prefix and filename for a persisted file.

        The results are derived from table, col, and version specs.

        Args:
            tbl_id: id of the table the file belongs to
            col_id: id of the column the file belongs to
            tbl_version: table version under which the file is persisted
            ext: optional filename extension (including any leading '.')

        Returns:
            prefix: a unix-style prefix for the file without leading/trailing slashes
            filename: a unique filename for the file without leading slashes
        """
        table_prefix = cls.table_prefix(tbl_id)
        id_hex = uuid.uuid4().hex
        # Fan out into <tbl>/<2 hex chars>/<4 hex chars> subdirectories so no single
        # directory/prefix accumulates all of a table's objects
        prefix = f'{table_prefix}/{id_hex[:2]}/{id_hex[:4]}'
        filename = f'{table_prefix}_{col_id}_{tbl_version}_{id_hex}{ext or ""}'
        return prefix, filename

    @classmethod
    def separate_prefix_object(cls, path_and_object: str, may_contain_object_name: bool) -> tuple[str, str]:
        """Split a key into (prefix, object_name).

        Args:
            path_and_object: slash-separated key, possibly ending in an object name
            may_contain_object_name: if False, the entire input is treated as a prefix

        Returns:
            prefix: '' or a prefix ending in exactly one '/'
            object_name: '' if the input has no object component
        """
        path = path_and_object
        object_name = ''
        if not may_contain_object_name or path.endswith('/'):
            prefix = path.rstrip('/')
        elif '/' in path:
            # If there are slashes in the path, separate into prefix and object
            prefix, object_name = path.rsplit('/', 1)
            prefix = prefix.rstrip('/')
        else:
            # If no slashes, the entire path is the object name
            prefix = ''
            object_name = path
        # Normalize: a non-empty prefix always carries a trailing slash
        if len(prefix) > 0 and not prefix.endswith('/'):
            prefix += '/'
        return prefix, object_name

    @classmethod
    def parse_object_storage_addr1(cls, src_addr: str) -> StorageObjectAddress:
        """
        Parses a cloud storage URI into its scheme, bucket, and key (no prefix/object split).

        Args:
            src_addr: The cloud storage URI (e.g., "gs://my-bucket/path/to/object.txt") or a local path.

        Returns:
            StorageObjectAddress: A NamedTuple containing components of the address.

        Formats:
            s3://container/<optional prefix>/<optional object>
            gs://container/<optional prefix>/<optional object>
            wasb[s]://container@account.blob.core.windows.net/<optional prefix>/<optional object>
            abfs[s]://container@account.dfs.core.windows.net/<optional prefix>/<optional object>
            https://account.blob.core.windows.net/container/<optional prefix>/<optional object>
            https://account.r2.cloudflarestorage.com/container/<optional prefix>/<optional object>
            https://raw.github.com/pixeltable/pixeltable/main/docs/resources/images/000000000030.jpg
        """
        parsed = urllib.parse.urlparse(src_addr)
        scheme = parsed.scheme.lower()
        account_name = ''
        account_extension = ''
        container = ''
        key = ''
        path = None

        # len(parsed.scheme) == 1 occurs for Windows drive letters like C:\
        if not parsed.scheme or len(parsed.scheme) == 1:
            # If no scheme, treat as local file path; this will be further validated before use
            storage_target = StorageTarget.LOCAL_STORE
            scheme = 'file'
            path = Path(src_addr)

        elif scheme == 'file':
            storage_target = StorageTarget.LOCAL_STORE
            pth = parsed.path
            if parsed.netloc:
                # This is a UNC path, ie, file://host/share/path/to/file
                pth = f'\\\\{parsed.netloc}{pth}'
            # url2pathname handles Windows drive-letter paths; unquote undoes %-escaping
            path = Path(urllib.parse.unquote(urllib.request.url2pathname(pth)))
            key = str(parsed.path).lstrip('/')

        elif scheme in ('s3', 'gs'):
            storage_target = StorageTarget.S3_STORE if scheme == 's3' else StorageTarget.GCS_STORE
            container = parsed.netloc
            key = parsed.path.lstrip('/')

        elif scheme in ('wasb', 'wasbs', 'abfs', 'abfss'):
            # Azure-specific URI schemes
            # wasb[s]://container@account.blob.core.windows.net/<optional prefix>/<optional object>
            # abfs[s]://container@account.dfs.core.windows.net/<optional prefix>/<optional object>
            storage_target = StorageTarget.AZURE_STORE
            container_and_account = parsed.netloc
            if '@' in container_and_account:
                container, account_host = container_and_account.split('@', 1)
                # NOTE(review): an account_host without a '.' raises IndexError here
                # rather than the ValueError below — confirm whether that is intended
                account_name = account_host.split('.')[0]
                account_extension = account_host.split('.', 1)[1]
            else:
                raise ValueError(f'Invalid Azure URI format: {src_addr}')
            key = parsed.path.lstrip('/')

        elif scheme in ('http', 'https'):
            # Standard HTTP(S) URL format
            # https://account.blob.core.windows.net/container/<optional path>/<optional object>
            # https://account.r2.cloudflarestorage.com/container/<optional path>/<optional object>
            # https://s3.us-west-004.backblazeb2.com/container/<optional path>/<optional object>
            # and possibly others
            key = parsed.path
            # Classify the provider by substring of the host name
            if 'cloudflare' in parsed.netloc:
                storage_target = StorageTarget.R2_STORE
            elif 'backblazeb2' in parsed.netloc:
                storage_target = StorageTarget.B2_STORE
            elif 'windows' in parsed.netloc:
                storage_target = StorageTarget.AZURE_STORE
            else:
                storage_target = StorageTarget.HTTP_STORE
            if storage_target in (
                StorageTarget.S3_STORE,
                StorageTarget.AZURE_STORE,
                StorageTarget.R2_STORE,
                StorageTarget.B2_STORE,
            ):
                # Host is <account>.<rest>; the first path component is the container
                account_name = parsed.netloc.split('.', 1)[0]
                account_extension = parsed.netloc.split('.', 1)[1]
                path_parts = key.lstrip('/').split('/', 1)
                container = path_parts[0] if path_parts else ''
                key = path_parts[1] if len(path_parts) > 1 else ''
            else:
                # Plain HTTP endpoint: no container; the whole host goes into account_extension
                account_extension = parsed.netloc
                key = key.lstrip('/')
        else:
            raise ValueError(f'Unsupported URI scheme: {parsed.scheme}')

        r = StorageObjectAddress(storage_target, scheme, account_name, account_extension, container, key, '', '', path)
        assert r.has_valid_storage_target
        return r

    @classmethod
    def parse_object_storage_addr(cls, src_addr: str, allow_obj_name: bool) -> StorageObjectAddress:
        """
        Parses a cloud storage URI into its scheme, bucket, prefix, and object name.

        Args:
            src_addr: The cloud storage URI (e.g., "gs://my-bucket/path/to/object.txt").
            allow_obj_name: if True, a trailing path component (no trailing '/') is treated as an object name

        Returns:
            StorageObjectAddress: A NamedTuple containing components of the address.

        Formats:
            s3://container/<optional prefix>/<optional object>
            gs://container/<optional prefix>/<optional object>
            wasb[s]://container@account.blob.core.windows.net/<optional prefix>/<optional object>
            abfs[s]://container@account.dfs.core.windows.net/<optional prefix>/<optional object>
            https://account.blob.core.windows.net/container/<optional prefix>/<optional object>
            https://account.r2.cloudflarestorage.com/container/<optional prefix>/<optional object>
            https://raw.github.com/pixeltable/pixeltable/main/docs/resources/images/000000000030.jpg
        """
        soa = cls.parse_object_storage_addr1(src_addr)
        # Split the raw key into prefix and (optional) object name
        prefix, object_name = cls.separate_prefix_object(soa.key, allow_obj_name)
        assert not object_name.endswith('/')
        r = soa._replace(prefix=prefix, object_name=object_name)
        return r
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
class ObjectStoreBase:
    """Interface for object-store backends.

    Concrete stores (local filesystem, S3-compatible, GCS, Azure, HTTP) override
    these methods; the unimplemented defaults raise AssertionError if reached.
    """

    def validate(self, error_prefix: str) -> str | None:
        """Check the store configuration. Returns base URI if store is accessible.

        Args:
            error_prefix: a string of the form 'Column {name}: ' used when raising errors

        Returns:
            Base URI for the store. This value is stored in any Column attached to the store.
        """
        raise AssertionError

    def copy_local_file(self, col: Column, src_path: Path) -> str:
        """Copy a file associated with a Column to the store, returning the file's URL within the destination.

        Args:
            col: The Column to which the file belongs, used to generate the URI of the stored object.
            src_path: The Path to the local file

        Returns:
            The URI of the object in the store
        """
        raise AssertionError

    def move_local_file(self, col: Column, src_path: Path) -> str | None:
        """Move a file associated with a Column to the store, returning the file's URL within the destination.

        Args:
            col: The Column to which the file belongs, used to generate the URI of the stored object.
            src_path: The Path to the local file

        Returns:
            The URI of the object in the store, None if the object cannot be moved to the store
        """
        # Default: signal "cannot move", so callers fall back to copy-then-delete
        return None

    def copy_object_to_local_file(self, src_path: str, dest_path: Path) -> None:
        """Copies an object from the store to a local media file.

        Args:
            src_path: The URI of the object in the store
            dest_path: The desired Path to the local file
        """
        raise AssertionError

    def count(self, tbl_id: UUID, tbl_version: int | None = None) -> int:
        """Return the number of objects in the store associated with the given tbl_id

        Args:
            tbl_id: Only count objects associated with a given table
            tbl_version: Only count objects associated with a specific table version

        Returns:
            Number of objects found with the specified criteria
        """
        raise AssertionError

    def delete(self, tbl_id: UUID, tbl_version: int | None = None) -> int | None:
        """Delete objects in the destination for a given table ID, table version.

        Args:
            tbl_id: Only delete objects associated with a given table
            tbl_version: Only delete objects associated with a specific table version

        Returns:
            Number of objects deleted or None if the store does not count deletions.
        """
        raise AssertionError

    def list_objects(self, return_uri: bool, n_max: int = 10) -> list[str]:
        """Return a list of objects in the store.

        Args:
            return_uri: If True, returns a full URI for each object, otherwise just the path to the object.
            n_max: Maximum number of objects to list
        """
        raise AssertionError
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
class ObjectOps:
|
|
360
|
+
@classmethod
|
|
361
|
+
def get_store(cls, dest: str | None, allow_obj_name: bool, col_name: str | None = None) -> ObjectStoreBase:
|
|
362
|
+
from pixeltable.env import Env
|
|
363
|
+
from pixeltable.utils.local_store import LocalStore
|
|
364
|
+
|
|
365
|
+
dest = dest or str(Env.get().media_dir) # Use local media dir as fallback
|
|
366
|
+
soa = ObjectPath.parse_object_storage_addr(dest, allow_obj_name=allow_obj_name)
|
|
367
|
+
if soa.storage_target == StorageTarget.LOCAL_STORE:
|
|
368
|
+
return LocalStore(soa)
|
|
369
|
+
if soa.storage_target in (StorageTarget.S3_STORE, StorageTarget.R2_STORE, StorageTarget.B2_STORE):
|
|
370
|
+
env.Env.get().require_package('boto3')
|
|
371
|
+
from pixeltable.utils.s3_store import S3Store
|
|
372
|
+
|
|
373
|
+
return S3Store(soa)
|
|
374
|
+
if soa.storage_target == StorageTarget.GCS_STORE and soa.scheme == 'gs':
|
|
375
|
+
env.Env.get().require_package('google.cloud.storage')
|
|
376
|
+
from pixeltable.utils.gcs_store import GCSStore
|
|
377
|
+
|
|
378
|
+
return GCSStore(soa)
|
|
379
|
+
if soa.storage_target == StorageTarget.AZURE_STORE:
|
|
380
|
+
env.Env.get().require_package('azure.storage.blob')
|
|
381
|
+
from pixeltable.utils.azure_store import AzureBlobStore
|
|
382
|
+
|
|
383
|
+
return AzureBlobStore(soa)
|
|
384
|
+
if soa.storage_target == StorageTarget.HTTP_STORE and soa.is_http_readable:
|
|
385
|
+
return HTTPStore(soa)
|
|
386
|
+
error_col_name = f'Column {col_name!r}: ' if col_name is not None else ''
|
|
387
|
+
raise excs.Error(
|
|
388
|
+
f'{error_col_name}`destination` must be a valid reference to a supported destination, got {dest!r}'
|
|
389
|
+
)
|
|
390
|
+
|
|
391
|
+
@classmethod
|
|
392
|
+
def validate_destination(cls, dest: str | Path | None, col_name: str | None = None) -> str:
|
|
393
|
+
"""Convert a Column destination parameter to a URI, else raise errors.
|
|
394
|
+
Args:
|
|
395
|
+
dest: The requested destination
|
|
396
|
+
col_name: Used to raise error messages
|
|
397
|
+
Returns:
|
|
398
|
+
URI of destination, or raises an error
|
|
399
|
+
"""
|
|
400
|
+
error_col_str = f'column {col_name!r}' if col_name is not None else ''
|
|
401
|
+
|
|
402
|
+
# General checks on any destination
|
|
403
|
+
if isinstance(dest, Path):
|
|
404
|
+
dest = str(dest)
|
|
405
|
+
if dest is not None and not isinstance(dest, str):
|
|
406
|
+
raise excs.Error(f'{error_col_str}: `destination` must be a string or path; got {dest!r}')
|
|
407
|
+
|
|
408
|
+
# Specific checks for storage backends
|
|
409
|
+
store = cls.get_store(dest, False, col_name)
|
|
410
|
+
dest2 = store.validate(error_col_str)
|
|
411
|
+
if dest2 is None:
|
|
412
|
+
raise excs.Error(f'{error_col_str}: `destination` must be a supported destination; got {dest!r}')
|
|
413
|
+
return dest2
|
|
414
|
+
|
|
415
|
+
@classmethod
|
|
416
|
+
def copy_object_to_local_file(cls, src_uri: str, dest_path: Path) -> None:
|
|
417
|
+
"""Copy an object from a URL to a local Path. Thread safe.
|
|
418
|
+
Raises an exception if the download fails or the scheme is not supported
|
|
419
|
+
"""
|
|
420
|
+
soa = ObjectPath.parse_object_storage_addr(src_uri, allow_obj_name=True)
|
|
421
|
+
store = cls.get_store(src_uri, True)
|
|
422
|
+
store.copy_object_to_local_file(soa.object_name, dest_path)
|
|
423
|
+
|
|
424
|
+
@classmethod
|
|
425
|
+
def put_file(cls, col: Column, src_path: Path, relocate_or_delete: bool) -> str:
|
|
426
|
+
"""Move or copy a file to the destination, returning the file's URL within the destination.
|
|
427
|
+
If relocate_or_delete is True and the file is in the TempStore, the file will be deleted after the operation.
|
|
428
|
+
"""
|
|
429
|
+
from pixeltable.utils.local_store import TempStore
|
|
430
|
+
|
|
431
|
+
if relocate_or_delete:
|
|
432
|
+
# File is temporary, used only once, so we can delete it after copy if it can't be moved
|
|
433
|
+
assert TempStore.contains_path(src_path)
|
|
434
|
+
dest = col.destination
|
|
435
|
+
store = cls.get_store(dest, False, col.name)
|
|
436
|
+
# Attempt to move
|
|
437
|
+
if relocate_or_delete:
|
|
438
|
+
moved_file_url = store.move_local_file(col, src_path)
|
|
439
|
+
if moved_file_url is not None:
|
|
440
|
+
return moved_file_url
|
|
441
|
+
new_file_url = store.copy_local_file(col, src_path)
|
|
442
|
+
if relocate_or_delete:
|
|
443
|
+
TempStore.delete_media_file(src_path)
|
|
444
|
+
return new_file_url
|
|
445
|
+
|
|
446
|
+
@classmethod
|
|
447
|
+
def move_local_file(cls, col: Column, src_path: Path) -> str:
|
|
448
|
+
"""Move a file to the destination specified by the Column, returning the file's URL within the destination."""
|
|
449
|
+
store = cls.get_store(col.destination, False, col.name)
|
|
450
|
+
return store.move_local_file(col, src_path)
|
|
451
|
+
|
|
452
|
+
@classmethod
|
|
453
|
+
def copy_local_file(cls, col: Column, src_path: Path) -> str:
|
|
454
|
+
"""Copy a file to the destination specified by the Column, returning the file's URL within the destination."""
|
|
455
|
+
store = cls.get_store(col.destination, False, col.name)
|
|
456
|
+
return store.copy_local_file(col, src_path)
|
|
457
|
+
|
|
458
|
+
@classmethod
|
|
459
|
+
def delete(cls, dest: str | None, tbl_id: UUID, tbl_version: int | None = None) -> int | None:
|
|
460
|
+
"""Delete objects in the destination for a given table ID, table version.
|
|
461
|
+
Returns:
|
|
462
|
+
Number of objects deleted or None
|
|
463
|
+
"""
|
|
464
|
+
store = cls.get_store(dest, False)
|
|
465
|
+
return store.delete(tbl_id, tbl_version)
|
|
466
|
+
|
|
467
|
+
@classmethod
|
|
468
|
+
def count(
|
|
469
|
+
cls,
|
|
470
|
+
tbl_id: UUID,
|
|
471
|
+
tbl_version: int | None = None,
|
|
472
|
+
dest: str | None = None,
|
|
473
|
+
default_input_dest: bool = False,
|
|
474
|
+
default_output_dest: bool = False,
|
|
475
|
+
) -> int:
|
|
476
|
+
"""
|
|
477
|
+
Return the count of objects in the destination for a given table ID.
|
|
478
|
+
|
|
479
|
+
At most one of dest, default_input, default_output may be specified. If none are specified, the fallback is the
|
|
480
|
+
local media directory.
|
|
481
|
+
|
|
482
|
+
Args:
|
|
483
|
+
tbl_id: Table ID for which to count objects
|
|
484
|
+
tbl_version: If specified, only counts objects for a specific table version
|
|
485
|
+
dest: The destination to count objects in
|
|
486
|
+
default_input_dest: If `True`, use the default input media destination
|
|
487
|
+
default_output_dest: If `True`, use the default output media destination
|
|
488
|
+
"""
|
|
489
|
+
assert sum((dest is not None, default_input_dest, default_output_dest)) <= 1, (
|
|
490
|
+
'At most one of dest, default_input, default_output may be specified'
|
|
491
|
+
)
|
|
492
|
+
if default_input_dest:
|
|
493
|
+
dest = env.Env.get().default_input_media_dest
|
|
494
|
+
if default_output_dest:
|
|
495
|
+
dest = env.Env.get().default_output_media_dest
|
|
496
|
+
store = cls.get_store(dest, False)
|
|
497
|
+
return store.count(tbl_id, tbl_version)
|
|
498
|
+
|
|
499
|
+
@classmethod
def list_objects(cls, dest: str | None, return_uri: bool, n_max: int = 10) -> list[str]:
    """List up to `n_max` objects found in the specified destination bucket.

    The dest specification string must not contain an object name; each returned
    entry carries its full set of prefixes. If return_uri is True, full URIs are
    returned; otherwise, just the object keys.
    """
    target_store = cls.get_store(dest, False)
    return target_store.list_objects(return_uri, n_max)
|
|
508
|
+
|
|
509
|
+
@classmethod
def list_uris(cls, source_uri: str, n_max: int = 10) -> list[str]:
    """Return up to `n_max` full URIs found within the specified uri."""
    # Thin wrapper: same listing, but always in URI form.
    return cls.list_objects(source_uri, return_uri=True, n_max=n_max)
|
|
513
|
+
|
|
514
|
+
|
|
515
|
+
class HTTPStore(ObjectStoreBase):
    # Object store accessed over plain HTTP(S); within this view it only
    # downloads objects (see copy_object_to_local_file) — presumably read-only,
    # TODO confirm against the rest of the class.
    # Slash-terminated base URL ('scheme://host/prefix/'); object keys are
    # appended directly to it.
    base_url: str
|
|
517
|
+
|
|
518
|
+
def __init__(self, soa: StorageObjectAddress):
    """Build the base URL from the parsed storage address, guaranteeing a trailing '/'."""
    root = f'{soa.scheme}://{soa.account_extension}/{soa.prefix}'
    # A trailing slash lets object keys be appended directly.
    self.base_url = root if root.endswith('/') else root + '/'
|
|
522
|
+
|
|
523
|
+
def copy_object_to_local_file(self, src_path: str, dest_path: Path) -> None:
    """Download the object at base_url + src_path into dest_path, durably."""
    source_url = self.base_url + src_path
    with urllib.request.urlopen(source_url) as response:
        payload = response.read()
    with open(dest_path, 'wb') as out_file:
        out_file.write(payload)
        out_file.flush()  # Ensures Python buffers are written to OS
        os.fsync(out_file.fileno())  # Forces OS to write to physical storage
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
import typing
|
|
2
|
+
from datetime import datetime
|
|
3
|
+
from enum import Enum
|
|
4
|
+
from types import UnionType
|
|
5
|
+
from typing import Any, Union
|
|
6
|
+
|
|
7
|
+
import pydantic
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def is_json_convertible(model: type[pydantic.BaseModel]) -> bool:
    """
    Determine if instances of a Pydantic model can be converted to valid JSON
    based on the type hints of its fields.
    """
    # Every field's resolved hint must itself be JSON-representable.
    for field_hint in typing.get_type_hints(model).values():
        if not _type_is_json_convertible(field_hint):
            return False
    return True
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _type_is_json_convertible(type_hint: Any) -> bool:
|
|
20
|
+
"""
|
|
21
|
+
Recursively check if a type hint represents a JSON-compatible type.
|
|
22
|
+
|
|
23
|
+
TODO: also allow ndarrays and PIL.Image.Image, once we support those within json structures.
|
|
24
|
+
"""
|
|
25
|
+
if type_hint is type(None):
|
|
26
|
+
return True
|
|
27
|
+
if type_hint is Any:
|
|
28
|
+
return False
|
|
29
|
+
|
|
30
|
+
if type_hint in (str, int, float, bool, datetime):
|
|
31
|
+
return True
|
|
32
|
+
|
|
33
|
+
if isinstance(type_hint, type) and issubclass(type_hint, Enum):
|
|
34
|
+
return all(isinstance(member.value, (str, int, float, bool, type(None))) for member in type_hint)
|
|
35
|
+
|
|
36
|
+
if isinstance(type_hint, type) and issubclass(type_hint, pydantic.BaseModel):
|
|
37
|
+
return is_json_convertible(type_hint)
|
|
38
|
+
|
|
39
|
+
origin = typing.get_origin(type_hint)
|
|
40
|
+
args = typing.get_args(type_hint)
|
|
41
|
+
|
|
42
|
+
if origin in (Union, UnionType):
|
|
43
|
+
return all(_type_is_json_convertible(arg) for arg in args)
|
|
44
|
+
|
|
45
|
+
if origin in (list, tuple):
|
|
46
|
+
return all(_type_is_json_convertible(arg) for arg in args) if len(args) > 0 else False
|
|
47
|
+
|
|
48
|
+
if origin is dict:
|
|
49
|
+
if len(args) != 2:
|
|
50
|
+
# we can't tell what this is
|
|
51
|
+
return False
|
|
52
|
+
key_type, value_type = args
|
|
53
|
+
# keys must be strings, values must be json-convertible
|
|
54
|
+
return key_type is str and _type_is_json_convertible(value_type)
|
|
55
|
+
|
|
56
|
+
# Literal types are json-convertible if their values are
|
|
57
|
+
if origin is typing.Literal:
|
|
58
|
+
return all(isinstance(val, (str, int, float, bool, type(None))) for val in args)
|
|
59
|
+
|
|
60
|
+
return False
|
pixeltable/utils/pytorch.py
CHANGED
|
@@ -19,15 +19,14 @@ class PixeltablePytorchDataset(torch.utils.data.IterableDataset):
|
|
|
19
19
|
PyTorch dataset interface for pixeltable data.
|
|
20
20
|
NB. This class must inherit from torch.utils.data.IterableDataset for it
|
|
21
21
|
to work with torch.utils.data.DataLoader.
|
|
22
|
+
|
|
23
|
+
Args:
|
|
24
|
+
path: path to directory containing parquet files
|
|
25
|
+
image_format: 'np' or 'pt'. 'np' is RGB uint8 array,
|
|
26
|
+
'pt' is result of torchvision.transforms.ToTensor()
|
|
22
27
|
"""
|
|
23
28
|
|
|
24
29
|
def __init__(self, path: Path, image_format: str):
|
|
25
|
-
"""
|
|
26
|
-
Args:
|
|
27
|
-
path: path to directory containing parquet files
|
|
28
|
-
image_format: 'np' or 'pt'. 'np' is RGB uint8 array,
|
|
29
|
-
'pt' is result of torchvision.transforms.ToTensor()
|
|
30
|
-
"""
|
|
31
30
|
super().__init__()
|
|
32
31
|
|
|
33
32
|
self.path = path
|