pixeltable 0.3.14__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +42 -8
- pixeltable/{dataframe.py → _query.py} +470 -206
- pixeltable/_version.py +1 -0
- pixeltable/catalog/__init__.py +5 -4
- pixeltable/catalog/catalog.py +1785 -432
- pixeltable/catalog/column.py +190 -113
- pixeltable/catalog/dir.py +2 -4
- pixeltable/catalog/globals.py +19 -46
- pixeltable/catalog/insertable_table.py +191 -98
- pixeltable/catalog/path.py +63 -23
- pixeltable/catalog/schema_object.py +11 -15
- pixeltable/catalog/table.py +843 -436
- pixeltable/catalog/table_metadata.py +103 -0
- pixeltable/catalog/table_version.py +978 -657
- pixeltable/catalog/table_version_handle.py +72 -16
- pixeltable/catalog/table_version_path.py +112 -43
- pixeltable/catalog/tbl_ops.py +53 -0
- pixeltable/catalog/update_status.py +191 -0
- pixeltable/catalog/view.py +134 -90
- pixeltable/config.py +134 -22
- pixeltable/env.py +471 -157
- pixeltable/exceptions.py +6 -0
- pixeltable/exec/__init__.py +4 -1
- pixeltable/exec/aggregation_node.py +7 -8
- pixeltable/exec/cache_prefetch_node.py +83 -110
- pixeltable/exec/cell_materialization_node.py +268 -0
- pixeltable/exec/cell_reconstruction_node.py +168 -0
- pixeltable/exec/component_iteration_node.py +4 -3
- pixeltable/exec/data_row_batch.py +8 -65
- pixeltable/exec/exec_context.py +16 -4
- pixeltable/exec/exec_node.py +13 -36
- pixeltable/exec/expr_eval/evaluators.py +11 -7
- pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
- pixeltable/exec/expr_eval/globals.py +8 -5
- pixeltable/exec/expr_eval/row_buffer.py +1 -2
- pixeltable/exec/expr_eval/schedulers.py +106 -56
- pixeltable/exec/globals.py +35 -0
- pixeltable/exec/in_memory_data_node.py +19 -19
- pixeltable/exec/object_store_save_node.py +293 -0
- pixeltable/exec/row_update_node.py +16 -9
- pixeltable/exec/sql_node.py +351 -84
- pixeltable/exprs/__init__.py +1 -1
- pixeltable/exprs/arithmetic_expr.py +27 -22
- pixeltable/exprs/array_slice.py +3 -3
- pixeltable/exprs/column_property_ref.py +36 -23
- pixeltable/exprs/column_ref.py +213 -89
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +5 -4
- pixeltable/exprs/data_row.py +164 -54
- pixeltable/exprs/expr.py +70 -44
- pixeltable/exprs/expr_dict.py +3 -3
- pixeltable/exprs/expr_set.py +17 -10
- pixeltable/exprs/function_call.py +100 -40
- pixeltable/exprs/globals.py +2 -2
- pixeltable/exprs/in_predicate.py +4 -4
- pixeltable/exprs/inline_expr.py +18 -32
- pixeltable/exprs/is_null.py +7 -3
- pixeltable/exprs/json_mapper.py +8 -8
- pixeltable/exprs/json_path.py +56 -22
- pixeltable/exprs/literal.py +27 -5
- pixeltable/exprs/method_ref.py +2 -2
- pixeltable/exprs/object_ref.py +2 -2
- pixeltable/exprs/row_builder.py +167 -67
- pixeltable/exprs/rowid_ref.py +25 -10
- pixeltable/exprs/similarity_expr.py +58 -40
- pixeltable/exprs/sql_element_cache.py +4 -4
- pixeltable/exprs/string_op.py +5 -5
- pixeltable/exprs/type_cast.py +3 -5
- pixeltable/func/__init__.py +1 -0
- pixeltable/func/aggregate_function.py +8 -8
- pixeltable/func/callable_function.py +9 -9
- pixeltable/func/expr_template_function.py +17 -11
- pixeltable/func/function.py +18 -20
- pixeltable/func/function_registry.py +6 -7
- pixeltable/func/globals.py +2 -3
- pixeltable/func/mcp.py +74 -0
- pixeltable/func/query_template_function.py +29 -27
- pixeltable/func/signature.py +46 -19
- pixeltable/func/tools.py +31 -13
- pixeltable/func/udf.py +18 -20
- pixeltable/functions/__init__.py +16 -0
- pixeltable/functions/anthropic.py +123 -77
- pixeltable/functions/audio.py +147 -10
- pixeltable/functions/bedrock.py +13 -6
- pixeltable/functions/date.py +7 -4
- pixeltable/functions/deepseek.py +35 -43
- pixeltable/functions/document.py +81 -0
- pixeltable/functions/fal.py +76 -0
- pixeltable/functions/fireworks.py +11 -20
- pixeltable/functions/gemini.py +195 -39
- pixeltable/functions/globals.py +142 -14
- pixeltable/functions/groq.py +108 -0
- pixeltable/functions/huggingface.py +1056 -24
- pixeltable/functions/image.py +115 -57
- pixeltable/functions/json.py +1 -1
- pixeltable/functions/llama_cpp.py +28 -13
- pixeltable/functions/math.py +67 -5
- pixeltable/functions/mistralai.py +18 -55
- pixeltable/functions/net.py +70 -0
- pixeltable/functions/ollama.py +20 -13
- pixeltable/functions/openai.py +240 -226
- pixeltable/functions/openrouter.py +143 -0
- pixeltable/functions/replicate.py +4 -4
- pixeltable/functions/reve.py +250 -0
- pixeltable/functions/string.py +239 -69
- pixeltable/functions/timestamp.py +16 -16
- pixeltable/functions/together.py +24 -84
- pixeltable/functions/twelvelabs.py +188 -0
- pixeltable/functions/util.py +6 -1
- pixeltable/functions/uuid.py +30 -0
- pixeltable/functions/video.py +1515 -107
- pixeltable/functions/vision.py +8 -8
- pixeltable/functions/voyageai.py +289 -0
- pixeltable/functions/whisper.py +16 -8
- pixeltable/functions/whisperx.py +179 -0
- pixeltable/{ext/functions → functions}/yolox.py +2 -4
- pixeltable/globals.py +362 -115
- pixeltable/index/base.py +17 -21
- pixeltable/index/btree.py +28 -22
- pixeltable/index/embedding_index.py +100 -118
- pixeltable/io/__init__.py +4 -2
- pixeltable/io/datarows.py +8 -7
- pixeltable/io/external_store.py +56 -105
- pixeltable/io/fiftyone.py +13 -13
- pixeltable/io/globals.py +31 -30
- pixeltable/io/hf_datasets.py +61 -16
- pixeltable/io/label_studio.py +74 -70
- pixeltable/io/lancedb.py +3 -0
- pixeltable/io/pandas.py +21 -12
- pixeltable/io/parquet.py +25 -105
- pixeltable/io/table_data_conduit.py +250 -123
- pixeltable/io/utils.py +4 -4
- pixeltable/iterators/__init__.py +2 -1
- pixeltable/iterators/audio.py +26 -25
- pixeltable/iterators/base.py +9 -3
- pixeltable/iterators/document.py +112 -78
- pixeltable/iterators/image.py +12 -15
- pixeltable/iterators/string.py +11 -4
- pixeltable/iterators/video.py +523 -120
- pixeltable/metadata/__init__.py +14 -3
- pixeltable/metadata/converters/convert_13.py +2 -2
- pixeltable/metadata/converters/convert_18.py +2 -2
- pixeltable/metadata/converters/convert_19.py +2 -2
- pixeltable/metadata/converters/convert_20.py +2 -2
- pixeltable/metadata/converters/convert_21.py +2 -2
- pixeltable/metadata/converters/convert_22.py +2 -2
- pixeltable/metadata/converters/convert_24.py +2 -2
- pixeltable/metadata/converters/convert_25.py +2 -2
- pixeltable/metadata/converters/convert_26.py +2 -2
- pixeltable/metadata/converters/convert_29.py +4 -4
- pixeltable/metadata/converters/convert_30.py +34 -21
- pixeltable/metadata/converters/convert_34.py +2 -2
- pixeltable/metadata/converters/convert_35.py +9 -0
- pixeltable/metadata/converters/convert_36.py +38 -0
- pixeltable/metadata/converters/convert_37.py +15 -0
- pixeltable/metadata/converters/convert_38.py +39 -0
- pixeltable/metadata/converters/convert_39.py +124 -0
- pixeltable/metadata/converters/convert_40.py +73 -0
- pixeltable/metadata/converters/convert_41.py +12 -0
- pixeltable/metadata/converters/convert_42.py +9 -0
- pixeltable/metadata/converters/convert_43.py +44 -0
- pixeltable/metadata/converters/util.py +20 -31
- pixeltable/metadata/notes.py +9 -0
- pixeltable/metadata/schema.py +140 -53
- pixeltable/metadata/utils.py +74 -0
- pixeltable/mypy/__init__.py +3 -0
- pixeltable/mypy/mypy_plugin.py +123 -0
- pixeltable/plan.py +382 -115
- pixeltable/share/__init__.py +1 -1
- pixeltable/share/packager.py +547 -83
- pixeltable/share/protocol/__init__.py +33 -0
- pixeltable/share/protocol/common.py +165 -0
- pixeltable/share/protocol/operation_types.py +33 -0
- pixeltable/share/protocol/replica.py +119 -0
- pixeltable/share/publish.py +257 -59
- pixeltable/store.py +311 -194
- pixeltable/type_system.py +373 -211
- pixeltable/utils/__init__.py +2 -3
- pixeltable/utils/arrow.py +131 -17
- pixeltable/utils/av.py +298 -0
- pixeltable/utils/azure_store.py +346 -0
- pixeltable/utils/coco.py +6 -6
- pixeltable/utils/code.py +3 -3
- pixeltable/utils/console_output.py +4 -1
- pixeltable/utils/coroutine.py +6 -23
- pixeltable/utils/dbms.py +32 -6
- pixeltable/utils/description_helper.py +4 -5
- pixeltable/utils/documents.py +7 -18
- pixeltable/utils/exception_handler.py +7 -30
- pixeltable/utils/filecache.py +6 -6
- pixeltable/utils/formatter.py +86 -48
- pixeltable/utils/gcs_store.py +295 -0
- pixeltable/utils/http.py +133 -0
- pixeltable/utils/http_server.py +2 -3
- pixeltable/utils/iceberg.py +1 -2
- pixeltable/utils/image.py +17 -0
- pixeltable/utils/lancedb.py +90 -0
- pixeltable/utils/local_store.py +322 -0
- pixeltable/utils/misc.py +5 -0
- pixeltable/utils/object_stores.py +573 -0
- pixeltable/utils/pydantic.py +60 -0
- pixeltable/utils/pytorch.py +5 -6
- pixeltable/utils/s3_store.py +527 -0
- pixeltable/utils/sql.py +26 -0
- pixeltable/utils/system.py +30 -0
- pixeltable-0.5.7.dist-info/METADATA +579 -0
- pixeltable-0.5.7.dist-info/RECORD +227 -0
- {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
- pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
- pixeltable/__version__.py +0 -3
- pixeltable/catalog/named_function.py +0 -40
- pixeltable/ext/__init__.py +0 -17
- pixeltable/ext/functions/__init__.py +0 -11
- pixeltable/ext/functions/whisperx.py +0 -77
- pixeltable/utils/media_store.py +0 -77
- pixeltable/utils/s3.py +0 -17
- pixeltable-0.3.14.dist-info/METADATA +0 -434
- pixeltable-0.3.14.dist-info/RECORD +0 -186
- pixeltable-0.3.14.dist-info/entry_points.txt +0 -3
- {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
|
@@ -0,0 +1,527 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import re
|
|
3
|
+
import threading
|
|
4
|
+
import urllib.parse
|
|
5
|
+
import uuid
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import TYPE_CHECKING, Any, Iterator, NamedTuple
|
|
8
|
+
|
|
9
|
+
import boto3
|
|
10
|
+
import botocore
|
|
11
|
+
import puremagic
|
|
12
|
+
from botocore.exceptions import ClientError, ConnectionError
|
|
13
|
+
|
|
14
|
+
from pixeltable import env, exceptions as excs
|
|
15
|
+
from pixeltable.config import Config
|
|
16
|
+
from pixeltable.utils.object_stores import ObjectPath, ObjectStoreBase, StorageObjectAddress, StorageTarget
|
|
17
|
+
|
|
18
|
+
if TYPE_CHECKING:
|
|
19
|
+
from botocore.exceptions import ClientError
|
|
20
|
+
|
|
21
|
+
from pixeltable.catalog import Column
|
|
22
|
+
|
|
23
|
+
# Shared module logger under the package-wide 'pixeltable' namespace.
_logger = logging.getLogger('pixeltable')

# Guards reads and lazy inserts into the per-service client caches
# (S3CompatClientDict.clients) that are shared across threads.
client_lock = threading.Lock()
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class S3CompatClientDict(NamedTuple):
|
|
29
|
+
"""Container for S3-compatible storage access objects (R2, B2, etc.).
|
|
30
|
+
Thread-safe via the module-level 'client_lock'.
|
|
31
|
+
"""
|
|
32
|
+
|
|
33
|
+
profile: str | None # AWS-style profile used to locate credentials
|
|
34
|
+
clients: dict[str, Any] # Map of endpoint URL → boto3 client instance
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# Client-cache factories registered with the environment.  Each factory returns
# an *empty* S3CompatClientDict: actual boto3 clients/resources are created
# lazily by S3Store and cached in the dict, keyed by endpoint URL (S3-compatible
# stores) or by region (plain S3).  The '<name>' entries hold low-level clients;
# the '<name>_resource' entries hold high-level resources.  The profile name is
# read from the Pixeltable config (e.g. 'r2_profile') at factory invocation.


@env.register_client('r2')
def _() -> Any:
    profile_name = Config.get().get_string_value('r2_profile')
    return S3CompatClientDict(profile=profile_name, clients={})


@env.register_client('r2_resource')
def _() -> Any:
    profile_name = Config.get().get_string_value('r2_profile')
    return S3CompatClientDict(profile=profile_name, clients={})


@env.register_client('b2')
def _() -> Any:
    profile_name = Config.get().get_string_value('b2_profile')
    return S3CompatClientDict(profile=profile_name, clients={})


@env.register_client('b2_resource')
def _() -> Any:
    profile_name = Config.get().get_string_value('b2_profile')
    return S3CompatClientDict(profile=profile_name, clients={})


@env.register_client('tigris')
def _() -> Any:
    profile_name = Config.get().get_string_value('tigris_profile')
    return S3CompatClientDict(profile=profile_name, clients={})


@env.register_client('tigris_resource')
def _() -> Any:
    profile_name = Config.get().get_string_value('tigris_profile')
    return S3CompatClientDict(profile=profile_name, clients={})


@env.register_client('s3')
def _() -> Any:
    profile_name = Config.get().get_string_value('s3_profile')
    return S3CompatClientDict(profile=profile_name, clients={})


@env.register_client('s3_resource')
def _() -> Any:
    profile_name = Config.get().get_string_value('s3_profile')
    return S3CompatClientDict(profile=profile_name, clients={})
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
class S3Store(ObjectStoreBase):
    """Object-store backend for S3 and S3-compatible targets (R2, B2, Tigris).

    Wraps a StorageObjectAddress and provides upload/download/list/delete
    operations via lazily created, cached boto3 clients and resources.
    """

    # URI of the S3 bucket in the format s3://bucket_name/prefix/
    # Always ends with a slash
    __base_uri: str

    # bucket name extracted from the URI
    __bucket_name: str

    # prefix path within the bucket, either empty or ending with a slash
    __prefix_name: str

    # Parsed storage address this store operates on.
    soa: StorageObjectAddress
|
|
99
|
+
|
|
100
|
+
def __init__(self, soa: StorageObjectAddress):
    """Initialize the store from a parsed storage address.

    Only S3-family targets (s3, r2, b2, tigris) are accepted.
    """
    supported_targets = {
        StorageTarget.R2_STORE,
        StorageTarget.S3_STORE,
        StorageTarget.B2_STORE,
        StorageTarget.TIGRIS_STORE,
    }
    self.soa = soa
    self.__bucket_name = soa.container
    self.__prefix_name = soa.prefix
    assert soa.storage_target in supported_targets, (
        f'Expected storage_target "s3", "r2", "b2", or "tigris", but got: {soa.storage_target}'
    )
    self.__base_uri = soa.prefix_free_uri + soa.prefix
|
|
111
|
+
|
|
112
|
+
def _get_s3_compat_client(self, client_name: str) -> Any:
    """Return the boto3 client for an S3-compatible endpoint, creating and caching it on first use."""
    endpoint = self.soa.container_free_uri
    cache = env.Env.get().get_client(client_name)
    with client_lock:
        client = cache.clients.get(endpoint)
        if client is None:
            client = S3Store.create_boto_client(
                profile_name=cache.profile, extra_args={'endpoint_url': endpoint, 'region_name': 'auto'}
            )
            cache.clients[endpoint] = client
        return client
|
|
122
|
+
|
|
123
|
+
def _get_s3_client_with_region(self) -> Any:
    """Return a boto3 S3 client configured for this bucket's region.

    Clients are cached per region (not per bucket): a client is bucket-agnostic,
    so one client per region can serve presigned URLs and API calls for any
    bucket in that region.  Falls back to the default client when the bucket's
    region cannot be determined.
    """
    cd = env.Env.get().get_client('s3')
    default_key = 'default'
    with client_lock:
        if default_key not in cd.clients:
            cd.clients[default_key] = S3Store.create_boto_client(profile_name=cd.profile)
        default_client = cd.clients[default_key]

    # Detect the bucket's region; on failure (e.g. missing permission), fall back
    # to the default client.  Done outside the lock: this is a network round-trip.
    try:
        bucket_location = default_client.get_bucket_location(Bucket=self.soa.container)
        region = bucket_location.get('LocationConstraint')
        if region is None:
            region = 'us-east-1'  # S3 reports us-east-1 as None
    except ClientError:
        return default_client

    # NOTE: _client_config is a private botocore attribute; there is no public accessor.
    if region == default_client._client_config.region_name:
        return default_client

    # Cache one client per region, reusing the default client's config with only the
    # region changed.  Fix: the cache check/insert is now guarded by client_lock —
    # cd.clients is shared across threads and the rest of this module always mutates
    # it under the lock.
    with client_lock:
        if region not in cd.clients:
            default_config = default_client._client_config
            session = self.create_boto_session(cd.profile)
            config = botocore.config.Config(
                max_pool_connections=default_config.max_pool_connections,
                connect_timeout=default_config.connect_timeout,
                read_timeout=default_config.read_timeout,
                retries=default_config.retries,
                signature_version=default_config.signature_version,
                s3=default_config.s3,
                user_agent_extra=default_config.user_agent_extra,
            )
            cd.clients[region] = session.client('s3', region_name=region, config=config)
        return cd.clients[region]
|
|
169
|
+
|
|
170
|
+
def client(self) -> Any:
    """Return a boto3 client to access the store.

    Client is the low-level API for direct AWS operations (e.g., download_file,
    generate_presigned_url).
    """
    target = self.soa.storage_target
    compat_client_names = {
        StorageTarget.R2_STORE: 'r2',
        StorageTarget.B2_STORE: 'b2',
        StorageTarget.TIGRIS_STORE: 'tigris',
    }
    if target in compat_client_names:
        return self._get_s3_compat_client(compat_client_names[target])
    if target == StorageTarget.S3_STORE:
        return self._get_s3_client_with_region()
    raise AssertionError(f'Unexpected storage_target: {target}')
|
|
184
|
+
|
|
185
|
+
def _get_s3_compat_resource(self, client_name: str) -> Any:
    """Return the boto3 resource for an S3-compatible endpoint, creating and caching it on first use."""
    endpoint = self.soa.container_free_uri
    cache = env.Env.get().get_client(client_name)
    with client_lock:
        resource = cache.clients.get(endpoint)
        if resource is None:
            resource = S3Store.create_boto_resource(
                profile_name=cache.profile, extra_args={'endpoint_url': endpoint, 'region_name': 'auto'}
            )
            cache.clients[endpoint] = resource
        return resource
|
|
195
|
+
|
|
196
|
+
def _get_s3_resource_with_region(self) -> Any:
    """Return a boto3 S3 resource configured for this bucket's region (cached per region).

    Mirrors _get_s3_client_with_region(): region detection goes through the
    resource's embedded low-level client, and the default resource is used when
    the region cannot be determined.
    """
    cd = env.Env.get().get_client('s3_resource')
    default_key = 'default'
    with client_lock:
        if default_key not in cd.clients:
            cd.clients[default_key] = S3Store.create_boto_resource(profile_name=cd.profile)
        default_resource = cd.clients[default_key]

    # Detect the bucket's region (network round-trip, kept outside the lock).
    try:
        bucket_location = default_resource.meta.client.get_bucket_location(Bucket=self.soa.container)
        region = bucket_location.get('LocationConstraint')
        if region is None:
            region = 'us-east-1'  # S3 reports us-east-1 as None
    except ClientError:
        return default_resource

    # NOTE: _client_config is a private botocore attribute; there is no public accessor.
    if region == default_resource.meta.client._client_config.region_name:
        return default_resource

    # Fix: guard the region-cache check/insert with client_lock — cd.clients is
    # shared across threads and is mutated under the lock everywhere else.
    with client_lock:
        if region not in cd.clients:
            session = self.create_boto_session(cd.profile)
            cd.clients[region] = session.resource('s3', region_name=region)
        return cd.clients[region]
|
|
227
|
+
|
|
228
|
+
def get_resource(self) -> Any:
    """Return a boto3 resource to access the store.

    Resource is the high-level object-oriented API for operations like
    filtering/iterating objects (e.g., bucket.objects.filter(),
    bucket.delete_objects()).
    """
    target = self.soa.storage_target
    compat_resource_names = {
        StorageTarget.R2_STORE: 'r2_resource',
        StorageTarget.B2_STORE: 'b2_resource',
        StorageTarget.TIGRIS_STORE: 'tigris_resource',
    }
    if target in compat_resource_names:
        return self._get_s3_compat_resource(compat_resource_names[target])
    if target == StorageTarget.S3_STORE:
        return self._get_s3_resource_with_region()
    raise AssertionError(f'Unexpected storage_target: {target}')
|
|
243
|
+
|
|
244
|
+
@property
def bucket_name(self) -> str:
    """Bucket (container) name extracted from the store's base URI."""
    name = self.__bucket_name
    return name
|
|
248
|
+
|
|
249
|
+
@property
def prefix(self) -> str:
    """Key prefix within the bucket: either empty or ending with a slash."""
    value = self.__prefix_name
    return value
|
|
253
|
+
|
|
254
|
+
def validate(self, error_col_name: str) -> str | None:
    """Check that the destination bucket exists and is accessible.

    Args:
        error_col_name: Column name to include in error messages.

    Returns:
        The validated base URI on success.  (The trailing `return None` is only
        for type-checkers: every failure path raises.)

    Raises:
        excs.Error: If the bucket is missing or inaccessible (via
            handle_s3_error) or the endpoint is unreachable.
    """
    # Fixed docstring: the previous one claimed a bool return, but this method
    # returns the base URI string (or raises) — callers rely on the URI.
    try:
        self.client().head_bucket(Bucket=self.bucket_name)
        return self.__base_uri
    except ClientError as e:
        self.handle_s3_error(e, f'validating destination for {error_col_name}')
    except ConnectionError as e:
        raise excs.Error(
            f'Connection error while validating destination {self.__base_uri!r} for {error_col_name}: {e}'
        ) from e
    return None
|
|
271
|
+
|
|
272
|
+
def _prepare_uri_raw(self, tbl_id: uuid.UUID, col_id: int, tbl_version: int, ext: str | None = None) -> str:
    """Construct a new, unique URI for a persisted media file.

    Args:
        tbl_id: ID of the table the file belongs to.
        col_id: ID of the column within the table.
        tbl_version: Table version the file is written for.
        ext: Optional file extension (including leading dot).

    Returns:
        A URI of the form '<base_uri><prefix>/<filename>'.
    """
    prefix, filename = ObjectPath.create_prefix_raw(tbl_id, col_id, tbl_version, ext)
    parent = f'{self.__base_uri}{prefix}'
    # Append the generated filename: previously it was unpacked but never used,
    # leaving the URI without a unique object name.
    return f'{parent}/{filename}'
|
|
279
|
+
|
|
280
|
+
def _prepare_uri(self, col: 'Column', ext: str | None = None) -> str:
    """Construct a new, unique URI for a media file persisted for `col`."""
    tbl = col.get_tbl()
    assert tbl is not None, 'Column must be associated with a table'
    return self._prepare_uri_raw(tbl.id, col.id, tbl.version, ext=ext)
|
|
286
|
+
|
|
287
|
+
def copy_object_to_local_file(self, src_path: str, dest_path: Path) -> None:
    """Download the object at `src_path` (relative to the store prefix) to `dest_path`. Thread safe."""
    full_key = self.prefix + src_path
    try:
        self.client().download_file(Bucket=self.bucket_name, Key=full_key, Filename=str(dest_path))
    except ClientError as e:
        self.handle_s3_error(e, f'downloading file {src_path!r}')
        raise
|
|
294
|
+
|
|
295
|
+
def copy_local_file(self, col: 'Column', src_path: Path) -> str:
    """Copy a local media file into the store for `col` and return its new URI.

    The destination key is derived from the column's table/version via
    _prepare_uri(); the file's MIME type is sniffed and stored as the object's
    Content-Type.
    """
    new_file_uri = self._prepare_uri(col, ext=src_path.suffix)
    parsed = urllib.parse.urlparse(new_file_uri)
    # Object key = URI path relative to the bucket root.
    key = parsed.path.lstrip('/')
    if self.soa.storage_target in {StorageTarget.R2_STORE, StorageTarget.B2_STORE, StorageTarget.TIGRIS_STORE}:
        # For endpoint-style URIs the bucket name is the first path segment;
        # strip it so the key is relative to the bucket.
        key = key.split('/', 1)[-1]  # Remove the bucket name from the key for R2/B2
    try:
        _logger.debug(f'Media Storage: copying {src_path} to {new_file_uri} : Key: {key}')
        # Sniff the MIME type from file contents so the object gets a usable Content-Type.
        # NOTE(review): puremagic can raise on unreadable/unrecognized files; only
        # ClientError is caught here — confirm callers tolerate that.
        content_type = puremagic.from_file(str(src_path), mime=True)
        extra_args = {'ContentType': content_type} if content_type is not None else None
        self.client().upload_file(Filename=str(src_path), Bucket=self.bucket_name, Key=key, ExtraArgs=extra_args)
        _logger.debug(f'Media Storage: copied {src_path} to {new_file_uri}')
        return new_file_uri
    except ClientError as e:
        self.handle_s3_error(e, 'uploading file')
        raise
|
|
312
|
+
|
|
313
|
+
def get_object_content_type(self, key: str) -> str | None:
    """Get the Content-Type of an object.

    Args:
        key: The object key (without bucket name)

    Returns:
        The Content-Type string, or None if not found
    """
    try:
        head = self.client().head_object(Bucket=self.bucket_name, Key=key)
    except ClientError:
        return None
    return head.get('ContentType')
|
|
327
|
+
|
|
328
|
+
def _get_filtered_objects(self, tbl_id: uuid.UUID, tbl_version: int | None = None) -> tuple[Iterator, Any]:
    """Private method to get filtered objects for a table, optionally filtered by version.

    Args:
        tbl_id: Table UUID to filter by
        tbl_version: Optional table version to filter by

    Returns:
        Tuple of (iterator over S3 objects matching the criteria, bucket object)
    """
    # Use ObjectPath to construct the prefix for this table
    table_prefix = ObjectPath.table_prefix(tbl_id)
    prefix = f'{self.prefix}{table_prefix}/'

    try:
        # Use S3 resource interface for filtering
        s3_resource = self.get_resource()
        bucket = s3_resource.Bucket(self.bucket_name)

        if tbl_version is None:
            # Return all objects with the table prefix
            object_iterator = bucket.objects.filter(Prefix=prefix)
        else:
            # Filter by both table_id and table_version using the ObjectPath pattern
            # Pattern: tbl_id_col_id_version_uuid
            # NOTE(review): assumes object filenames follow the
            # '<table_prefix>_<col_id>_<version>_<hex>' naming scheme produced by
            # ObjectPath — verify if that scheme ever changes.
            version_pattern = re.compile(
                rf'{re.escape(table_prefix)}_\d+_{re.escape(str(tbl_version))}_[0-9a-fA-F]+.*'
            )
            # Return filtered collection - this still uses lazy loading
            # (the generator pulls pages from bucket.objects.filter() on demand)
            object_iterator = (
                obj for obj in bucket.objects.filter(Prefix=prefix) if version_pattern.match(obj.key.split('/')[-1])
            )

        return object_iterator, bucket

    except ClientError as e:
        self.handle_s3_error(e, f'setting up iterator {self.prefix}')
        raise
|
|
366
|
+
|
|
367
|
+
def count(self, tbl_id: uuid.UUID, tbl_version: int | None = None) -> int:
    """Return the number of stored objects belonging to `tbl_id`.

    Args:
        tbl_id: Table UUID to count objects for
        tbl_version: If given, count only objects for this table version

    Returns:
        Number of objects matching the criteria
    """
    assert tbl_id is not None
    matching_objects, _ = self._get_filtered_objects(tbl_id, tbl_version)
    return sum(1 for _ in matching_objects)
|
|
383
|
+
|
|
384
|
+
def delete(self, tbl_id: uuid.UUID, tbl_version: int | None = None) -> int:
    """Delete all stored objects belonging to `tbl_id`.

    Args:
        tbl_id: Table UUID to delete objects for
        tbl_version: If given, delete only objects for this table version

    Returns:
        Number of objects deleted
    """
    assert tbl_id is not None

    # Shared helper yields the matching objects lazily plus the bucket handle.
    object_iterator, bucket = self._get_filtered_objects(tbl_id, tbl_version)

    deleted = 0
    batch: list[dict[str, str]] = []
    try:
        # Accumulate keys and flush in batches while iterating (memory efficient).
        for obj in object_iterator:
            batch.append({'Key': obj.key})
            if len(batch) >= 1000:  # S3 caps delete_objects at 1000 keys per request
                bucket.delete_objects(Delete={'Objects': batch, 'Quiet': True})
                deleted += len(batch)
                batch = []

        # Flush the final partial batch, if any.
        if batch:
            bucket.delete_objects(Delete={'Objects': batch, 'Quiet': True})
            deleted += len(batch)

        return deleted

    except ClientError as e:
        self.handle_s3_error(e, f'deleting with {self.prefix}')
        raise
|
|
425
|
+
|
|
426
|
+
def list_objects(self, return_uri: bool, n_max: int = 10) -> list[str]:
    """Return up to `n_max` objects found under this store's prefix.

    Each returned entry includes the full set of prefixes.  If `return_uri` is
    True, full URIs are returned; otherwise, just the object keys.
    """
    uri_base = self.soa.prefix_free_uri if return_uri else ''

    s3_client = self.client()
    found: list[str] = []
    try:
        # Paginate so buckets with more than 1000 objects are handled correctly.
        for page in s3_client.get_paginator('list_objects_v2').paginate(
            Bucket=self.bucket_name, Prefix=self.prefix
        ):
            for obj in page.get('Contents', ()):
                if len(found) >= n_max:
                    return found
                found.append(f'{uri_base}{obj["Key"]}')
    except ClientError as e:
        self.handle_s3_error(e, f'listing objects from {self.prefix!r}')
    return found
|
|
448
|
+
|
|
449
|
+
def handle_s3_error(self, e: 'ClientError', operation: str = '', *, ignore_404: bool = False) -> None:
    """Translate a botocore ClientError into an excs.Error.

    Returns normally only when `ignore_404` is set and the error is a 404;
    every other path raises.
    """
    error_info = e.response.get('Error', {})
    error_code = error_info.get('Code')
    error_message = error_info.get('Message', str(e))

    if error_code == '404':
        if ignore_404:
            return
        raise excs.Error(f'Client error while {operation}: Bucket {self.bucket_name!r} not found') from e
    if error_code == '403':
        raise excs.Error(
            f'Client error while {operation}: Access denied to bucket {self.bucket_name!r}: {error_message}'
        ) from e
    if error_code == 'PreconditionFailed' or 'PreconditionFailed' in error_message:
        raise excs.Error(
            f'Client error while {operation}: Precondition failed for bucket {self.bucket_name!r}: {error_message}'
        ) from e
    raise excs.Error(
        f'Client error while {operation} in bucket {self.bucket_name!r}: {error_code} - {error_message}'
    ) from e
|
|
468
|
+
|
|
469
|
+
@classmethod
def create_boto_session(cls, profile_name: str | None = None) -> Any:
    """Create a boto3 session for `profile_name`, falling back to the default session."""
    if profile_name:
        try:
            _logger.info(f'Creating boto session with profile {profile_name}')
            return boto3.Session(profile_name=profile_name)
        except Exception as e:
            # Profile lookup failed (e.g. profile not configured); fall through to defaults.
            _logger.info(f'Error occurred while creating boto session with profile {profile_name}: {e}')
    return boto3.Session()
|
|
480
|
+
|
|
481
|
+
@classmethod
def create_boto_client(cls, profile_name: str | None = None, extra_args: dict[str, Any] | None = None) -> Any:
    """Create a boto3 S3 client; falls back to anonymous (unsigned) mode without credentials.

    Args:
        profile_name: Optional AWS profile used to locate credentials.
        extra_args: Extra keyword arguments for the client constructor
            (e.g. endpoint_url, region_name for S3-compatible endpoints).
    """
    # Shared botocore config for both the signed and unsigned paths.
    config_args: dict[str, Any] = {
        'max_pool_connections': 30,
        'connect_timeout': 15,
        'read_timeout': 30,
        'retries': {'max_attempts': 3, 'mode': 'adaptive'},
        'signature_version': 's3v4',  # Explicitly use v4 signing for presigned URLs (top-level config parameter)
        's3': {'addressing_style': 'path'},  # Use path-style addressing for S3-compatible services
        'user_agent_extra': 'pixeltable',  # Marks requests as coming from Pixeltable for tracking and debugging
    }

    session = cls.create_boto_session(profile_name)

    try:
        # Check if credentials are available
        # (without credentials get_credentials() returns None, so the chained call
        # raises and routes us to the unsigned fallback below)
        session.get_credentials().get_frozen_credentials()
        config = botocore.config.Config(**config_args)
        return session.client('s3', config=config, **(extra_args or {}))  # credentials are available
    except Exception as e:
        _logger.info(f'Error occurred while creating S3 client: {e}, fallback to unsigned mode')
        # No credentials available, use unsigned mode
        # NOTE(review): extra_args (e.g. endpoint_url) are not applied on this path —
        # confirm unsigned access is only expected for plain (public) AWS S3 buckets.
        config_args = config_args.copy()
        config_args['signature_version'] = botocore.UNSIGNED
        config = botocore.config.Config(**config_args)
        return boto3.client('s3', config=config)
|
|
507
|
+
|
|
508
|
+
def create_presigned_url(self, soa: StorageObjectAddress, expiration_seconds: int) -> str:
    """Create a presigned URL for downloading an object from S3-compatible storage."""
    if not soa.has_object:
        raise excs.Error(f'StorageObjectAddress does not contain an object name: {soa}')

    # v4 signing is configured on the client; the URL expires after `expiration_seconds`
    return self.client().generate_presigned_url(
        'get_object',
        Params={'Bucket': soa.container, 'Key': soa.key},
        ExpiresIn=expiration_seconds,
        HttpMethod='GET',
    )
|
|
523
|
+
|
|
524
|
+
@classmethod
def create_boto_resource(cls, profile_name: str | None = None, extra_args: dict[str, Any] | None = None) -> Any:
    """Create a boto3 S3 resource backed by a session for the given profile."""
    session = cls.create_boto_session(profile_name)
    kwargs = extra_args or {}
    return session.resource('s3', **kwargs)
|
pixeltable/utils/sql.py
CHANGED
|
@@ -2,6 +2,7 @@ import logging
|
|
|
2
2
|
|
|
3
3
|
import sqlalchemy as sql
|
|
4
4
|
from sqlalchemy.dialects import postgresql
|
|
5
|
+
from sqlalchemy.engine import URL
|
|
5
6
|
|
|
6
7
|
|
|
7
8
|
def log_stmt(logger: logging.Logger, stmt: sql.sql.ClauseElement) -> None:
|
|
@@ -17,3 +18,28 @@ def log_explain(logger: logging.Logger, stmt: sql.sql.ClauseElement, conn: sql.e
|
|
|
17
18
|
logger.debug(f'SqlScanNode explain:\n{explain_str}')
|
|
18
19
|
except Exception:
|
|
19
20
|
logger.warning('EXPLAIN failed')
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def add_option_to_db_url(url: str | URL, option: str) -> URL:
    """Add a connection option to a database URL.

    Args:
        url: Database URL as string or SQLAlchemy URL object
        option: Option to add (e.g., '-c search_path=test_schema,public' or '-c timezone=UTC')

    Returns:
        Modified URL object with the option added to the query parameters
    """
    parsed = sql.make_url(url) if isinstance(url, str) else url

    # existing query params are immutable; copy them so we can update 'options'
    query: dict = dict(parsed.query) if parsed.query else {}
    current = query.get('options', '')
    # a query parameter may be a tuple when it appears multiple times in the URL
    parts = list(current) if isinstance(current, tuple) else current.split()
    parts.append(option)
    query['options'] = ' '.join(parts)

    return parsed.set(query=query)
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
|
|
4
|
+
_logger = logging.getLogger('pixeltable')
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def set_file_descriptor_limit(preferred_limit: int) -> None:
    """Checks and possibly updates the open file descriptor limit for the process.

    Note that there may be an OS-enforced upper bound on this limit, so this function may not always succeed in setting
    the preferred limit. It will log a warning and return normally in that case.
    """
    if os.name == 'nt':
        # Windows has no RLIMIT_NOFILE; nothing to adjust
        _logger.info('Skipping FD limit adjustment for Windows')
        return
    import resource  # POSIX-only module; imported lazily so this module loads on Windows

    soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
    _logger.info(f'Current RLIMIT_NOFILE soft limit: {soft_limit}, hard limit: {hard_limit}')
    if soft_limit < preferred_limit and soft_limit < hard_limit:
        new_limit = min(hard_limit, preferred_limit)
        _logger.info(f'Setting RLIMIT_NOFILE soft limit to: {new_limit}')
        try:
            resource.setrlimit(resource.RLIMIT_NOFILE, (new_limit, hard_limit))
            soft_limit = new_limit
        except (ValueError, OSError) as e:
            # setrlimit() can fail even for values <= the reported hard limit (e.g. on macOS,
            # where kern.maxfilesperproc may be lower); per the docstring, warn and continue
            # instead of propagating the exception
            _logger.warning(f'Failed to set RLIMIT_NOFILE soft limit to {new_limit}: {e}')

    if soft_limit < preferred_limit:
        _logger.warning(
            f'RLIMIT_NOFILE soft limit is {soft_limit}, which is less than the preferred {preferred_limit}. '
            'You may experience suboptimal network performance.'
        )
|