pixeltable 0.3.14__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220)
  1. pixeltable/__init__.py +42 -8
  2. pixeltable/{dataframe.py → _query.py} +470 -206
  3. pixeltable/_version.py +1 -0
  4. pixeltable/catalog/__init__.py +5 -4
  5. pixeltable/catalog/catalog.py +1785 -432
  6. pixeltable/catalog/column.py +190 -113
  7. pixeltable/catalog/dir.py +2 -4
  8. pixeltable/catalog/globals.py +19 -46
  9. pixeltable/catalog/insertable_table.py +191 -98
  10. pixeltable/catalog/path.py +63 -23
  11. pixeltable/catalog/schema_object.py +11 -15
  12. pixeltable/catalog/table.py +843 -436
  13. pixeltable/catalog/table_metadata.py +103 -0
  14. pixeltable/catalog/table_version.py +978 -657
  15. pixeltable/catalog/table_version_handle.py +72 -16
  16. pixeltable/catalog/table_version_path.py +112 -43
  17. pixeltable/catalog/tbl_ops.py +53 -0
  18. pixeltable/catalog/update_status.py +191 -0
  19. pixeltable/catalog/view.py +134 -90
  20. pixeltable/config.py +134 -22
  21. pixeltable/env.py +471 -157
  22. pixeltable/exceptions.py +6 -0
  23. pixeltable/exec/__init__.py +4 -1
  24. pixeltable/exec/aggregation_node.py +7 -8
  25. pixeltable/exec/cache_prefetch_node.py +83 -110
  26. pixeltable/exec/cell_materialization_node.py +268 -0
  27. pixeltable/exec/cell_reconstruction_node.py +168 -0
  28. pixeltable/exec/component_iteration_node.py +4 -3
  29. pixeltable/exec/data_row_batch.py +8 -65
  30. pixeltable/exec/exec_context.py +16 -4
  31. pixeltable/exec/exec_node.py +13 -36
  32. pixeltable/exec/expr_eval/evaluators.py +11 -7
  33. pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
  34. pixeltable/exec/expr_eval/globals.py +8 -5
  35. pixeltable/exec/expr_eval/row_buffer.py +1 -2
  36. pixeltable/exec/expr_eval/schedulers.py +106 -56
  37. pixeltable/exec/globals.py +35 -0
  38. pixeltable/exec/in_memory_data_node.py +19 -19
  39. pixeltable/exec/object_store_save_node.py +293 -0
  40. pixeltable/exec/row_update_node.py +16 -9
  41. pixeltable/exec/sql_node.py +351 -84
  42. pixeltable/exprs/__init__.py +1 -1
  43. pixeltable/exprs/arithmetic_expr.py +27 -22
  44. pixeltable/exprs/array_slice.py +3 -3
  45. pixeltable/exprs/column_property_ref.py +36 -23
  46. pixeltable/exprs/column_ref.py +213 -89
  47. pixeltable/exprs/comparison.py +5 -5
  48. pixeltable/exprs/compound_predicate.py +5 -4
  49. pixeltable/exprs/data_row.py +164 -54
  50. pixeltable/exprs/expr.py +70 -44
  51. pixeltable/exprs/expr_dict.py +3 -3
  52. pixeltable/exprs/expr_set.py +17 -10
  53. pixeltable/exprs/function_call.py +100 -40
  54. pixeltable/exprs/globals.py +2 -2
  55. pixeltable/exprs/in_predicate.py +4 -4
  56. pixeltable/exprs/inline_expr.py +18 -32
  57. pixeltable/exprs/is_null.py +7 -3
  58. pixeltable/exprs/json_mapper.py +8 -8
  59. pixeltable/exprs/json_path.py +56 -22
  60. pixeltable/exprs/literal.py +27 -5
  61. pixeltable/exprs/method_ref.py +2 -2
  62. pixeltable/exprs/object_ref.py +2 -2
  63. pixeltable/exprs/row_builder.py +167 -67
  64. pixeltable/exprs/rowid_ref.py +25 -10
  65. pixeltable/exprs/similarity_expr.py +58 -40
  66. pixeltable/exprs/sql_element_cache.py +4 -4
  67. pixeltable/exprs/string_op.py +5 -5
  68. pixeltable/exprs/type_cast.py +3 -5
  69. pixeltable/func/__init__.py +1 -0
  70. pixeltable/func/aggregate_function.py +8 -8
  71. pixeltable/func/callable_function.py +9 -9
  72. pixeltable/func/expr_template_function.py +17 -11
  73. pixeltable/func/function.py +18 -20
  74. pixeltable/func/function_registry.py +6 -7
  75. pixeltable/func/globals.py +2 -3
  76. pixeltable/func/mcp.py +74 -0
  77. pixeltable/func/query_template_function.py +29 -27
  78. pixeltable/func/signature.py +46 -19
  79. pixeltable/func/tools.py +31 -13
  80. pixeltable/func/udf.py +18 -20
  81. pixeltable/functions/__init__.py +16 -0
  82. pixeltable/functions/anthropic.py +123 -77
  83. pixeltable/functions/audio.py +147 -10
  84. pixeltable/functions/bedrock.py +13 -6
  85. pixeltable/functions/date.py +7 -4
  86. pixeltable/functions/deepseek.py +35 -43
  87. pixeltable/functions/document.py +81 -0
  88. pixeltable/functions/fal.py +76 -0
  89. pixeltable/functions/fireworks.py +11 -20
  90. pixeltable/functions/gemini.py +195 -39
  91. pixeltable/functions/globals.py +142 -14
  92. pixeltable/functions/groq.py +108 -0
  93. pixeltable/functions/huggingface.py +1056 -24
  94. pixeltable/functions/image.py +115 -57
  95. pixeltable/functions/json.py +1 -1
  96. pixeltable/functions/llama_cpp.py +28 -13
  97. pixeltable/functions/math.py +67 -5
  98. pixeltable/functions/mistralai.py +18 -55
  99. pixeltable/functions/net.py +70 -0
  100. pixeltable/functions/ollama.py +20 -13
  101. pixeltable/functions/openai.py +240 -226
  102. pixeltable/functions/openrouter.py +143 -0
  103. pixeltable/functions/replicate.py +4 -4
  104. pixeltable/functions/reve.py +250 -0
  105. pixeltable/functions/string.py +239 -69
  106. pixeltable/functions/timestamp.py +16 -16
  107. pixeltable/functions/together.py +24 -84
  108. pixeltable/functions/twelvelabs.py +188 -0
  109. pixeltable/functions/util.py +6 -1
  110. pixeltable/functions/uuid.py +30 -0
  111. pixeltable/functions/video.py +1515 -107
  112. pixeltable/functions/vision.py +8 -8
  113. pixeltable/functions/voyageai.py +289 -0
  114. pixeltable/functions/whisper.py +16 -8
  115. pixeltable/functions/whisperx.py +179 -0
  116. pixeltable/{ext/functions → functions}/yolox.py +2 -4
  117. pixeltable/globals.py +362 -115
  118. pixeltable/index/base.py +17 -21
  119. pixeltable/index/btree.py +28 -22
  120. pixeltable/index/embedding_index.py +100 -118
  121. pixeltable/io/__init__.py +4 -2
  122. pixeltable/io/datarows.py +8 -7
  123. pixeltable/io/external_store.py +56 -105
  124. pixeltable/io/fiftyone.py +13 -13
  125. pixeltable/io/globals.py +31 -30
  126. pixeltable/io/hf_datasets.py +61 -16
  127. pixeltable/io/label_studio.py +74 -70
  128. pixeltable/io/lancedb.py +3 -0
  129. pixeltable/io/pandas.py +21 -12
  130. pixeltable/io/parquet.py +25 -105
  131. pixeltable/io/table_data_conduit.py +250 -123
  132. pixeltable/io/utils.py +4 -4
  133. pixeltable/iterators/__init__.py +2 -1
  134. pixeltable/iterators/audio.py +26 -25
  135. pixeltable/iterators/base.py +9 -3
  136. pixeltable/iterators/document.py +112 -78
  137. pixeltable/iterators/image.py +12 -15
  138. pixeltable/iterators/string.py +11 -4
  139. pixeltable/iterators/video.py +523 -120
  140. pixeltable/metadata/__init__.py +14 -3
  141. pixeltable/metadata/converters/convert_13.py +2 -2
  142. pixeltable/metadata/converters/convert_18.py +2 -2
  143. pixeltable/metadata/converters/convert_19.py +2 -2
  144. pixeltable/metadata/converters/convert_20.py +2 -2
  145. pixeltable/metadata/converters/convert_21.py +2 -2
  146. pixeltable/metadata/converters/convert_22.py +2 -2
  147. pixeltable/metadata/converters/convert_24.py +2 -2
  148. pixeltable/metadata/converters/convert_25.py +2 -2
  149. pixeltable/metadata/converters/convert_26.py +2 -2
  150. pixeltable/metadata/converters/convert_29.py +4 -4
  151. pixeltable/metadata/converters/convert_30.py +34 -21
  152. pixeltable/metadata/converters/convert_34.py +2 -2
  153. pixeltable/metadata/converters/convert_35.py +9 -0
  154. pixeltable/metadata/converters/convert_36.py +38 -0
  155. pixeltable/metadata/converters/convert_37.py +15 -0
  156. pixeltable/metadata/converters/convert_38.py +39 -0
  157. pixeltable/metadata/converters/convert_39.py +124 -0
  158. pixeltable/metadata/converters/convert_40.py +73 -0
  159. pixeltable/metadata/converters/convert_41.py +12 -0
  160. pixeltable/metadata/converters/convert_42.py +9 -0
  161. pixeltable/metadata/converters/convert_43.py +44 -0
  162. pixeltable/metadata/converters/util.py +20 -31
  163. pixeltable/metadata/notes.py +9 -0
  164. pixeltable/metadata/schema.py +140 -53
  165. pixeltable/metadata/utils.py +74 -0
  166. pixeltable/mypy/__init__.py +3 -0
  167. pixeltable/mypy/mypy_plugin.py +123 -0
  168. pixeltable/plan.py +382 -115
  169. pixeltable/share/__init__.py +1 -1
  170. pixeltable/share/packager.py +547 -83
  171. pixeltable/share/protocol/__init__.py +33 -0
  172. pixeltable/share/protocol/common.py +165 -0
  173. pixeltable/share/protocol/operation_types.py +33 -0
  174. pixeltable/share/protocol/replica.py +119 -0
  175. pixeltable/share/publish.py +257 -59
  176. pixeltable/store.py +311 -194
  177. pixeltable/type_system.py +373 -211
  178. pixeltable/utils/__init__.py +2 -3
  179. pixeltable/utils/arrow.py +131 -17
  180. pixeltable/utils/av.py +298 -0
  181. pixeltable/utils/azure_store.py +346 -0
  182. pixeltable/utils/coco.py +6 -6
  183. pixeltable/utils/code.py +3 -3
  184. pixeltable/utils/console_output.py +4 -1
  185. pixeltable/utils/coroutine.py +6 -23
  186. pixeltable/utils/dbms.py +32 -6
  187. pixeltable/utils/description_helper.py +4 -5
  188. pixeltable/utils/documents.py +7 -18
  189. pixeltable/utils/exception_handler.py +7 -30
  190. pixeltable/utils/filecache.py +6 -6
  191. pixeltable/utils/formatter.py +86 -48
  192. pixeltable/utils/gcs_store.py +295 -0
  193. pixeltable/utils/http.py +133 -0
  194. pixeltable/utils/http_server.py +2 -3
  195. pixeltable/utils/iceberg.py +1 -2
  196. pixeltable/utils/image.py +17 -0
  197. pixeltable/utils/lancedb.py +90 -0
  198. pixeltable/utils/local_store.py +322 -0
  199. pixeltable/utils/misc.py +5 -0
  200. pixeltable/utils/object_stores.py +573 -0
  201. pixeltable/utils/pydantic.py +60 -0
  202. pixeltable/utils/pytorch.py +5 -6
  203. pixeltable/utils/s3_store.py +527 -0
  204. pixeltable/utils/sql.py +26 -0
  205. pixeltable/utils/system.py +30 -0
  206. pixeltable-0.5.7.dist-info/METADATA +579 -0
  207. pixeltable-0.5.7.dist-info/RECORD +227 -0
  208. {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
  209. pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
  210. pixeltable/__version__.py +0 -3
  211. pixeltable/catalog/named_function.py +0 -40
  212. pixeltable/ext/__init__.py +0 -17
  213. pixeltable/ext/functions/__init__.py +0 -11
  214. pixeltable/ext/functions/whisperx.py +0 -77
  215. pixeltable/utils/media_store.py +0 -77
  216. pixeltable/utils/s3.py +0 -17
  217. pixeltable-0.3.14.dist-info/METADATA +0 -434
  218. pixeltable-0.3.14.dist-info/RECORD +0 -186
  219. pixeltable-0.3.14.dist-info/entry_points.txt +0 -3
  220. {pixeltable-0.3.14.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0
pixeltable/utils/s3_store.py ADDED
@@ -0,0 +1,527 @@
+ import logging
+ import re
+ import threading
+ import urllib.parse
+ import uuid
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any, Iterator, NamedTuple
+
+ import boto3
+ import botocore
+ import puremagic
+ from botocore.exceptions import ClientError, ConnectionError
+
+ from pixeltable import env, exceptions as excs
+ from pixeltable.config import Config
+ from pixeltable.utils.object_stores import ObjectPath, ObjectStoreBase, StorageObjectAddress, StorageTarget
+
+ if TYPE_CHECKING:
+     from botocore.exceptions import ClientError
+
+     from pixeltable.catalog import Column
+
+ _logger = logging.getLogger('pixeltable')
+
+ client_lock = threading.Lock()
+
+
+ class S3CompatClientDict(NamedTuple):
+     """Container for S3-compatible storage access objects (R2, B2, etc.).
+     Thread-safe via the module-level 'client_lock'.
+     """
+
+     profile: str | None  # AWS-style profile used to locate credentials
+     clients: dict[str, Any]  # Map of endpoint URL → boto3 client instance
+
+
+ @env.register_client('r2')
+ def _() -> Any:
+     profile_name = Config.get().get_string_value('r2_profile')
+     return S3CompatClientDict(profile=profile_name, clients={})
+
+
+ @env.register_client('r2_resource')
+ def _() -> Any:
+     profile_name = Config.get().get_string_value('r2_profile')
+     return S3CompatClientDict(profile=profile_name, clients={})
+
+
+ @env.register_client('b2')
+ def _() -> Any:
+     profile_name = Config.get().get_string_value('b2_profile')
+     return S3CompatClientDict(profile=profile_name, clients={})
+
+
+ @env.register_client('b2_resource')
+ def _() -> Any:
+     profile_name = Config.get().get_string_value('b2_profile')
+     return S3CompatClientDict(profile=profile_name, clients={})
+
+
+ @env.register_client('tigris')
+ def _() -> Any:
+     profile_name = Config.get().get_string_value('tigris_profile')
+     return S3CompatClientDict(profile=profile_name, clients={})
+
+
+ @env.register_client('tigris_resource')
+ def _() -> Any:
+     profile_name = Config.get().get_string_value('tigris_profile')
+     return S3CompatClientDict(profile=profile_name, clients={})
+
+
+ @env.register_client('s3')
+ def _() -> Any:
+     profile_name = Config.get().get_string_value('s3_profile')
+     return S3CompatClientDict(profile=profile_name, clients={})
+
+
+ @env.register_client('s3_resource')
+ def _() -> Any:
+     profile_name = Config.get().get_string_value('s3_profile')
+     return S3CompatClientDict(profile=profile_name, clients={})
+
+
+ class S3Store(ObjectStoreBase):
+     """Wrapper for an S3 storage target with all needed methods."""
+
+     # URI of the S3 bucket in the format s3://bucket_name/prefix/
+     # Always ends with a slash
+     __base_uri: str
+
+     # bucket name extracted from the URI
+     __bucket_name: str
+
+     # prefix path within the bucket, either empty or ending with a slash
+     __prefix_name: str
+
+     soa: StorageObjectAddress
+
+     def __init__(self, soa: StorageObjectAddress):
+         self.soa = soa
+         self.__bucket_name = self.soa.container
+         self.__prefix_name = self.soa.prefix
+         assert self.soa.storage_target in {
+             StorageTarget.R2_STORE,
+             StorageTarget.S3_STORE,
+             StorageTarget.B2_STORE,
+             StorageTarget.TIGRIS_STORE,
+         }, f'Expected storage_target "s3", "r2", "b2", or "tigris", but got: {self.soa.storage_target}'
+         self.__base_uri = self.soa.prefix_free_uri + self.soa.prefix
+
+     def _get_s3_compat_client(self, client_name: str) -> Any:
+         """Helper to get S3-compatible client (R2, B2, Tigris) - caches per endpoint URI."""
+         cd = env.Env.get().get_client(client_name)
+         with client_lock:
+             if self.soa.container_free_uri not in cd.clients:
+                 cd.clients[self.soa.container_free_uri] = S3Store.create_boto_client(
+                     profile_name=cd.profile,
+                     extra_args={'endpoint_url': self.soa.container_free_uri, 'region_name': 'auto'},
+                 )
+             return cd.clients[self.soa.container_free_uri]
+
+     def _get_s3_client_with_region(self) -> Any:
+         """Helper to get S3 client with correct region - caches per region (not per bucket).
+
+         Clients are scoped to a region and are bucket-agnostic, allowing presigned URLs to be used
+         for any bucket in that region. The bucket name is just a parameter in API calls.
+         """
+         cd = env.Env.get().get_client('s3')
+         default_key = 'default'
+         with client_lock:
+             if default_key not in cd.clients:
+                 cd.clients[default_key] = S3Store.create_boto_client(profile_name=cd.profile)
+
+             default_client = cd.clients[default_key]
+
+             # Detect bucket region
+             try:
+                 bucket_location = default_client.get_bucket_location(Bucket=self.soa.container)
+                 region = bucket_location.get('LocationConstraint')
+                 if region is None:
+                     region = 'us-east-1'  # None means us-east-1
+             except ClientError:
+                 return default_client
+
+             # Check if default already has the correct region
+             client_region = default_client._client_config.region_name
+             if region == client_region:
+                 return default_client
+
+             # Cache per region (reusable for all buckets in that region)
+             # Reuse config from default client, just change the region
+             region_key = region
+             if region_key not in cd.clients:
+                 default_config = default_client._client_config
+                 session = self.create_boto_session(cd.profile)
+                 config = botocore.config.Config(
+                     max_pool_connections=default_config.max_pool_connections,
+                     connect_timeout=default_config.connect_timeout,
+                     read_timeout=default_config.read_timeout,
+                     retries=default_config.retries,
+                     signature_version=default_config.signature_version,
+                     s3=default_config.s3,
+                     user_agent_extra=default_config.user_agent_extra,
+                 )
+                 cd.clients[region_key] = session.client('s3', region_name=region, config=config)
+
+             return cd.clients[region_key]
+
+     def client(self) -> Any:
+         """Return a boto3 client to access the store.
+
+         Client is the low-level API for direct AWS operations (e.g., download_file, generate_presigned_url).
+         """
+         if self.soa.storage_target == StorageTarget.R2_STORE:
+             return self._get_s3_compat_client('r2')
+         if self.soa.storage_target == StorageTarget.B2_STORE:
+             return self._get_s3_compat_client('b2')
+         if self.soa.storage_target == StorageTarget.TIGRIS_STORE:
+             return self._get_s3_compat_client('tigris')
+         if self.soa.storage_target == StorageTarget.S3_STORE:
+             return self._get_s3_client_with_region()
+         raise AssertionError(f'Unexpected storage_target: {self.soa.storage_target}')
+
+     def _get_s3_compat_resource(self, client_name: str) -> Any:
+         """Helper to get S3-compatible resource (R2, B2, Tigris) - caches per endpoint URI."""
+         cd = env.Env.get().get_client(client_name)
+         with client_lock:
+             if self.soa.container_free_uri not in cd.clients:
+                 cd.clients[self.soa.container_free_uri] = S3Store.create_boto_resource(
+                     profile_name=cd.profile,
+                     extra_args={'endpoint_url': self.soa.container_free_uri, 'region_name': 'auto'},
+                 )
+             return cd.clients[self.soa.container_free_uri]
+
+     def _get_s3_resource_with_region(self) -> Any:
+         """Helper to get S3 resource with correct region - caches per region (not per bucket)."""
+         cd = env.Env.get().get_client('s3_resource')
+         default_key = 'default'
+         with client_lock:
+             if default_key not in cd.clients:
+                 cd.clients[default_key] = S3Store.create_boto_resource(profile_name=cd.profile)
+
+             default_resource = cd.clients[default_key]
+
+             # Detect bucket region using the resource's client
+             try:
+                 bucket_location = default_resource.meta.client.get_bucket_location(Bucket=self.soa.container)
+                 region = bucket_location.get('LocationConstraint')
+                 if region is None:
+                     region = 'us-east-1'
+             except ClientError:
+                 return default_resource
+
+             # Check if default resource already has the correct region
+             resource_region = default_resource.meta.client._client_config.region_name
+             if region == resource_region:
+                 return default_resource
+
+             # Cache resource per region (reusable for all buckets in that region)
+             region_key = region
+             if region_key not in cd.clients:
+                 session = self.create_boto_session(cd.profile)
+                 cd.clients[region_key] = session.resource('s3', region_name=region)
+
+             return cd.clients[region_key]
+
+     def get_resource(self) -> Any:
+         """Return a boto3 resource to access the store.
+
+         Resource is the high-level object-oriented API for operations like filtering/iterating objects
+         (e.g., bucket.objects.filter(), bucket.delete_objects()).
+         """
+         if self.soa.storage_target == StorageTarget.R2_STORE:
+             return self._get_s3_compat_resource('r2_resource')
+         if self.soa.storage_target == StorageTarget.B2_STORE:
+             return self._get_s3_compat_resource('b2_resource')
+         if self.soa.storage_target == StorageTarget.TIGRIS_STORE:
+             return self._get_s3_compat_resource('tigris_resource')
+         if self.soa.storage_target == StorageTarget.S3_STORE:
+             return self._get_s3_resource_with_region()
+         raise AssertionError(f'Unexpected storage_target: {self.soa.storage_target}')
+
+     @property
+     def bucket_name(self) -> str:
+         """Return the bucket name from the base URI."""
+         return self.__bucket_name
+
+     @property
+     def prefix(self) -> str:
+         """Return the prefix from the base URI."""
+         return self.__prefix_name
+
+     def validate(self, error_col_name: str) -> str | None:
+         """
+         Checks if the URI exists.
+
+         Returns:
+             The base URI if the S3 location exists and is accessible, None otherwise.
+         """
+         try:
+             self.client().head_bucket(Bucket=self.bucket_name)
+             return self.__base_uri
+         except ClientError as e:
+             self.handle_s3_error(e, f'validating destination for {error_col_name}')
+         except ConnectionError as e:
+             raise excs.Error(
+                 f'Connection error while validating destination {self.__base_uri!r} for {error_col_name}: {e}'
+             ) from e
+         return None
+
+     def _prepare_uri_raw(self, tbl_id: uuid.UUID, col_id: int, tbl_version: int, ext: str | None = None) -> str:
+         """
+         Construct a new, unique URI for a persisted media file.
+         """
+         prefix, filename = ObjectPath.create_prefix_raw(tbl_id, col_id, tbl_version, ext)
+         parent = f'{self.__base_uri}{prefix}'
+         return f'{parent}/{filename}'
+
+     def _prepare_uri(self, col: 'Column', ext: str | None = None) -> str:
+         """
+         Construct a new, unique URI for a persisted media file.
+         """
+         assert col.get_tbl() is not None, 'Column must be associated with a table'
+         return self._prepare_uri_raw(col.get_tbl().id, col.id, col.get_tbl().version, ext=ext)
+
+     def copy_object_to_local_file(self, src_path: str, dest_path: Path) -> None:
+         """Copies an object to a local file. Thread safe."""
+         try:
+             self.client().download_file(Bucket=self.bucket_name, Key=self.prefix + src_path, Filename=str(dest_path))
+         except ClientError as e:
+             self.handle_s3_error(e, f'downloading file {src_path!r}')
+             raise
+
+     def copy_local_file(self, col: 'Column', src_path: Path) -> str:
+         """Copy a local file, and return its new URL."""
+         new_file_uri = self._prepare_uri(col, ext=src_path.suffix)
+         parsed = urllib.parse.urlparse(new_file_uri)
+         key = parsed.path.lstrip('/')
+         if self.soa.storage_target in {StorageTarget.R2_STORE, StorageTarget.B2_STORE, StorageTarget.TIGRIS_STORE}:
+             key = key.split('/', 1)[-1]  # Remove the bucket name from the key for R2/B2/Tigris
+         try:
+             _logger.debug(f'Media Storage: copying {src_path} to {new_file_uri} : Key: {key}')
+             content_type = puremagic.from_file(str(src_path), mime=True)
+             extra_args = {'ContentType': content_type} if content_type is not None else None
+             self.client().upload_file(Filename=str(src_path), Bucket=self.bucket_name, Key=key, ExtraArgs=extra_args)
+             _logger.debug(f'Media Storage: copied {src_path} to {new_file_uri}')
+             return new_file_uri
+         except ClientError as e:
+             self.handle_s3_error(e, 'uploading file')
+             raise
+
+     def get_object_content_type(self, key: str) -> str | None:
+         """Get the Content-Type of an object.
+
+         Args:
+             key: The object key (without bucket name)
+
+         Returns:
+             The Content-Type string, or None if not found
+         """
+         try:
+             response = self.client().head_object(Bucket=self.bucket_name, Key=key)
+             return response.get('ContentType')
+         except ClientError:
+             return None
+
+     def _get_filtered_objects(self, tbl_id: uuid.UUID, tbl_version: int | None = None) -> tuple[Iterator, Any]:
+         """Private method to get filtered objects for a table, optionally filtered by version.
+
+         Args:
+             tbl_id: Table UUID to filter by
+             tbl_version: Optional table version to filter by
+
+         Returns:
+             Tuple of (iterator over S3 objects matching the criteria, bucket object)
+         """
+         # Use ObjectPath to construct the prefix for this table
+         table_prefix = ObjectPath.table_prefix(tbl_id)
+         prefix = f'{self.prefix}{table_prefix}/'
+
+         try:
+             # Use S3 resource interface for filtering
+             s3_resource = self.get_resource()
+             bucket = s3_resource.Bucket(self.bucket_name)
+
+             if tbl_version is None:
+                 # Return all objects with the table prefix
+                 object_iterator = bucket.objects.filter(Prefix=prefix)
+             else:
+                 # Filter by both table_id and table_version using the ObjectPath pattern
+                 # Pattern: tbl_id_col_id_version_uuid
+                 version_pattern = re.compile(
+                     rf'{re.escape(table_prefix)}_\d+_{re.escape(str(tbl_version))}_[0-9a-fA-F]+.*'
+                 )
+                 # Return filtered collection - this still uses lazy loading
+                 object_iterator = (
+                     obj for obj in bucket.objects.filter(Prefix=prefix) if version_pattern.match(obj.key.split('/')[-1])
+                 )
+
+             return object_iterator, bucket
+
+         except ClientError as e:
+             self.handle_s3_error(e, f'setting up iterator {self.prefix}')
+             raise
+
+     def count(self, tbl_id: uuid.UUID, tbl_version: int | None = None) -> int:
+         """Count the number of files belonging to tbl_id. If tbl_version is not None,
+         count only those files belonging to the specified tbl_version.
+
+         Args:
+             tbl_id: Table UUID to count objects for
+             tbl_version: Optional table version to filter by
+
+         Returns:
+             Number of objects matching the criteria
+         """
+         assert tbl_id is not None
+
+         object_iterator, _ = self._get_filtered_objects(tbl_id, tbl_version)
+
+         return sum(1 for _ in object_iterator)
+
+     def delete(self, tbl_id: uuid.UUID, tbl_version: int | None = None) -> int:
+         """Delete all files belonging to tbl_id. If tbl_version is not None, delete
+         only those files belonging to the specified tbl_version.
+
+         Args:
+             tbl_id: Table UUID to delete objects for
+             tbl_version: Optional table version to filter by
+
+         Returns:
+             Number of objects deleted
+         """
+         assert tbl_id is not None
+
+         # Use shared method to get filtered objects and bucket
+         object_iterator, bucket = self._get_filtered_objects(tbl_id, tbl_version)
+
+         total_deleted = 0
+
+         try:
+             objects_to_delete = []
+
+             # Process objects in batches as we iterate (memory efficient)
+             for obj in object_iterator:
+                 objects_to_delete.append({'Key': obj.key})
+
+                 # Delete in batches of 1000 (S3 limit)
+                 if len(objects_to_delete) >= 1000:
+                     bucket.delete_objects(Delete={'Objects': objects_to_delete, 'Quiet': True})
+                     total_deleted += len(objects_to_delete)
+                     objects_to_delete = []
+
+             # Delete any remaining objects in the final batch
+             if len(objects_to_delete) > 0:
+                 bucket.delete_objects(Delete={'Objects': objects_to_delete, 'Quiet': True})
+                 total_deleted += len(objects_to_delete)
+
+             return total_deleted
+
+         except ClientError as e:
+             self.handle_s3_error(e, f'deleting with {self.prefix}')
+             raise
+
+     def list_objects(self, return_uri: bool, n_max: int = 10) -> list[str]:
+         """Return a list of objects found in the specified destination bucket.
+         Each returned object includes the full set of prefixes.
+         If return_uri is True, full URIs are returned; otherwise, just the object keys.
+         """
+         p = self.soa.prefix_free_uri if return_uri else ''
+
+         s3_client = self.client()
+         r: list[str] = []
+         try:
+             # Use paginator to handle more than 1000 objects
+             paginator = s3_client.get_paginator('list_objects_v2')
+             for page in paginator.paginate(Bucket=self.bucket_name, Prefix=self.prefix):
+                 if 'Contents' not in page:
+                     continue
+                 for obj in page['Contents']:
+                     if len(r) >= n_max:
+                         return r
+                     r.append(f'{p}{obj["Key"]}')
+         except ClientError as e:
+             self.handle_s3_error(e, f'listing objects from {self.prefix!r}')
+         return r
+
+     def handle_s3_error(self, e: 'ClientError', operation: str = '', *, ignore_404: bool = False) -> None:
+         error_code = e.response.get('Error', {}).get('Code')
+         error_message = e.response.get('Error', {}).get('Message', str(e))
+         if ignore_404 and error_code == '404':
+             return
+         if error_code == '404':
+             raise excs.Error(f'Client error while {operation}: Bucket {self.bucket_name!r} not found') from e
+         elif error_code == '403':
+             raise excs.Error(
+                 f'Client error while {operation}: Access denied to bucket {self.bucket_name!r}: {error_message}'
+             ) from e
+         elif error_code == 'PreconditionFailed' or 'PreconditionFailed' in error_message:
+             raise excs.Error(
+                 f'Client error while {operation}: Precondition failed for bucket {self.bucket_name!r}: {error_message}'
+             ) from e
+         else:
+             raise excs.Error(
+                 f'Client error while {operation} in bucket {self.bucket_name!r}: {error_code} - {error_message}'
+             ) from e
+
+     @classmethod
+     def create_boto_session(cls, profile_name: str | None = None) -> Any:
+         """Create a boto session using the defined profile"""
+         if profile_name:
+             try:
+                 _logger.info(f'Creating boto session with profile {profile_name}')
+                 session = boto3.Session(profile_name=profile_name)
+                 return session
+             except Exception as e:
+                 _logger.info(f'Error occurred while creating boto session with profile {profile_name}: {e}')
+         return boto3.Session()
+
+     @classmethod
+     def create_boto_client(cls, profile_name: str | None = None, extra_args: dict[str, Any] | None = None) -> Any:
+         config_args: dict[str, Any] = {
+             'max_pool_connections': 30,
+             'connect_timeout': 15,
+             'read_timeout': 30,
+             'retries': {'max_attempts': 3, 'mode': 'adaptive'},
+             'signature_version': 's3v4',  # Explicitly use v4 signing for presigned URLs (top-level config parameter)
+             's3': {'addressing_style': 'path'},  # Use path-style addressing for S3-compatible services
+             'user_agent_extra': 'pixeltable',  # Marks requests as coming from Pixeltable for tracking and debugging
+         }
+
+         session = cls.create_boto_session(profile_name)
+
+         try:
+             # Check if credentials are available
+             session.get_credentials().get_frozen_credentials()
+             config = botocore.config.Config(**config_args)
+             return session.client('s3', config=config, **(extra_args or {}))  # credentials are available
+         except Exception as e:
+             _logger.info(f'Error occurred while creating S3 client: {e}, fallback to unsigned mode')
+             # No credentials available, use unsigned mode
+             config_args = config_args.copy()
+             config_args['signature_version'] = botocore.UNSIGNED
+             config = botocore.config.Config(**config_args)
+             return boto3.client('s3', config=config)
+
+     def create_presigned_url(self, soa: StorageObjectAddress, expiration_seconds: int) -> str:
+         """Create a presigned URL for downloading an object from S3-compatible storage."""
+         if not soa.has_object:
+             raise excs.Error(f'StorageObjectAddress does not contain an object name: {soa}')
+
+         s3_client = self.client()
+
+         # Generate presigned URL with v4 signing
+         presigned_url = s3_client.generate_presigned_url(
+             'get_object',
+             Params={'Bucket': soa.container, 'Key': soa.key},
+             ExpiresIn=expiration_seconds,
+             HttpMethod='GET',
+         )
+         return presigned_url
+
+     @classmethod
+     def create_boto_resource(cls, profile_name: str | None = None, extra_args: dict[str, Any] | None = None) -> Any:
+         # Create a session using the defined profile
+         return cls.create_boto_session(profile_name).resource('s3', **(extra_args or {}))
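Editorial note (not part of the diff): the per-region caching that _get_s3_client_with_region documents above (detect a bucket's region once, then reuse one bucket-agnostic client per region) can be sketched in isolation with plain boto3. The names _clients and s3_client_for_bucket below are illustrative, not Pixeltable APIs.

    import boto3
    import botocore.exceptions

    _clients: dict[str, object] = {}  # region name -> cached boto3 S3 client

    def s3_client_for_bucket(bucket: str) -> object:
        """Return an S3 client pinned to the bucket's region, caching one client per region."""
        if 'default' not in _clients:
            _clients['default'] = boto3.Session().client('s3')
        default = _clients['default']
        try:
            # get_bucket_location() reports None for us-east-1
            region = default.get_bucket_location(Bucket=bucket).get('LocationConstraint') or 'us-east-1'
        except botocore.exceptions.ClientError:
            return default  # region lookup failed; fall back to the default client
        if region not in _clients:
            _clients[region] = boto3.Session().client('s3', region_name=region)
        return _clients[region]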
pixeltable/utils/sql.py CHANGED
@@ -2,6 +2,7 @@ import logging
 
   import sqlalchemy as sql
   from sqlalchemy.dialects import postgresql
+ from sqlalchemy.engine import URL
 
 
   def log_stmt(logger: logging.Logger, stmt: sql.sql.ClauseElement) -> None:
@@ -17,3 +18,28 @@ def log_explain(logger: logging.Logger, stmt: sql.sql.ClauseElement, conn: sql.e
           logger.debug(f'SqlScanNode explain:\n{explain_str}')
       except Exception:
           logger.warning('EXPLAIN failed')
+
+
+ def add_option_to_db_url(url: str | URL, option: str) -> URL:
+     """Add a connection option to a database URL.
+
+     Args:
+         url: Database URL as string or SQLAlchemy URL object
+         option: Option to add (e.g., '-c search_path=test_schema,public' or '-c timezone=UTC')
+
+     Returns:
+         Modified URL object with the option added to the query parameters
+     """
+     db_url = sql.make_url(url) if isinstance(url, str) else url
+
+     # Get existing options and parse them
+     # Query parameters can be strings or tuples (if multiple values exist)
+     existing_options_raw = db_url.query.get('options', '') if db_url.query else ''
+     option_parts = (
+         list(existing_options_raw) if isinstance(existing_options_raw, tuple) else existing_options_raw.split()
+     )
+     option_parts.append(option)
+     options_str = ' '.join(option_parts)
+
+     # Create new URL with updated options
+     return db_url.set(query={**(dict(db_url.query) if db_url.query else {}), 'options': options_str})
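Usage note (not part of the diff): the new add_option_to_db_url helper appends to any existing 'options' query parameter, so calls can be chained to accumulate multiple '-c' settings. The connection string below is illustrative only.

    from pixeltable.utils.sql import add_option_to_db_url

    # Hypothetical DSN, for illustration only
    url = add_option_to_db_url('postgresql+psycopg://user:pw@localhost:5432/pxt', '-c search_path=test_schema,public')
    url = add_option_to_db_url(url, '-c timezone=UTC')
    print(url.query['options'])  # '-c search_path=test_schema,public -c timezone=UTC'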
pixeltable/utils/system.py ADDED
@@ -0,0 +1,30 @@
+ import logging
+ import os
+
+ _logger = logging.getLogger('pixeltable')
+
+
+ def set_file_descriptor_limit(preferred_limit: int) -> None:
+     """Checks and possibly updates the open file descriptor limit for the process.
+
+     Note that there may be an OS-enforced upper bound on this limit, so this function may not always succeed in setting
+     the preferred limit. It will log a warning and return normally in that case.
+     """
+     if os.name == 'nt':
+         _logger.info('Skipping FD limit adjustment for Windows')
+         return
+     import resource
+
+     soft_limit, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
+     _logger.info(f'Current RLIMIT_NOFILE soft limit: {soft_limit}, hard limit: {hard_limit}')
+     if soft_limit < preferred_limit and soft_limit < hard_limit:
+         new_limit = min(hard_limit, preferred_limit)
+         _logger.info(f'Setting RLIMIT_NOFILE soft limit to: {new_limit}')
+         resource.setrlimit(resource.RLIMIT_NOFILE, (new_limit, hard_limit))
+         soft_limit = new_limit
+
+     if soft_limit < preferred_limit:
+         _logger.warning(
+             f'RLIMIT_NOFILE soft limit is {soft_limit}, which is less than the preferred {preferred_limit}. '
+             'You may experience suboptimal network performance.'
+         )
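Usage note (not part of the diff): assuming the 30-line module above is the new pixeltable/utils/system.py listed in the file table, a minimal sketch of how it could be exercised on a POSIX system:

    import resource

    from pixeltable.utils.system import set_file_descriptor_limit

    # Request a higher soft limit; capped at the hard limit, and a no-op on Windows
    set_file_descriptor_limit(4096)
    print(resource.getrlimit(resource.RLIMIT_NOFILE))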