pixeltable 0.4.0rc3__py3-none-any.whl → 0.4.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of pixeltable might be problematic.

Files changed (202)
  1. pixeltable/__init__.py +23 -5
  2. pixeltable/_version.py +1 -0
  3. pixeltable/catalog/__init__.py +5 -3
  4. pixeltable/catalog/catalog.py +1318 -404
  5. pixeltable/catalog/column.py +186 -115
  6. pixeltable/catalog/dir.py +1 -2
  7. pixeltable/catalog/globals.py +11 -43
  8. pixeltable/catalog/insertable_table.py +167 -79
  9. pixeltable/catalog/path.py +61 -23
  10. pixeltable/catalog/schema_object.py +9 -10
  11. pixeltable/catalog/table.py +626 -308
  12. pixeltable/catalog/table_metadata.py +101 -0
  13. pixeltable/catalog/table_version.py +713 -569
  14. pixeltable/catalog/table_version_handle.py +37 -6
  15. pixeltable/catalog/table_version_path.py +42 -29
  16. pixeltable/catalog/tbl_ops.py +50 -0
  17. pixeltable/catalog/update_status.py +191 -0
  18. pixeltable/catalog/view.py +108 -94
  19. pixeltable/config.py +128 -22
  20. pixeltable/dataframe.py +188 -100
  21. pixeltable/env.py +407 -136
  22. pixeltable/exceptions.py +6 -0
  23. pixeltable/exec/__init__.py +3 -0
  24. pixeltable/exec/aggregation_node.py +7 -8
  25. pixeltable/exec/cache_prefetch_node.py +83 -110
  26. pixeltable/exec/cell_materialization_node.py +231 -0
  27. pixeltable/exec/cell_reconstruction_node.py +135 -0
  28. pixeltable/exec/component_iteration_node.py +4 -3
  29. pixeltable/exec/data_row_batch.py +8 -65
  30. pixeltable/exec/exec_context.py +16 -4
  31. pixeltable/exec/exec_node.py +13 -36
  32. pixeltable/exec/expr_eval/evaluators.py +7 -6
  33. pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
  34. pixeltable/exec/expr_eval/globals.py +8 -5
  35. pixeltable/exec/expr_eval/row_buffer.py +1 -2
  36. pixeltable/exec/expr_eval/schedulers.py +190 -30
  37. pixeltable/exec/globals.py +32 -0
  38. pixeltable/exec/in_memory_data_node.py +18 -18
  39. pixeltable/exec/object_store_save_node.py +293 -0
  40. pixeltable/exec/row_update_node.py +16 -9
  41. pixeltable/exec/sql_node.py +206 -101
  42. pixeltable/exprs/__init__.py +1 -1
  43. pixeltable/exprs/arithmetic_expr.py +27 -22
  44. pixeltable/exprs/array_slice.py +3 -3
  45. pixeltable/exprs/column_property_ref.py +34 -30
  46. pixeltable/exprs/column_ref.py +92 -96
  47. pixeltable/exprs/comparison.py +5 -5
  48. pixeltable/exprs/compound_predicate.py +5 -4
  49. pixeltable/exprs/data_row.py +152 -55
  50. pixeltable/exprs/expr.py +62 -43
  51. pixeltable/exprs/expr_dict.py +3 -3
  52. pixeltable/exprs/expr_set.py +17 -10
  53. pixeltable/exprs/function_call.py +75 -37
  54. pixeltable/exprs/globals.py +1 -2
  55. pixeltable/exprs/in_predicate.py +4 -4
  56. pixeltable/exprs/inline_expr.py +10 -27
  57. pixeltable/exprs/is_null.py +1 -3
  58. pixeltable/exprs/json_mapper.py +8 -8
  59. pixeltable/exprs/json_path.py +56 -22
  60. pixeltable/exprs/literal.py +5 -5
  61. pixeltable/exprs/method_ref.py +2 -2
  62. pixeltable/exprs/object_ref.py +2 -2
  63. pixeltable/exprs/row_builder.py +127 -53
  64. pixeltable/exprs/rowid_ref.py +8 -12
  65. pixeltable/exprs/similarity_expr.py +50 -25
  66. pixeltable/exprs/sql_element_cache.py +4 -4
  67. pixeltable/exprs/string_op.py +5 -5
  68. pixeltable/exprs/type_cast.py +3 -5
  69. pixeltable/func/__init__.py +1 -0
  70. pixeltable/func/aggregate_function.py +8 -8
  71. pixeltable/func/callable_function.py +9 -9
  72. pixeltable/func/expr_template_function.py +10 -10
  73. pixeltable/func/function.py +18 -20
  74. pixeltable/func/function_registry.py +6 -7
  75. pixeltable/func/globals.py +2 -3
  76. pixeltable/func/mcp.py +74 -0
  77. pixeltable/func/query_template_function.py +20 -18
  78. pixeltable/func/signature.py +43 -16
  79. pixeltable/func/tools.py +23 -13
  80. pixeltable/func/udf.py +18 -20
  81. pixeltable/functions/__init__.py +6 -0
  82. pixeltable/functions/anthropic.py +93 -33
  83. pixeltable/functions/audio.py +114 -10
  84. pixeltable/functions/bedrock.py +13 -6
  85. pixeltable/functions/date.py +1 -1
  86. pixeltable/functions/deepseek.py +20 -9
  87. pixeltable/functions/fireworks.py +2 -2
  88. pixeltable/functions/gemini.py +28 -11
  89. pixeltable/functions/globals.py +13 -13
  90. pixeltable/functions/groq.py +108 -0
  91. pixeltable/functions/huggingface.py +1046 -23
  92. pixeltable/functions/image.py +9 -18
  93. pixeltable/functions/llama_cpp.py +23 -8
  94. pixeltable/functions/math.py +3 -4
  95. pixeltable/functions/mistralai.py +4 -15
  96. pixeltable/functions/ollama.py +16 -9
  97. pixeltable/functions/openai.py +104 -82
  98. pixeltable/functions/openrouter.py +143 -0
  99. pixeltable/functions/replicate.py +2 -2
  100. pixeltable/functions/reve.py +250 -0
  101. pixeltable/functions/string.py +21 -28
  102. pixeltable/functions/timestamp.py +13 -14
  103. pixeltable/functions/together.py +4 -6
  104. pixeltable/functions/twelvelabs.py +92 -0
  105. pixeltable/functions/util.py +6 -1
  106. pixeltable/functions/video.py +1388 -106
  107. pixeltable/functions/vision.py +7 -7
  108. pixeltable/functions/whisper.py +15 -7
  109. pixeltable/functions/whisperx.py +179 -0
  110. pixeltable/{ext/functions → functions}/yolox.py +2 -4
  111. pixeltable/globals.py +332 -105
  112. pixeltable/index/base.py +13 -22
  113. pixeltable/index/btree.py +23 -22
  114. pixeltable/index/embedding_index.py +32 -44
  115. pixeltable/io/__init__.py +4 -2
  116. pixeltable/io/datarows.py +7 -6
  117. pixeltable/io/external_store.py +49 -77
  118. pixeltable/io/fiftyone.py +11 -11
  119. pixeltable/io/globals.py +29 -28
  120. pixeltable/io/hf_datasets.py +17 -9
  121. pixeltable/io/label_studio.py +70 -66
  122. pixeltable/io/lancedb.py +3 -0
  123. pixeltable/io/pandas.py +12 -11
  124. pixeltable/io/parquet.py +13 -93
  125. pixeltable/io/table_data_conduit.py +71 -47
  126. pixeltable/io/utils.py +3 -3
  127. pixeltable/iterators/__init__.py +2 -1
  128. pixeltable/iterators/audio.py +21 -11
  129. pixeltable/iterators/document.py +116 -55
  130. pixeltable/iterators/image.py +5 -2
  131. pixeltable/iterators/video.py +293 -13
  132. pixeltable/metadata/__init__.py +4 -2
  133. pixeltable/metadata/converters/convert_18.py +2 -2
  134. pixeltable/metadata/converters/convert_19.py +2 -2
  135. pixeltable/metadata/converters/convert_20.py +2 -2
  136. pixeltable/metadata/converters/convert_21.py +2 -2
  137. pixeltable/metadata/converters/convert_22.py +2 -2
  138. pixeltable/metadata/converters/convert_24.py +2 -2
  139. pixeltable/metadata/converters/convert_25.py +2 -2
  140. pixeltable/metadata/converters/convert_26.py +2 -2
  141. pixeltable/metadata/converters/convert_29.py +4 -4
  142. pixeltable/metadata/converters/convert_34.py +2 -2
  143. pixeltable/metadata/converters/convert_36.py +2 -2
  144. pixeltable/metadata/converters/convert_37.py +15 -0
  145. pixeltable/metadata/converters/convert_38.py +39 -0
  146. pixeltable/metadata/converters/convert_39.py +124 -0
  147. pixeltable/metadata/converters/convert_40.py +73 -0
  148. pixeltable/metadata/converters/util.py +13 -12
  149. pixeltable/metadata/notes.py +4 -0
  150. pixeltable/metadata/schema.py +79 -42
  151. pixeltable/metadata/utils.py +74 -0
  152. pixeltable/mypy/__init__.py +3 -0
  153. pixeltable/mypy/mypy_plugin.py +123 -0
  154. pixeltable/plan.py +274 -223
  155. pixeltable/share/__init__.py +1 -1
  156. pixeltable/share/packager.py +259 -129
  157. pixeltable/share/protocol/__init__.py +34 -0
  158. pixeltable/share/protocol/common.py +170 -0
  159. pixeltable/share/protocol/operation_types.py +33 -0
  160. pixeltable/share/protocol/replica.py +109 -0
  161. pixeltable/share/publish.py +213 -57
  162. pixeltable/store.py +238 -175
  163. pixeltable/type_system.py +104 -63
  164. pixeltable/utils/__init__.py +2 -3
  165. pixeltable/utils/arrow.py +108 -13
  166. pixeltable/utils/av.py +298 -0
  167. pixeltable/utils/azure_store.py +305 -0
  168. pixeltable/utils/code.py +3 -3
  169. pixeltable/utils/console_output.py +4 -1
  170. pixeltable/utils/coroutine.py +6 -23
  171. pixeltable/utils/dbms.py +31 -5
  172. pixeltable/utils/description_helper.py +4 -5
  173. pixeltable/utils/documents.py +5 -6
  174. pixeltable/utils/exception_handler.py +7 -30
  175. pixeltable/utils/filecache.py +6 -6
  176. pixeltable/utils/formatter.py +4 -6
  177. pixeltable/utils/gcs_store.py +283 -0
  178. pixeltable/utils/http_server.py +2 -3
  179. pixeltable/utils/iceberg.py +1 -2
  180. pixeltable/utils/image.py +17 -0
  181. pixeltable/utils/lancedb.py +88 -0
  182. pixeltable/utils/local_store.py +316 -0
  183. pixeltable/utils/misc.py +5 -0
  184. pixeltable/utils/object_stores.py +528 -0
  185. pixeltable/utils/pydantic.py +60 -0
  186. pixeltable/utils/pytorch.py +5 -6
  187. pixeltable/utils/s3_store.py +392 -0
  188. pixeltable-0.4.20.dist-info/METADATA +587 -0
  189. pixeltable-0.4.20.dist-info/RECORD +218 -0
  190. {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info}/WHEEL +1 -1
  191. pixeltable-0.4.20.dist-info/entry_points.txt +2 -0
  192. pixeltable/__version__.py +0 -3
  193. pixeltable/ext/__init__.py +0 -17
  194. pixeltable/ext/functions/__init__.py +0 -11
  195. pixeltable/ext/functions/whisperx.py +0 -77
  196. pixeltable/utils/media_store.py +0 -77
  197. pixeltable/utils/s3.py +0 -17
  198. pixeltable/utils/sample.py +0 -25
  199. pixeltable-0.4.0rc3.dist-info/METADATA +0 -435
  200. pixeltable-0.4.0rc3.dist-info/RECORD +0 -189
  201. pixeltable-0.4.0rc3.dist-info/entry_points.txt +0 -3
  202. {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info/licenses}/LICENSE +0 -0
pixeltable/utils/av.py ADDED
@@ -0,0 +1,298 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from fractions import Fraction
+ from pathlib import Path
+ from types import TracebackType
+ from typing import Any, Iterator
+
+ import av
+ import av.stream
+ import PIL.Image
+ from typing_extensions import Self
+
+ from pixeltable.env import Env
+
+ # format -> (codec, extension)
+ AUDIO_FORMATS: dict[str, tuple[str, str]] = {
+     'wav': ('pcm_s16le', 'wav'),
+     'mp3': ('libmp3lame', 'mp3'),
+     'flac': ('flac', 'flac'),
+     'mp4': ('aac', 'm4a'),
+ }
+
+
+ def get_metadata(path: str) -> dict:
+     with av.open(path) as container:
+         assert isinstance(container, av.container.InputContainer)
+         streams_info = [__get_stream_metadata(stream) for stream in container.streams]
+         result = {
+             'bit_exact': getattr(container, 'bit_exact', False),
+             'bit_rate': container.bit_rate,
+             'size': container.size,
+             'metadata': container.metadata,
+             'streams': streams_info,
+         }
+     return result
+
+
+ def __get_stream_metadata(stream: av.stream.Stream) -> dict:
+     if stream.type not in ('audio', 'video'):
+         return {'type': stream.type}  # Currently unsupported
+
+     codec_context = stream.codec_context
+     codec_context_md: dict[str, Any] = {
+         'name': codec_context.name,
+         'codec_tag': codec_context.codec_tag.encode('unicode-escape').decode('utf-8'),
+         'profile': codec_context.profile,
+     }
+     metadata = {
+         'type': stream.type,
+         'duration': stream.duration,
+         'time_base': float(stream.time_base) if stream.time_base is not None else None,
+         'duration_seconds': float(stream.duration * stream.time_base)
+         if stream.duration is not None and stream.time_base is not None
+         else None,
+         'frames': stream.frames,
+         'metadata': stream.metadata,
+         'codec_context': codec_context_md,
+     }
+
+     if stream.type == 'audio':
+         # Additional metadata for audio
+         channels = getattr(stream.codec_context, 'channels', None)
+         codec_context_md['channels'] = int(channels) if channels is not None else None
+     else:
+         assert stream.type == 'video'
+         assert isinstance(stream, av.video.stream.VideoStream)
+         # Additional metadata for video
+         codec_context_md['pix_fmt'] = getattr(stream.codec_context, 'pix_fmt', None)
+         metadata.update(
+             **{
+                 'width': stream.width,
+                 'height': stream.height,
+                 'frames': stream.frames,
+                 'average_rate': float(stream.average_rate) if stream.average_rate is not None else None,
+                 'base_rate': float(stream.base_rate) if stream.base_rate is not None else None,
+                 'guessed_rate': float(stream.guessed_rate) if stream.guessed_rate is not None else None,
+             }
+         )
+
+     return metadata
+
+
+ def get_video_duration(path: str) -> float | None:
+     """Return video duration in seconds."""
+     with av.open(path) as container:
+         video_stream = container.streams.video[0]
+         if video_stream is None:
+             return None
+         if video_stream.duration is not None:
+             return float(video_stream.duration * video_stream.time_base)
+
+         # if duration is not in the header, look for it in the last packet
+         last_pts: int | None = None
+         for packet in container.demux(video_stream):
+             if packet.pts is not None:
+                 last_pts = packet.pts
+         if last_pts is not None:
+             return float(last_pts * video_stream.time_base)
+
+     return None
+
+
+ def has_audio_stream(path: str) -> bool:
+     """Check if video has audio stream using PyAV."""
+     md = get_metadata(path)
+     return any(stream['type'] == 'audio' for stream in md['streams'])
+
+
+ def ffmpeg_clip_cmd(
+     input_path: str,
+     output_path: str,
+     start_time: float,
+     duration: float | None = None,
+     fast: bool = True,
+     video_encoder: str | None = None,
+     video_encoder_args: dict[str, Any] | None = None,
+ ) -> list[str]:
+     cmd = ['ffmpeg']
+     if fast:
+         # fast: -ss before -i
+         cmd.extend(
+             [
+                 '-ss',
+                 str(start_time),
+                 '-i',
+                 input_path,
+                 '-map',
+                 '0',  # Copy all streams from input
+                 '-c',
+                 'copy',  # Stream copy (no re-encoding)
+             ]
+         )
+     else:
+         if video_encoder is None:
+             video_encoder = Env.get().default_video_encoder
+
+         # accurate: -ss after -i
+         cmd.extend(
+             [
+                 '-i',
+                 input_path,
+                 '-ss',
+                 str(start_time),
+                 '-map',
+                 '0',  # Copy all streams from input
+                 '-c:a',
+                 'copy',  # audio copy
+                 '-c:s',
+                 'copy',  # subtitle copy
+                 '-c:v',
+                 video_encoder,  # re-encode video
+             ]
+         )
+         if video_encoder_args is not None:
+             for k, v in video_encoder_args.items():
+                 cmd.extend([f'-{k}', str(v)])
+
+     if duration is not None:
+         cmd.extend(['-t', str(duration)])
+     cmd.extend(['-loglevel', 'error', output_path])
+     return cmd
+
+
+ def ffmpeg_segment_cmd(
+     input_path: str,
+     output_pattern: str,
+     segment_duration: float | None = None,
+     segment_times: list[float] | None = None,
+     video_encoder: str | None = None,
+     video_encoder_args: dict[str, Any] | None = None,
+ ) -> list[str]:
+     """Commandline for frame-accurate segmentation"""
+     assert (segment_duration is None) != (segment_times is None)
+     if video_encoder is None:
+         video_encoder = Env.get().default_video_encoder
+
+     cmd = [
+         'ffmpeg',
+         '-i',
+         input_path,
+         '-map',
+         '0',  # Copy all streams from input
+         '-c:a',
+         'copy',  # don't re-encode audio
+         '-c:v',
+         video_encoder,  # re-encode video
+     ]
+     if video_encoder_args is not None:
+         for k, v in video_encoder_args.items():
+             cmd.extend([f'-{k}', str(v)])
+     cmd.extend(['-f', 'segment'])
+
+     # -force_key_frames needs to precede -f segment
+     if segment_duration is not None:
+         cmd.extend(
+             [
+                 '-force_key_frames',
+                 f'expr:gte(t,n_forced*{segment_duration})',  # Force keyframe at each segment boundary
+                 '-f',
+                 'segment',
+                 '-segment_time',
+                 str(segment_duration),
+             ]
+         )
+     else:
+         assert segment_times is not None
+         times_str = ','.join([str(t) for t in segment_times])
+         cmd.extend(['-force_key_frames', times_str, '-f', 'segment', '-segment_times', times_str])
+
+     cmd.extend(
+         [
+             '-reset_timestamps',
+             '1',  # Reset timestamps for each segment
+             '-loglevel',
+             'error',  # Only show errors
+             output_pattern,
+         ]
+     )
+     return cmd
+
+
+ class VideoFrames:
+     """
+     Context manager for iterating over video frames at a specified frame rate.
+
+     Args:
+         path: Path to the video file
+         fps: Number of frames to extract per second. If None or 0.0, extracts all frames.
+     """
+
+     path: Path
+     fps: float
+     container: av.container.input.InputContainer | None
+     video_framerate: Fraction | None
+     video_time_base: Fraction | None
+     video_start_time: int | None
+
+     @dataclass
+     class Item:
+         frame_idx: int
+         pts: int
+         dts: int
+         time: float
+         is_corrupt: bool
+         key_frame: bool
+         pict_type: int
+         interlaced_frame: bool
+         frame: PIL.Image.Image
+
+     def __init__(self, path: Path, fps: float | None = None) -> None:
+         self.path = path
+         self.fps = 0.0 if fps is None else fps
+         self.container = None
+         self.video_framerate = None
+         self.video_time_base = None
+         self.video_start_time = None
+
+     def __enter__(self) -> Self:
+         self.container = av.open(self.path)
+         stream = self.container.streams.video[0]
+         self.video_framerate = stream.average_rate
+         self.video_time_base = stream.time_base
+         self.video_start_time = stream.start_time or 0
+         return self
+
+     def __exit__(
+         self, exc_type: type[BaseException] | None, exc_val: BaseException | None, exc_tb: TracebackType | None
+     ) -> None:
+         # Clean up
+         if self.container:
+             self.container.close()
+
+     def __iter__(self) -> Iterator[Item]:
+         num_returned = 0
+         frame_idx = -1
+         while True:
+             try:
+                 frame = next(self.container.decode(video=0))
+             except (StopIteration, EOFError):
+                 return
+
+             frame_idx += 1
+             if self.fps == 0.0 or (num_returned <= frame.time * self.fps):
+                 img = frame.to_image()
+                 assert isinstance(img, PIL.Image.Image)
+                 yield VideoFrames.Item(
+                     frame_idx=frame_idx,
+                     pts=frame.pts,
+                     dts=frame.dts,
+                     time=frame.time,
+                     is_corrupt=frame.is_corrupt,
+                     key_frame=frame.key_frame,
+                     pict_type=frame.pict_type,
+                     interlaced_frame=frame.interlaced_frame,
+                     frame=img,
+                 )
+                 num_returned += 1
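
To make the intended use of these helpers concrete, here is a minimal sketch (not part of the package): it assumes a local `sample.mp4`, an `ffmpeg` binary on the PATH for the clip command, and the `av`/`PIL` dependencies that the module imports.

```python
import subprocess
from pathlib import Path

from pixeltable.utils.av import VideoFrames, ffmpeg_clip_cmd, get_metadata, has_audio_stream

video = 'sample.mp4'  # placeholder input file

# Container/stream metadata and audio-track check.
md = get_metadata(video)
print(md['size'], [s['type'] for s in md['streams']], has_audio_stream(video))

# Decode roughly one frame per second as PIL images.
with VideoFrames(Path(video), fps=1.0) as frames:
    for item in frames:
        print(item.frame_idx, item.time, item.frame.size)

# Build and run a stream-copy command for the first 10 seconds (fast=True avoids re-encoding).
cmd = ffmpeg_clip_cmd(video, 'clip.mp4', start_time=0.0, duration=10.0, fast=True)
subprocess.run(cmd, check=True)
```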
pixeltable/utils/azure_store.py ADDED
@@ -0,0 +1,305 @@
+ import logging
+ import re
+ import threading
+ import uuid
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Iterator
+
+ from azure.core.exceptions import AzureError
+
+ from pixeltable import env, exceptions as excs
+ from pixeltable.config import Config
+ from pixeltable.utils.object_stores import ObjectPath, ObjectStoreBase, StorageObjectAddress
+
+ if TYPE_CHECKING:
+     from azure.storage.blob import BlobProperties, BlobServiceClient
+
+     from pixeltable.catalog import Column
+
+
+ _logger = logging.getLogger('pixeltable')
+
+
+ client_lock = threading.Lock()
+
+
+ @env.register_client('azure_blob')
+ def _() -> dict[str, 'BlobServiceClient']:
+     return {}
+
+
+ class AzureBlobStore(ObjectStoreBase):
+     """Class to handle Azure Blob Storage operations."""
+
+     # TODO: This needs to be redesigned to use asyncio.
+
+     # URI of the Azure Blob Storage container
+     # Always ends with a slash
+     __base_uri: str
+
+     # Storage account name
+     __account_name: str
+
+     # Container name extracted from the URI
+     __container_name: str
+
+     # Prefix path within the container, either empty or ending with a slash
+     __prefix_name: str
+
+     # URI scheme (wasb, wasbs, abfs, abfss, https)
+     __scheme: str
+
+     soa: StorageObjectAddress
+
+     def __init__(self, soa: StorageObjectAddress):
+         self.soa = soa
+         self.__scheme = soa.scheme
+         self.__account_name = soa.account
+         self.__container_name = soa.container
+         self.__prefix_name = soa.prefix
+
+         # Reconstruct base URI in normalized format
+         self.__base_uri = self.soa.prefix_free_uri + self.__prefix_name
+         _logger.info(
+             f'Initialized AzureBlobStore with base URI: {self.__base_uri},',
+             f'account: {self.__account_name}, container: {self.__container_name}, prefix: {self.__prefix_name}',
+         )
+
+     def client(self) -> 'BlobServiceClient':
+         """Return the Azure Blob Storage client."""
+         client_dict: dict[str, 'BlobServiceClient'] = env.Env.get().get_client('azure_blob')
+         with client_lock:
+             uri = self.soa.container_free_uri
+             if uri not in client_dict:
+                 storage_account_name = Config.get().get_string_value('storage_account_name', section='azure')
+                 storage_account_key = Config.get().get_string_value('storage_account_key', section='azure')
+                 if (storage_account_name is None) != (storage_account_key is None):
+                     raise excs.Error(
+                         "Azure 'storage_account_name' and 'storage_account_key' must be specified together."
+                     )
+                 if storage_account_name is None or storage_account_name != self.__account_name:
+                     # Attempt a connection to a public resource, with no account key
+                     client_dict[uri] = self.create_client(endpoint_url=uri)
+                 else:
+                     client_dict[uri] = self.create_client(
+                         endpoint_url=uri, account_name=self.__account_name, account_key=storage_account_key
+                     )
+             return client_dict[uri]
+
+     @property
+     def account_name(self) -> str:
+         """Return the storage account name."""
+         return self.__account_name
+
+     @property
+     def container_name(self) -> str:
+         """Return the container name from the base URI."""
+         return self.__container_name
+
+     @property
+     def prefix(self) -> str:
+         """Return the prefix from the base URI."""
+         return self.__prefix_name
+
+     def validate(self, error_col_name: str) -> str | None:
+         """
+         Checks if the URI exists and is accessible.
+
+         Returns:
+             str: The base URI if the container exists and is accessible, None otherwise.
+         """
+         try:
+             container_client = self.client().get_container_client(self.container_name)
+             # Check if container exists by trying to get its properties
+             container_client.get_container_properties()
+             return self.__base_uri
+         except AzureError as e:
+             self.handle_azure_error(e, self.container_name, f'validate container {error_col_name}')
+             return None
+
+     def copy_object_to_local_file(self, src_path: str, dest_path: Path) -> None:
+         """Copies a blob to a local file. Thread safe."""
+         try:
+             blob_client = self.client().get_blob_client(container=self.container_name, blob=self.prefix + src_path)
+             with open(dest_path, 'wb') as download_file:
+                 download_stream = blob_client.download_blob()
+                 download_file.write(download_stream.readall())
+         except AzureError as e:
+             self.handle_azure_error(e, self.container_name, f'download file {src_path}')
+             raise
+
+     # TODO: utils package should not include back-references to `Column`
+     def copy_local_file(self, col: 'Column', src_path: Path) -> str:
+         """Copy a local file to Azure Blob Storage, and return its new URL"""
+         prefix, filename = ObjectPath.create_prefix_raw(
+             col.get_tbl().id, col.id, col.get_tbl().version, ext=src_path.suffix
+         )
+         blob_name = f'{self.prefix}{prefix}/{filename}'
+         new_file_uri = f'{self.__base_uri}{prefix}/{filename}'
+
+         try:
+             blob_client = self.client().get_blob_client(container=self.container_name, blob=blob_name)
+             with open(src_path, 'rb') as data:
+                 blob_client.upload_blob(data, overwrite=True)
+             _logger.debug(f'Media Storage: copied {src_path} to {new_file_uri}')
+             return new_file_uri
+         except AzureError as e:
+             self.handle_azure_error(e, self.container_name, f'upload file {src_path}')
+             raise
+
+     def _get_filtered_blobs(
+         self, tbl_id: uuid.UUID | None, tbl_version: int | None = None
+     ) -> Iterator['BlobProperties']:
+         """Private method to get filtered blobs for a table, optionally filtered by version.
+
+         Args:
+             tbl_id: Table UUID to filter by
+             tbl_version: Optional table version to filter by
+
+         Returns:
+             Iterator over blob objects matching the criteria
+         """
+         # Use ObjectPath to construct the prefix for this table
+         if tbl_id is None:
+             prefix = self.prefix
+             assert tbl_version is None, 'tbl_version must be None if tbl_id is None'
+         else:
+             table_prefix = ObjectPath.table_prefix(tbl_id)
+             prefix = f'{self.prefix}{table_prefix}/'
+
+         try:
+             container_client = self.client().get_container_client(self.container_name)
+
+             blob_iterator: Iterator['BlobProperties']
+             if tbl_version is None:
+                 # Return all blobs with the table prefix
+                 blob_iterator = container_client.list_blobs(name_starts_with=prefix)
+             else:
+                 # Filter by both table_id and table_version using the ObjectPath pattern
+                 # Pattern: tbl_id_col_id_version_uuid
+                 version_pattern = re.compile(
+                     rf'{re.escape(table_prefix)}_\d+_{re.escape(str(tbl_version))}_[0-9a-fA-F]+.*'
+                 )
+                 # Get all blobs with the prefix and filter by version pattern
+                 all_blobs = container_client.list_blobs(name_starts_with=prefix)
+                 blob_iterator = (blob for blob in all_blobs if version_pattern.match(blob.name.split('/')[-1]))
+
+             return blob_iterator
+
+         except AzureError as e:
+             self.handle_azure_error(e, self.container_name, f'setup iterator {self.prefix}')
+             raise
+
+     def count(self, tbl_id: uuid.UUID | None, tbl_version: int | None = None) -> int:
+         """Count the number of files belonging to tbl_id. If tbl_version is not None,
+         count only those files belonging to the specified tbl_version.
+
+         Args:
+             tbl_id: Table UUID to count blobs for
+             tbl_version: Optional table version to filter by
+
+         Returns:
+             Number of blobs matching the criteria
+         """
+         blob_iterator = self._get_filtered_blobs(tbl_id, tbl_version)
+         return sum(1 for _ in blob_iterator)
+
+     def delete(self, tbl_id: uuid.UUID, tbl_version: int | None = None) -> int:
+         """Delete all files belonging to tbl_id. If tbl_version is not None, delete
+         only those files belonging to the specified tbl_version.
+
+         Args:
+             tbl_id: Table UUID to delete blobs for
+             tbl_version: Optional table version to filter by
+
+         Returns:
+             Number of blobs deleted
+         """
+         assert tbl_id is not None
+         blob_iterator = self._get_filtered_blobs(tbl_id, tbl_version)
+         total_deleted = 0
+
+         try:
+             container_client = self.client().get_container_client(self.container_name)
+
+             for blob in blob_iterator:
+                 # TODO: Figure out how to properly use batch method delete_blobs(); it doesn't seem to work properly
+                 container_client.delete_blob(blob.name)
+                 total_deleted += 1
+
+             # print(f"Deleted {total_deleted} blobs from container '{self.container_name}'.")
+             return total_deleted
+
+         except AzureError as e:
+             self.handle_azure_error(e, self.container_name, f'deleting with {self.prefix}')
+             raise
+
+     def list_objects(self, return_uri: bool, n_max: int = 10) -> list[str]:
+         """Return a list of objects found in the specified destination bucket.
+         Each returned object includes the full set of prefixes.
+         If return_uri is True, full URIs are returned; otherwise, just the object keys.
+         """
+         p = self.soa.prefix_free_uri if return_uri else ''
+         r: list[str] = []
+         try:
+             blob_iterator = self._get_filtered_blobs(tbl_id=None, tbl_version=None)
+             for blob in blob_iterator:
+                 r.append(f'{p}{blob.name}')
+                 if len(r) >= n_max:
+                     return r
+
+         except AzureError as e:
+             self.handle_azure_error(e, self.__container_name, f'list objects from {self.__base_uri}')
+         return r
+
+     @classmethod
+     def handle_azure_error(
+         cls, e: 'AzureError', container_name: str, operation: str = '', *, ignore_404: bool = False
+     ) -> None:
+         from azure.core.exceptions import ClientAuthenticationError, HttpResponseError, ResourceNotFoundError
+
+         if ignore_404 and isinstance(e, ResourceNotFoundError):
+             return
+
+         if isinstance(e, ResourceNotFoundError):
+             raise excs.Error(f'Container {container_name} or blob not found during {operation}: {str(e)!r}')
+         elif isinstance(e, ClientAuthenticationError):
+             raise excs.Error(f'Authentication failed for container {container_name} during {operation}: {str(e)!r}')
+         elif isinstance(e, HttpResponseError):
+             if e.status_code == 403:
+                 raise excs.Error(f'Access denied to container {container_name} during {operation}: {str(e)!r}')
+             elif e.status_code == 412:
+                 raise excs.Error(f'Precondition failed for container {container_name} during {operation}: {str(e)!r}')
+             else:
+                 raise excs.Error(
+                     f'HTTP error during {operation} in container {container_name}: {e.status_code} - {str(e)!r}'
+                 )
+         else:
+             raise excs.Error(f'Error during {operation} in container {container_name}: {str(e)!r}')
+
+     @classmethod
+     def create_client(
+         cls, endpoint_url: str, account_name: str | None = None, account_key: str | None = None
+     ) -> 'BlobServiceClient':
+         from azure.core.credentials import AzureNamedKeyCredential
+         from azure.storage.blob import BlobServiceClient  # TODO: Use azure.storage.blob.aio instead
+
+         assert (account_name is None) == (account_key is None)
+         try:
+             # e.g. endpoint_url: str = f'https://{account_name}.blob.core.windows.net'
+             assert endpoint_url is not None, 'No Azure Storage account information provided'
+
+             # Use empty SAS token for anonymous authentication
+             credential = None
+             if account_name is not None:
+                 credential = AzureNamedKeyCredential(name=account_name, key=account_key)
+             return BlobServiceClient(
+                 account_url=endpoint_url,
+                 credential=credential,
+                 max_single_get_size=(32 * 2**20),
+                 max_chunk_get_size=(4 * 2**20),
+                 connection_timeout=15,
+                 read_timeout=30,
+             )
+         except Exception as e:
+             raise excs.Error(f'Failed to create Azure Blob Storage client: {str(e)!r}') from e
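
For orientation, a small sketch of how the new client factory might be exercised directly. Assumptions: the `azure-storage-blob` package is installed, and the account, key, container, and prefix below are placeholders; inside Pixeltable the account name and key are instead read from the `azure` section of the config, as the `client()` method above shows.

```python
from pixeltable.utils.azure_store import AzureBlobStore

# Placeholder credentials; real values would come from the [azure] config section.
client = AzureBlobStore.create_client(
    endpoint_url='https://mystorageaccount.blob.core.windows.net',
    account_name='mystorageaccount',
    account_key='base64-account-key',
)

# Standard azure-storage-blob usage from here on.
container = client.get_container_client('media')  # placeholder container name
for blob in container.list_blobs(name_starts_with='pxt/'):  # placeholder prefix
    print(blob.name)
```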
pixeltable/utils/code.py CHANGED
@@ -1,12 +1,11 @@
  import types
- from typing import Optional
 
  from pixeltable.func import Function
 
  # Utilities related to the organization of the Pixeltable codebase.
 
 
- def local_public_names(mod_name: str, exclude: Optional[list[str]] = None) -> list[str]:
+ def local_public_names(mod_name: str, exclude: list[str] | None = None) -> list[str]:
      """
      Returns a list of all functions and submodules that are local to the specified module and are
      publicly accessible. Intended to facilitate implementation of module __dir__() methods for
@@ -21,7 +20,8 @@ def local_public_names(mod_name: str, exclude: Optional[list[str]] = None) -> li
      for obj in mod.__dict__.values():
          if isinstance(obj, Function):
              # Pixeltable function
-             names.append(obj.name)
+             if not obj.name.startswith('_'):
+                 names.append(obj.name)
          elif isinstance(obj, types.FunctionType):
              # Python function
              if obj.__module__ == mod.__name__ and not obj.__name__.startswith('_'):
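
The practical effect of the new underscore filter is easiest to see in the pattern this helper exists for; a hypothetical module using it might look like the sketch below (the module contents are stand-ins, not Pixeltable code).

```python
# Hypothetical functions module: expose only public names via __dir__().
from pixeltable.utils.code import local_public_names

__all__ = local_public_names(__name__)  # Function objects named '_foo' are now skipped


def __dir__() -> list[str]:
    return __all__
```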
pixeltable/utils/console_output.py CHANGED
@@ -1,6 +1,8 @@
  import logging
  from typing import TextIO
 
+ from pixeltable import exceptions as excs
+
 
  def map_level(verbosity: int) -> int:
      """
@@ -19,7 +21,8 @@ def map_level(verbosity: int) -> int:
          return logging.INFO
      if verbosity == 2:
          return logging.DEBUG
-     return logging.INFO
+
+     raise excs.Error(f'Invalid verbosity level: {verbosity}')
 
 
  class ConsoleOutputHandler(logging.StreamHandler):
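
A quick sketch of the behavioral change (assuming `map_level` is otherwise unchanged): an out-of-range verbosity now raises `excs.Error` instead of silently falling back to `logging.INFO`.

```python
import logging

from pixeltable import exceptions as excs
from pixeltable.utils.console_output import map_level

assert map_level(2) == logging.DEBUG
try:
    map_level(7)  # arbitrary invalid value
except excs.Error as e:
    print(e)  # 'Invalid verbosity level: 7'
```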
pixeltable/utils/coroutine.py CHANGED
@@ -1,10 +1,10 @@
  import asyncio
  import threading
- from concurrent.futures import ThreadPoolExecutor
  from typing import Any, Coroutine, TypeVar
 
- T = TypeVar('T')
+ from pixeltable.env import Env
 
+ T = TypeVar('T')
 
  # TODO This is a temporary hack to be able to run async UDFs in contexts that are not properly handled by the existing
  # scheduler logic (e.g., as an embedding function as part of a similarity lookup). Once the scheduler is fully
@@ -15,27 +15,10 @@ def run_coroutine_synchronously(coroutine: Coroutine[Any, Any, T], timeout: floa
      """
      Runs the given coroutine synchronously, even if called in the context of a running event loop.
      """
-
-     def run_in_new_loop() -> T:
-         new_loop = asyncio.new_event_loop()
-         asyncio.set_event_loop(new_loop)
-         try:
-             return new_loop.run_until_complete(coroutine)
-         finally:
-             new_loop.close()
-
-     try:
-         loop = asyncio.get_running_loop()
-     except RuntimeError:
-         # No event loop; just call `asyncio.run()`
-         return asyncio.run(coroutine)
+     loop = Env.get().event_loop
 
      if threading.current_thread() is threading.main_thread():
-         if not loop.is_running():
-             return loop.run_until_complete(coroutine)
-         else:
-             with ThreadPoolExecutor() as pool:
-                 future = pool.submit(run_in_new_loop)
-                 return future.result(timeout=timeout)
+         return loop.run_until_complete(coroutine)
      else:
-         return asyncio.run_coroutine_threadsafe(coroutine, loop).result()
+         # Not in main thread, use run_coroutine_threadsafe
+         return asyncio.run_coroutine_threadsafe(coroutine, loop).result(timeout)
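
To illustrate the new call path, a sketch under the assumption that the Pixeltable environment, and therefore `Env.get().event_loop`, has already been initialized; `fetch_answer` is a stand-in for a real async UDF, not part of the package.

```python
from pixeltable.utils.coroutine import run_coroutine_synchronously


async def fetch_answer(prompt: str) -> str:
    # Stand-in for an async call, e.g. an embedding or chat-completion request.
    return prompt.upper()


# Runs on the Env-managed event loop; from a non-main thread the result is awaited
# via run_coroutine_threadsafe with the given timeout.
print(run_coroutine_synchronously(fetch_answer('hello'), timeout=30))
```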