pixeltable 0.4.17__py3-none-any.whl → 0.4.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (153) hide show
  1. pixeltable/__init__.py +1 -1
  2. pixeltable/_version.py +1 -0
  3. pixeltable/catalog/catalog.py +144 -118
  4. pixeltable/catalog/column.py +104 -115
  5. pixeltable/catalog/globals.py +1 -2
  6. pixeltable/catalog/insertable_table.py +44 -49
  7. pixeltable/catalog/path.py +3 -4
  8. pixeltable/catalog/schema_object.py +4 -4
  9. pixeltable/catalog/table.py +139 -124
  10. pixeltable/catalog/table_metadata.py +6 -6
  11. pixeltable/catalog/table_version.py +315 -246
  12. pixeltable/catalog/table_version_handle.py +4 -4
  13. pixeltable/catalog/table_version_path.py +9 -10
  14. pixeltable/catalog/tbl_ops.py +9 -3
  15. pixeltable/catalog/view.py +34 -28
  16. pixeltable/config.py +14 -10
  17. pixeltable/dataframe.py +69 -78
  18. pixeltable/env.py +78 -64
  19. pixeltable/exec/aggregation_node.py +6 -6
  20. pixeltable/exec/cache_prefetch_node.py +10 -10
  21. pixeltable/exec/data_row_batch.py +3 -3
  22. pixeltable/exec/exec_context.py +16 -4
  23. pixeltable/exec/exec_node.py +5 -5
  24. pixeltable/exec/expr_eval/evaluators.py +6 -6
  25. pixeltable/exec/expr_eval/expr_eval_node.py +8 -7
  26. pixeltable/exec/expr_eval/globals.py +6 -6
  27. pixeltable/exec/expr_eval/row_buffer.py +1 -2
  28. pixeltable/exec/expr_eval/schedulers.py +11 -11
  29. pixeltable/exec/in_memory_data_node.py +2 -2
  30. pixeltable/exec/object_store_save_node.py +14 -17
  31. pixeltable/exec/sql_node.py +28 -27
  32. pixeltable/exprs/arithmetic_expr.py +4 -4
  33. pixeltable/exprs/array_slice.py +2 -2
  34. pixeltable/exprs/column_property_ref.py +3 -3
  35. pixeltable/exprs/column_ref.py +61 -74
  36. pixeltable/exprs/comparison.py +5 -5
  37. pixeltable/exprs/compound_predicate.py +3 -3
  38. pixeltable/exprs/data_row.py +12 -12
  39. pixeltable/exprs/expr.py +41 -31
  40. pixeltable/exprs/expr_dict.py +3 -3
  41. pixeltable/exprs/expr_set.py +3 -3
  42. pixeltable/exprs/function_call.py +14 -14
  43. pixeltable/exprs/in_predicate.py +4 -4
  44. pixeltable/exprs/inline_expr.py +8 -8
  45. pixeltable/exprs/is_null.py +1 -3
  46. pixeltable/exprs/json_mapper.py +8 -8
  47. pixeltable/exprs/json_path.py +6 -6
  48. pixeltable/exprs/literal.py +5 -5
  49. pixeltable/exprs/method_ref.py +2 -2
  50. pixeltable/exprs/object_ref.py +2 -2
  51. pixeltable/exprs/row_builder.py +14 -14
  52. pixeltable/exprs/rowid_ref.py +8 -8
  53. pixeltable/exprs/similarity_expr.py +50 -25
  54. pixeltable/exprs/sql_element_cache.py +4 -4
  55. pixeltable/exprs/string_op.py +2 -2
  56. pixeltable/exprs/type_cast.py +3 -5
  57. pixeltable/func/aggregate_function.py +8 -8
  58. pixeltable/func/callable_function.py +9 -9
  59. pixeltable/func/expr_template_function.py +3 -3
  60. pixeltable/func/function.py +15 -17
  61. pixeltable/func/function_registry.py +6 -7
  62. pixeltable/func/globals.py +2 -3
  63. pixeltable/func/mcp.py +2 -2
  64. pixeltable/func/query_template_function.py +16 -16
  65. pixeltable/func/signature.py +14 -14
  66. pixeltable/func/tools.py +11 -11
  67. pixeltable/func/udf.py +16 -18
  68. pixeltable/functions/__init__.py +1 -0
  69. pixeltable/functions/anthropic.py +7 -7
  70. pixeltable/functions/audio.py +76 -0
  71. pixeltable/functions/bedrock.py +6 -6
  72. pixeltable/functions/deepseek.py +4 -4
  73. pixeltable/functions/fireworks.py +2 -2
  74. pixeltable/functions/gemini.py +6 -6
  75. pixeltable/functions/globals.py +12 -12
  76. pixeltable/functions/groq.py +4 -4
  77. pixeltable/functions/huggingface.py +1033 -6
  78. pixeltable/functions/image.py +7 -10
  79. pixeltable/functions/llama_cpp.py +7 -7
  80. pixeltable/functions/math.py +2 -3
  81. pixeltable/functions/mistralai.py +3 -3
  82. pixeltable/functions/ollama.py +9 -9
  83. pixeltable/functions/openai.py +21 -21
  84. pixeltable/functions/openrouter.py +7 -7
  85. pixeltable/functions/string.py +21 -28
  86. pixeltable/functions/timestamp.py +7 -8
  87. pixeltable/functions/together.py +4 -6
  88. pixeltable/functions/twelvelabs.py +92 -0
  89. pixeltable/functions/video.py +36 -31
  90. pixeltable/functions/vision.py +6 -6
  91. pixeltable/functions/whisper.py +7 -7
  92. pixeltable/functions/whisperx.py +16 -16
  93. pixeltable/globals.py +75 -40
  94. pixeltable/index/base.py +12 -8
  95. pixeltable/index/btree.py +19 -22
  96. pixeltable/index/embedding_index.py +30 -39
  97. pixeltable/io/datarows.py +3 -3
  98. pixeltable/io/external_store.py +13 -16
  99. pixeltable/io/fiftyone.py +5 -5
  100. pixeltable/io/globals.py +5 -5
  101. pixeltable/io/hf_datasets.py +4 -4
  102. pixeltable/io/label_studio.py +12 -12
  103. pixeltable/io/pandas.py +6 -6
  104. pixeltable/io/parquet.py +2 -2
  105. pixeltable/io/table_data_conduit.py +12 -12
  106. pixeltable/io/utils.py +2 -2
  107. pixeltable/iterators/audio.py +2 -2
  108. pixeltable/iterators/document.py +88 -57
  109. pixeltable/iterators/video.py +66 -37
  110. pixeltable/metadata/converters/convert_18.py +2 -2
  111. pixeltable/metadata/converters/convert_19.py +2 -2
  112. pixeltable/metadata/converters/convert_20.py +2 -2
  113. pixeltable/metadata/converters/convert_21.py +2 -2
  114. pixeltable/metadata/converters/convert_22.py +2 -2
  115. pixeltable/metadata/converters/convert_24.py +2 -2
  116. pixeltable/metadata/converters/convert_25.py +2 -2
  117. pixeltable/metadata/converters/convert_26.py +2 -2
  118. pixeltable/metadata/converters/convert_29.py +4 -4
  119. pixeltable/metadata/converters/convert_34.py +2 -2
  120. pixeltable/metadata/converters/convert_36.py +2 -2
  121. pixeltable/metadata/converters/convert_38.py +2 -2
  122. pixeltable/metadata/converters/convert_39.py +1 -2
  123. pixeltable/metadata/converters/util.py +11 -13
  124. pixeltable/metadata/schema.py +22 -21
  125. pixeltable/metadata/utils.py +2 -6
  126. pixeltable/mypy/mypy_plugin.py +5 -5
  127. pixeltable/plan.py +32 -34
  128. pixeltable/share/packager.py +7 -7
  129. pixeltable/share/publish.py +3 -3
  130. pixeltable/store.py +126 -41
  131. pixeltable/type_system.py +43 -46
  132. pixeltable/utils/__init__.py +1 -2
  133. pixeltable/utils/arrow.py +4 -4
  134. pixeltable/utils/av.py +74 -38
  135. pixeltable/utils/azure_store.py +305 -0
  136. pixeltable/utils/code.py +1 -2
  137. pixeltable/utils/dbms.py +15 -19
  138. pixeltable/utils/description_helper.py +2 -3
  139. pixeltable/utils/documents.py +5 -6
  140. pixeltable/utils/exception_handler.py +2 -2
  141. pixeltable/utils/filecache.py +5 -5
  142. pixeltable/utils/formatter.py +4 -6
  143. pixeltable/utils/gcs_store.py +9 -9
  144. pixeltable/utils/local_store.py +17 -17
  145. pixeltable/utils/object_stores.py +59 -43
  146. pixeltable/utils/s3_store.py +35 -30
  147. {pixeltable-0.4.17.dist-info → pixeltable-0.4.19.dist-info}/METADATA +4 -4
  148. pixeltable-0.4.19.dist-info/RECORD +213 -0
  149. pixeltable/__version__.py +0 -3
  150. pixeltable-0.4.17.dist-info/RECORD +0 -211
  151. {pixeltable-0.4.17.dist-info → pixeltable-0.4.19.dist-info}/WHEEL +0 -0
  152. {pixeltable-0.4.17.dist-info → pixeltable-0.4.19.dist-info}/entry_points.txt +0 -0
  153. {pixeltable-0.4.17.dist-info → pixeltable-0.4.19.dist-info}/licenses/LICENSE +0 -0
pixeltable/utils/arrow.py CHANGED
@@ -1,7 +1,7 @@
1
1
  import datetime
2
2
  import io
3
3
  import json
4
- from typing import TYPE_CHECKING, Any, Iterator, Optional, cast
4
+ from typing import TYPE_CHECKING, Any, Iterator, cast
5
5
 
6
6
  import numpy as np
7
7
  import PIL.Image
@@ -48,7 +48,7 @@ PXT_TO_PA_TYPES: dict[type[ts.ColumnType], pa.DataType] = {
48
48
  }
49
49
 
50
50
 
51
- def to_pixeltable_type(arrow_type: pa.DataType, nullable: bool) -> Optional[ts.ColumnType]:
51
+ def to_pixeltable_type(arrow_type: pa.DataType, nullable: bool) -> ts.ColumnType | None:
52
52
  """Convert a pyarrow DataType to a pixeltable ColumnType if one is defined.
53
53
  Returns None if no conversion is currently implemented.
54
54
  """
@@ -66,7 +66,7 @@ def to_pixeltable_type(arrow_type: pa.DataType, nullable: bool) -> Optional[ts.C
66
66
  return None
67
67
 
68
68
 
69
- def to_arrow_type(pixeltable_type: ts.ColumnType) -> Optional[pa.DataType]:
69
+ def to_arrow_type(pixeltable_type: ts.ColumnType) -> pa.DataType | None:
70
70
  """Convert a pixeltable DataType to a pyarrow datatype if one is defined.
71
71
  Returns None if no conversion is currently implemented.
72
72
  """
@@ -240,7 +240,7 @@ def _ar_val_to_pxt_val(val: Any, pxt_type: ts.ColumnType) -> Any:
240
240
 
241
241
 
242
242
  def iter_tuples2(
243
- batch: pa.Table | pa.RecordBatch, col_mapping: Optional[dict[str, str]], schema: dict[str, ts.ColumnType]
243
+ batch: pa.Table | pa.RecordBatch, col_mapping: dict[str, str] | None, schema: dict[str, ts.ColumnType]
244
244
  ) -> Iterator[dict[str, Any]]:
245
245
  """Convert a RecordBatch to an iterator of dictionaries. also works with pa.Table and pa.RowGroup"""
246
246
  pydict = to_pydict(batch)
pixeltable/utils/av.py CHANGED
@@ -5,6 +5,14 @@ import av.stream
5
5
 
6
6
  from pixeltable.env import Env
7
7
 
8
+ # format -> (codec, extension)
9
+ AUDIO_FORMATS: dict[str, tuple[str, str]] = {
10
+ 'wav': ('pcm_s16le', 'wav'),
11
+ 'mp3': ('libmp3lame', 'mp3'),
12
+ 'flac': ('flac', 'flac'),
13
+ 'mp4': ('aac', 'm4a'),
14
+ }
15
+
8
16
 
9
17
  def get_metadata(path: str) -> dict:
10
18
  with av.open(path) as container:
@@ -91,25 +99,58 @@ def has_audio_stream(path: str) -> bool:
91
99
  return any(stream['type'] == 'audio' for stream in md['streams'])
92
100
 
93
101
 
94
- def ffmpeg_clip_cmd(input_path: str, output_path: str, start_time: float, duration: float | None = None) -> list[str]:
95
- # the order of arguments is critical: -ss <start> -t <duration> -i <input>
96
- cmd = ['ffmpeg', '-ss', str(start_time)]
102
+ def ffmpeg_clip_cmd(
103
+ input_path: str,
104
+ output_path: str,
105
+ start_time: float,
106
+ duration: float | None = None,
107
+ fast: bool = True,
108
+ video_encoder: str | None = None,
109
+ video_encoder_args: dict[str, Any] | None = None,
110
+ ) -> list[str]:
111
+ cmd = ['ffmpeg']
112
+ if fast:
113
+ # fast: -ss before -i
114
+ cmd.extend(
115
+ [
116
+ '-ss',
117
+ str(start_time),
118
+ '-i',
119
+ input_path,
120
+ '-map',
121
+ '0', # Copy all streams from input
122
+ '-c',
123
+ 'copy', # Stream copy (no re-encoding)
124
+ ]
125
+ )
126
+ else:
127
+ if video_encoder is None:
128
+ video_encoder = Env.get().default_video_encoder
129
+
130
+ # accurate: -ss after -i
131
+ cmd.extend(
132
+ [
133
+ '-i',
134
+ input_path,
135
+ '-ss',
136
+ str(start_time),
137
+ '-map',
138
+ '0', # Copy all streams from input
139
+ '-c:a',
140
+ 'copy', # audio copy
141
+ '-c:s',
142
+ 'copy', # subtitle copy
143
+ '-c:v',
144
+ video_encoder, # re-encode video
145
+ ]
146
+ )
147
+ if video_encoder_args is not None:
148
+ for k, v in video_encoder_args.items():
149
+ cmd.extend([f'-{k}', str(v)])
150
+
97
151
  if duration is not None:
98
152
  cmd.extend(['-t', str(duration)])
99
- cmd.extend(
100
- [
101
- '-i', # Input file
102
- input_path,
103
- '-y', # Overwrite output file
104
- '-loglevel',
105
- 'error', # Only show errors
106
- '-c',
107
- 'copy', # Stream copy (no re-encoding)
108
- '-map',
109
- '0', # Copy all streams from input
110
- output_path,
111
- ]
112
- )
153
+ cmd.extend(['-loglevel', 'error', output_path])
113
154
  return cmd
114
155
 
115
156
 
@@ -130,44 +171,39 @@ def ffmpeg_segment_cmd(
130
171
  'ffmpeg',
131
172
  '-i',
132
173
  input_path,
133
- '-f',
134
- 'segment', # Use segment muxer
174
+ '-map',
175
+ '0', # Copy all streams from input
176
+ '-c:a',
177
+ 'copy', # don't re-encode audio
178
+ '-c:v',
179
+ video_encoder, # re-encode video
135
180
  ]
181
+ if video_encoder_args is not None:
182
+ for k, v in video_encoder_args.items():
183
+ cmd.extend([f'-{k}', str(v)])
184
+ cmd.extend(['-f', 'segment'])
136
185
 
186
+ # -force_key_frames needs to precede -f segment
137
187
  if segment_duration is not None:
138
188
  cmd.extend(
139
189
  [
140
- '-segment_time',
141
- str(segment_duration), # Target segment duration
142
- '-break_non_keyframes',
143
- '1', # need to break at non-keyframes to get frame-accurate segments
144
190
  '-force_key_frames',
145
191
  f'expr:gte(t,n_forced*{segment_duration})', # Force keyframe at each segment boundary
192
+ '-f',
193
+ 'segment',
194
+ '-segment_time',
195
+ str(segment_duration),
146
196
  ]
147
197
  )
148
198
  else:
149
199
  assert segment_times is not None
150
200
  times_str = ','.join([str(t) for t in segment_times])
151
- cmd.extend(['-segment_times', times_str, '-force_key_frames', times_str])
201
+ cmd.extend(['-force_key_frames', times_str, '-f', 'segment', '-segment_times', times_str])
152
202
 
153
203
  cmd.extend(
154
204
  [
155
205
  '-reset_timestamps',
156
206
  '1', # Reset timestamps for each segment
157
- '-map',
158
- '0', # Copy all streams from input
159
- '-c:a',
160
- 'copy', # don't re-encode audio
161
- '-c:v',
162
- video_encoder, # re-encode video
163
- ]
164
- )
165
- if video_encoder_args is not None:
166
- for k, v in video_encoder_args.items():
167
- cmd.extend([f'-{k}', str(v)])
168
-
169
- cmd.extend(
170
- [
171
207
  '-loglevel',
172
208
  'error', # Only show errors
173
209
  output_pattern,
@@ -0,0 +1,305 @@
1
+ import logging
2
+ import re
3
+ import threading
4
+ import uuid
5
+ from pathlib import Path
6
+ from typing import TYPE_CHECKING, Iterator
7
+
8
+ from azure.core.exceptions import AzureError
9
+
10
+ from pixeltable import env, exceptions as excs
11
+ from pixeltable.config import Config
12
+ from pixeltable.utils.object_stores import ObjectPath, ObjectStoreBase, StorageObjectAddress
13
+
14
+ if TYPE_CHECKING:
15
+ from azure.storage.blob import BlobProperties, BlobServiceClient
16
+
17
+ from pixeltable.catalog import Column
18
+
19
+
20
+ _logger = logging.getLogger('pixeltable')
21
+
22
+
23
+ client_lock = threading.Lock()
24
+
25
+
26
+ @env.register_client('azure_blob')
27
+ def _() -> dict[str, 'BlobServiceClient']:
28
+ return {}
29
+
30
+
31
+ class AzureBlobStore(ObjectStoreBase):
32
+ """Class to handle Azure Blob Storage operations."""
33
+
34
+ # TODO: This needs to be redesigned to use asyncio.
35
+
36
+ # URI of the Azure Blob Storage container
37
+ # Always ends with a slash
38
+ __base_uri: str
39
+
40
+ # Storage account name
41
+ __account_name: str
42
+
43
+ # Container name extracted from the URI
44
+ __container_name: str
45
+
46
+ # Prefix path within the container, either empty or ending with a slash
47
+ __prefix_name: str
48
+
49
+ # URI scheme (wasb, wasbs, abfs, abfss, https)
50
+ __scheme: str
51
+
52
+ soa: StorageObjectAddress
53
+
54
+ def __init__(self, soa: StorageObjectAddress):
55
+ self.soa = soa
56
+ self.__scheme = soa.scheme
57
+ self.__account_name = soa.account
58
+ self.__container_name = soa.container
59
+ self.__prefix_name = soa.prefix
60
+
61
+ # Reconstruct base URI in normalized format
62
+ self.__base_uri = self.soa.prefix_free_uri + self.__prefix_name
63
+ _logger.info(
64
+ f'Initialized AzureBlobStore with base URI: {self.__base_uri},',
65
+ f'account: {self.__account_name}, container: {self.__container_name}, prefix: {self.__prefix_name}',
66
+ )
67
+
68
+ def client(self) -> 'BlobServiceClient':
69
+ """Return the Azure Blob Storage client."""
70
+ client_dict: dict[str, 'BlobServiceClient'] = env.Env.get().get_client('azure_blob')
71
+ with client_lock:
72
+ uri = self.soa.container_free_uri
73
+ if uri not in client_dict:
74
+ storage_account_name = Config.get().get_string_value('storage_account_name', section='azure')
75
+ storage_account_key = Config.get().get_string_value('storage_account_key', section='azure')
76
+ if (storage_account_name is None) != (storage_account_key is None):
77
+ raise excs.Error(
78
+ "Azure 'storage_account_name' and 'storage_account_key' must be specified together."
79
+ )
80
+ if storage_account_name is None or storage_account_name != self.__account_name:
81
+ # Attempt a connection to a public resource, with no account key
82
+ client_dict[uri] = self.create_client(endpoint_url=uri)
83
+ else:
84
+ client_dict[uri] = self.create_client(
85
+ endpoint_url=uri, account_name=self.__account_name, account_key=storage_account_key
86
+ )
87
+ return client_dict[uri]
88
+
89
+ @property
90
+ def account_name(self) -> str:
91
+ """Return the storage account name."""
92
+ return self.__account_name
93
+
94
+ @property
95
+ def container_name(self) -> str:
96
+ """Return the container name from the base URI."""
97
+ return self.__container_name
98
+
99
+ @property
100
+ def prefix(self) -> str:
101
+ """Return the prefix from the base URI."""
102
+ return self.__prefix_name
103
+
104
+ def validate(self, error_col_name: str) -> str | None:
105
+ """
106
+ Checks if the URI exists and is accessible.
107
+
108
+ Returns:
109
+ str: The base URI if the container exists and is accessible, None otherwise.
110
+ """
111
+ try:
112
+ container_client = self.client().get_container_client(self.container_name)
113
+ # Check if container exists by trying to get its properties
114
+ container_client.get_container_properties()
115
+ return self.__base_uri
116
+ except AzureError as e:
117
+ self.handle_azure_error(e, self.container_name, f'validate container {error_col_name}')
118
+ return None
119
+
120
+ def copy_object_to_local_file(self, src_path: str, dest_path: Path) -> None:
121
+ """Copies a blob to a local file. Thread safe."""
122
+ try:
123
+ blob_client = self.client().get_blob_client(container=self.container_name, blob=self.prefix + src_path)
124
+ with open(dest_path, 'wb') as download_file:
125
+ download_stream = blob_client.download_blob()
126
+ download_file.write(download_stream.readall())
127
+ except AzureError as e:
128
+ self.handle_azure_error(e, self.container_name, f'download file {src_path}')
129
+ raise
130
+
131
+ # TODO: utils package should not include back-references to `Column`
132
+ def copy_local_file(self, col: 'Column', src_path: Path) -> str:
133
+ """Copy a local file to Azure Blob Storage, and return its new URL"""
134
+ prefix, filename = ObjectPath.create_prefix_raw(
135
+ col.get_tbl().id, col.id, col.get_tbl().version, ext=src_path.suffix
136
+ )
137
+ blob_name = f'{self.prefix}{prefix}/{filename}'
138
+ new_file_uri = f'{self.__base_uri}{prefix}/{filename}'
139
+
140
+ try:
141
+ blob_client = self.client().get_blob_client(container=self.container_name, blob=blob_name)
142
+ with open(src_path, 'rb') as data:
143
+ blob_client.upload_blob(data, overwrite=True)
144
+ _logger.debug(f'Media Storage: copied {src_path} to {new_file_uri}')
145
+ return new_file_uri
146
+ except AzureError as e:
147
+ self.handle_azure_error(e, self.container_name, f'upload file {src_path}')
148
+ raise
149
+
150
+ def _get_filtered_blobs(
151
+ self, tbl_id: uuid.UUID | None, tbl_version: int | None = None
152
+ ) -> Iterator['BlobProperties']:
153
+ """Private method to get filtered blobs for a table, optionally filtered by version.
154
+
155
+ Args:
156
+ tbl_id: Table UUID to filter by
157
+ tbl_version: Optional table version to filter by
158
+
159
+ Returns:
160
+ Iterator over blob objects matching the criteria
161
+ """
162
+ # Use ObjectPath to construct the prefix for this table
163
+ if tbl_id is None:
164
+ prefix = self.prefix
165
+ assert tbl_version is None, 'tbl_version must be None if tbl_id is None'
166
+ else:
167
+ table_prefix = ObjectPath.table_prefix(tbl_id)
168
+ prefix = f'{self.prefix}{table_prefix}/'
169
+
170
+ try:
171
+ container_client = self.client().get_container_client(self.container_name)
172
+
173
+ blob_iterator: Iterator['BlobProperties']
174
+ if tbl_version is None:
175
+ # Return all blobs with the table prefix
176
+ blob_iterator = container_client.list_blobs(name_starts_with=prefix)
177
+ else:
178
+ # Filter by both table_id and table_version using the ObjectPath pattern
179
+ # Pattern: tbl_id_col_id_version_uuid
180
+ version_pattern = re.compile(
181
+ rf'{re.escape(table_prefix)}_\d+_{re.escape(str(tbl_version))}_[0-9a-fA-F]+.*'
182
+ )
183
+ # Get all blobs with the prefix and filter by version pattern
184
+ all_blobs = container_client.list_blobs(name_starts_with=prefix)
185
+ blob_iterator = (blob for blob in all_blobs if version_pattern.match(blob.name.split('/')[-1]))
186
+
187
+ return blob_iterator
188
+
189
+ except AzureError as e:
190
+ self.handle_azure_error(e, self.container_name, f'setup iterator {self.prefix}')
191
+ raise
192
+
193
+ def count(self, tbl_id: uuid.UUID | None, tbl_version: int | None = None) -> int:
194
+ """Count the number of files belonging to tbl_id. If tbl_version is not None,
195
+ count only those files belonging to the specified tbl_version.
196
+
197
+ Args:
198
+ tbl_id: Table UUID to count blobs for
199
+ tbl_version: Optional table version to filter by
200
+
201
+ Returns:
202
+ Number of blobs matching the criteria
203
+ """
204
+ blob_iterator = self._get_filtered_blobs(tbl_id, tbl_version)
205
+ return sum(1 for _ in blob_iterator)
206
+
207
+ def delete(self, tbl_id: uuid.UUID, tbl_version: int | None = None) -> int:
208
+ """Delete all files belonging to tbl_id. If tbl_version is not None, delete
209
+ only those files belonging to the specified tbl_version.
210
+
211
+ Args:
212
+ tbl_id: Table UUID to delete blobs for
213
+ tbl_version: Optional table version to filter by
214
+
215
+ Returns:
216
+ Number of blobs deleted
217
+ """
218
+ assert tbl_id is not None
219
+ blob_iterator = self._get_filtered_blobs(tbl_id, tbl_version)
220
+ total_deleted = 0
221
+
222
+ try:
223
+ container_client = self.client().get_container_client(self.container_name)
224
+
225
+ for blob in blob_iterator:
226
+ # TODO: Figure out how to properly use batch method delete_blobs(), it doesn't seem to work properly
227
+ container_client.delete_blob(blob.name)
228
+ total_deleted += 1
229
+
230
+ # print(f"Deleted {total_deleted} blobs from container '{self.container_name}'.")
231
+ return total_deleted
232
+
233
+ except AzureError as e:
234
+ self.handle_azure_error(e, self.container_name, f'deleting with {self.prefix}')
235
+ raise
236
+
237
+ def list_objects(self, return_uri: bool, n_max: int = 10) -> list[str]:
238
+ """Return a list of objects found in the specified destination bucket.
239
+ Each returned object includes the full set of prefixes.
240
+ If return_uri is True, full URIs are returned; otherwise, just the object keys.
241
+ """
242
+ p = self.soa.prefix_free_uri if return_uri else ''
243
+ r: list[str] = []
244
+ try:
245
+ blob_iterator = self._get_filtered_blobs(tbl_id=None, tbl_version=None)
246
+ for blob in blob_iterator:
247
+ r.append(f'{p}{blob.name}')
248
+ if len(r) >= n_max:
249
+ return r
250
+
251
+ except AzureError as e:
252
+ self.handle_azure_error(e, self.__container_name, f'list objects from {self.__base_uri}')
253
+ return r
254
+
255
+ @classmethod
256
+ def handle_azure_error(
257
+ cls, e: 'AzureError', container_name: str, operation: str = '', *, ignore_404: bool = False
258
+ ) -> None:
259
+ from azure.core.exceptions import ClientAuthenticationError, HttpResponseError, ResourceNotFoundError
260
+
261
+ if ignore_404 and isinstance(e, ResourceNotFoundError):
262
+ return
263
+
264
+ if isinstance(e, ResourceNotFoundError):
265
+ raise excs.Error(f'Container {container_name} or blob not found during {operation}: {str(e)!r}')
266
+ elif isinstance(e, ClientAuthenticationError):
267
+ raise excs.Error(f'Authentication failed for container {container_name} during {operation}: {str(e)!r}')
268
+ elif isinstance(e, HttpResponseError):
269
+ if e.status_code == 403:
270
+ raise excs.Error(f'Access denied to container {container_name} during {operation}: {str(e)!r}')
271
+ elif e.status_code == 412:
272
+ raise excs.Error(f'Precondition failed for container {container_name} during {operation}: {str(e)!r}')
273
+ else:
274
+ raise excs.Error(
275
+ f'HTTP error during {operation} in container {container_name}: {e.status_code} - {str(e)!r}'
276
+ )
277
+ else:
278
+ raise excs.Error(f'Error during {operation} in container {container_name}: {str(e)!r}')
279
+
280
+ @classmethod
281
+ def create_client(
282
+ cls, endpoint_url: str, account_name: str | None = None, account_key: str | None = None
283
+ ) -> 'BlobServiceClient':
284
+ from azure.core.credentials import AzureNamedKeyCredential
285
+ from azure.storage.blob import BlobServiceClient # TODO: Use azure.storage.blob.aio instead
286
+
287
+ assert (account_name is None) == (account_key is None)
288
+ try:
289
+ # e.g. endpoint_url: str = f'https://{account_name}.blob.core.windows.net'
290
+ assert endpoint_url is not None, 'No Azure Storage account information provided'
291
+
292
+ # Default to no credential (None) for anonymous access; a named-key credential is set below only when an account name is given
293
+ credential = None
294
+ if account_name is not None:
295
+ credential = AzureNamedKeyCredential(name=account_name, key=account_key)
296
+ return BlobServiceClient(
297
+ account_url=endpoint_url,
298
+ credential=credential,
299
+ max_single_get_size=(32 * 2**20),
300
+ max_chunk_get_size=(4 * 2**20),
301
+ connection_timeout=15,
302
+ read_timeout=30,
303
+ )
304
+ except Exception as e:
305
+ raise excs.Error(f'Failed to create Azure Blob Storage client: {str(e)!r}') from e
pixeltable/utils/code.py CHANGED
@@ -1,12 +1,11 @@
1
1
  import types
2
- from typing import Optional
3
2
 
4
3
  from pixeltable.func import Function
5
4
 
6
5
  # Utilities related to the organization of the Pixeltable codebase.
7
6
 
8
7
 
9
- def local_public_names(mod_name: str, exclude: Optional[list[str]] = None) -> list[str]:
8
+ def local_public_names(mod_name: str, exclude: list[str] | None = None) -> list[str]:
10
9
  """
11
10
  Returns a list of all functions and submodules that are local to the specified module and are
12
11
  publicly accessible. Intended to facilitate implementation of module __dir__() methods for
pixeltable/utils/dbms.py CHANGED
@@ -29,9 +29,7 @@ class Dbms(abc.ABC):
29
29
  def default_system_db_url(self) -> str: ...
30
30
 
31
31
  @abc.abstractmethod
32
- def create_vector_index(
33
- self, index_name: str, index_value_sa_col: sql.schema.Column, conn: sql.Connection, metric: str
34
- ) -> None: ...
32
+ def sa_vector_index(self, store_index_name: str, sa_value_col: sql.schema.Column, metric: str) -> sql.Index: ...
35
33
 
36
34
 
37
35
  class PostgresqlDbms(Dbms):
@@ -52,17 +50,14 @@ class PostgresqlDbms(Dbms):
52
50
  a = self.db_url.set(database='postgres').render_as_string(hide_password=False)
53
51
  return a
54
52
 
55
- def create_vector_index(
56
- self, index_name: str, index_value_sa_col: sql.schema.Column, conn: sql.Connection, metric: str
57
- ) -> None:
58
- idx = sql.Index(
59
- index_name,
60
- index_value_sa_col,
53
+ def sa_vector_index(self, store_index_name: str, sa_value_col: sql.schema.Column, metric: str) -> sql.Index:
54
+ return sql.Index(
55
+ store_index_name,
56
+ sa_value_col,
61
57
  postgresql_using='hnsw',
62
58
  postgresql_with={'m': 16, 'ef_construction': 64},
63
- postgresql_ops={index_value_sa_col.name: metric},
59
+ postgresql_ops={sa_value_col.name: metric},
64
60
  )
65
- idx.create(bind=conn)
66
61
 
67
62
 
68
63
  class CockroachDbms(Dbms):
@@ -82,11 +77,12 @@ class CockroachDbms(Dbms):
82
77
  def default_system_db_url(self) -> str:
83
78
  return self.db_url.set(database='defaultdb').render_as_string(hide_password=False)
84
79
 
85
- def create_vector_index(
86
- self, index_name: str, index_value_sa_col: sql.schema.Column, conn: sql.Connection, metric: str
87
- ) -> None:
88
- create_index_sql = sql.text(
89
- f"""CREATE VECTOR INDEX {index_name} ON {index_value_sa_col.table.name}
90
- ({index_value_sa_col.name} {metric})"""
91
- )
92
- conn.execute(create_index_sql)
80
+ def sa_vector_index(self, store_index_name: str, sa_value_col: sql.schema.Column, metric: str) -> sql.Index:
81
+ # TODO: can the Create Index statement be generated via sqlalchemy?
82
+ # if not, change this method to create_vector_index_stmt(...) -> str
83
+ # original code:
84
+ # create_index_sql = sql.text(
85
+ # f"""CREATE VECTOR INDEX {store_index_name} ON {sa_value_col.table.name}
86
+ # ({sa_value_col.name} {metric})"""
87
+ # )
88
+ return None
@@ -1,5 +1,4 @@
1
1
  import dataclasses
2
- from typing import Optional
3
2
 
4
3
  import pandas as pd
5
4
  from pandas.io.formats.style import Styler
@@ -11,7 +10,7 @@ class _Descriptor:
11
10
  # The remaining fields only affect the behavior if `body` is a pd.DataFrame.
12
11
  show_index: bool
13
12
  show_header: bool
14
- styler: Optional[Styler] = None
13
+ styler: Styler | None = None
15
14
 
16
15
 
17
16
  class DescriptionHelper:
@@ -36,7 +35,7 @@ class DescriptionHelper:
36
35
  descriptor: str | pd.DataFrame,
37
36
  show_index: bool = False,
38
37
  show_header: bool = True,
39
- styler: Optional[Styler] = None,
38
+ styler: Styler | None = None,
40
39
  ) -> None:
41
40
  self.__descriptors.append(_Descriptor(descriptor, show_index, show_header, styler))
42
41
 
@@ -1,6 +1,5 @@
1
1
  import dataclasses
2
2
  import os
3
- from typing import Optional
4
3
 
5
4
  import bs4
6
5
  import fitz # type: ignore[import-untyped]
@@ -13,10 +12,10 @@ from pixeltable.env import Env
13
12
  @dataclasses.dataclass
14
13
  class DocumentHandle:
15
14
  format: ts.DocumentType.DocumentFormat
16
- bs_doc: Optional[bs4.BeautifulSoup] = None
17
- md_ast: Optional[dict] = None
18
- pdf_doc: Optional[fitz.Document] = None
19
- txt_doc: Optional[str] = None
15
+ bs_doc: bs4.BeautifulSoup | None = None
16
+ md_ast: dict | None = None
17
+ pdf_doc: fitz.Document | None = None
18
+ txt_doc: str | None = None
20
19
 
21
20
 
22
21
  def get_document_handle(path: str) -> DocumentHandle:
@@ -34,7 +33,7 @@ def get_document_handle(path: str) -> DocumentHandle:
34
33
  raise excs.Error(f'Unrecognized document format: {path}')
35
34
 
36
35
 
37
- def get_handle_by_extension(path: str, extension: str) -> Optional[DocumentHandle]:
36
+ def get_handle_by_extension(path: str, extension: str) -> DocumentHandle | None:
38
37
  doc_format = ts.DocumentType.DocumentFormat.from_extension(extension)
39
38
 
40
39
  try:
@@ -1,12 +1,12 @@
1
1
  import logging
2
- from typing import Any, Callable, Optional, TypeVar
2
+ from typing import Any, Callable, TypeVar
3
3
 
4
4
  R = TypeVar('R')
5
5
 
6
6
  logger = logging.getLogger('pixeltable')
7
7
 
8
8
 
9
- def run_cleanup(cleanup_func: Callable[..., R], *args: Any, raise_error: bool = True, **kwargs: Any) -> Optional[R]:
9
+ def run_cleanup(cleanup_func: Callable[..., R], *args: Any, raise_error: bool = True, **kwargs: Any) -> R | None:
10
10
  """
11
11
  Runs a cleanup function. If interrupted, retry cleanup.
12
12
  The `run_cleanup()` function ensures that the `cleanup_func()` function executes at least once.
@@ -9,7 +9,7 @@ from collections import OrderedDict, defaultdict
9
9
  from dataclasses import dataclass
10
10
  from datetime import datetime, timezone
11
11
  from pathlib import Path
12
- from typing import NamedTuple, Optional
12
+ from typing import NamedTuple
13
13
  from uuid import UUID
14
14
 
15
15
  import pixeltable.exceptions as excs
@@ -58,7 +58,7 @@ class FileCache:
58
58
  - implement MRU eviction for queries that exceed the capacity
59
59
  """
60
60
 
61
- __instance: Optional[FileCache] = None
61
+ __instance: FileCache | None = None
62
62
 
63
63
  cache: OrderedDict[str, CacheEntry]
64
64
  total_size: int
@@ -126,12 +126,12 @@ class FileCache:
126
126
  return 0
127
127
  return int(self.total_size / len(self.cache))
128
128
 
129
- def num_files(self, tbl_id: Optional[UUID] = None) -> int:
129
+ def num_files(self, tbl_id: UUID | None = None) -> int:
130
130
  if tbl_id is None:
131
131
  return len(self.cache)
132
132
  return sum(e.tbl_id == tbl_id for e in self.cache.values())
133
133
 
134
- def clear(self, tbl_id: Optional[UUID] = None) -> None:
134
+ def clear(self, tbl_id: UUID | None = None) -> None:
135
135
  """
136
136
  For testing purposes: allow resetting capacity and stats.
137
137
  """
@@ -174,7 +174,7 @@ class FileCache:
174
174
  h.update(url.encode())
175
175
  return h.hexdigest()
176
176
 
177
- def lookup(self, url: str) -> Optional[Path]:
177
+ def lookup(self, url: str) -> Path | None:
178
178
  self.num_requests += 1
179
179
  key = self._url_hash(url)
180
180
  entry = self.cache.get(key, None)