pixeltable 0.2.20__py3-none-any.whl → 0.2.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +7 -19
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +7 -7
- pixeltable/catalog/column.py +37 -11
- pixeltable/catalog/globals.py +21 -0
- pixeltable/catalog/insertable_table.py +6 -4
- pixeltable/catalog/table.py +227 -148
- pixeltable/catalog/table_version.py +66 -28
- pixeltable/catalog/table_version_path.py +0 -8
- pixeltable/catalog/view.py +18 -19
- pixeltable/dataframe.py +16 -32
- pixeltable/env.py +6 -1
- pixeltable/exec/__init__.py +1 -2
- pixeltable/exec/aggregation_node.py +27 -17
- pixeltable/exec/cache_prefetch_node.py +1 -1
- pixeltable/exec/data_row_batch.py +9 -26
- pixeltable/exec/exec_node.py +36 -7
- pixeltable/exec/expr_eval_node.py +19 -11
- pixeltable/exec/in_memory_data_node.py +14 -11
- pixeltable/exec/sql_node.py +266 -138
- pixeltable/exprs/__init__.py +1 -0
- pixeltable/exprs/arithmetic_expr.py +3 -1
- pixeltable/exprs/array_slice.py +7 -7
- pixeltable/exprs/column_property_ref.py +37 -10
- pixeltable/exprs/column_ref.py +93 -14
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +8 -7
- pixeltable/exprs/data_row.py +56 -36
- pixeltable/exprs/expr.py +65 -63
- pixeltable/exprs/expr_dict.py +55 -0
- pixeltable/exprs/expr_set.py +26 -15
- pixeltable/exprs/function_call.py +53 -24
- pixeltable/exprs/globals.py +4 -1
- pixeltable/exprs/in_predicate.py +8 -7
- pixeltable/exprs/inline_expr.py +4 -4
- pixeltable/exprs/is_null.py +4 -4
- pixeltable/exprs/json_mapper.py +11 -12
- pixeltable/exprs/json_path.py +5 -10
- pixeltable/exprs/literal.py +5 -5
- pixeltable/exprs/method_ref.py +5 -4
- pixeltable/exprs/object_ref.py +2 -1
- pixeltable/exprs/row_builder.py +88 -36
- pixeltable/exprs/rowid_ref.py +14 -13
- pixeltable/exprs/similarity_expr.py +12 -7
- pixeltable/exprs/sql_element_cache.py +12 -6
- pixeltable/exprs/type_cast.py +8 -6
- pixeltable/exprs/variable.py +5 -4
- pixeltable/ext/functions/whisperx.py +7 -2
- pixeltable/func/aggregate_function.py +1 -1
- pixeltable/func/callable_function.py +2 -2
- pixeltable/func/function.py +11 -10
- pixeltable/func/function_registry.py +6 -7
- pixeltable/func/query_template_function.py +11 -12
- pixeltable/func/signature.py +17 -15
- pixeltable/func/udf.py +0 -4
- pixeltable/functions/__init__.py +2 -2
- pixeltable/functions/audio.py +4 -6
- pixeltable/functions/globals.py +84 -42
- pixeltable/functions/huggingface.py +31 -34
- pixeltable/functions/image.py +59 -45
- pixeltable/functions/json.py +0 -1
- pixeltable/functions/llama_cpp.py +106 -0
- pixeltable/functions/mistralai.py +2 -2
- pixeltable/functions/ollama.py +147 -0
- pixeltable/functions/openai.py +22 -25
- pixeltable/functions/replicate.py +72 -0
- pixeltable/functions/string.py +59 -50
- pixeltable/functions/timestamp.py +20 -20
- pixeltable/functions/together.py +2 -2
- pixeltable/functions/video.py +11 -20
- pixeltable/functions/whisper.py +2 -20
- pixeltable/globals.py +65 -74
- pixeltable/index/base.py +2 -2
- pixeltable/index/btree.py +20 -7
- pixeltable/index/embedding_index.py +12 -14
- pixeltable/io/__init__.py +1 -2
- pixeltable/io/external_store.py +11 -5
- pixeltable/io/fiftyone.py +178 -0
- pixeltable/io/globals.py +98 -2
- pixeltable/io/hf_datasets.py +1 -1
- pixeltable/io/label_studio.py +6 -6
- pixeltable/io/parquet.py +14 -13
- pixeltable/iterators/base.py +3 -2
- pixeltable/iterators/document.py +10 -8
- pixeltable/iterators/video.py +126 -60
- pixeltable/metadata/__init__.py +4 -3
- pixeltable/metadata/converters/convert_14.py +4 -2
- pixeltable/metadata/converters/convert_15.py +1 -1
- pixeltable/metadata/converters/convert_19.py +1 -0
- pixeltable/metadata/converters/convert_20.py +1 -1
- pixeltable/metadata/converters/convert_21.py +34 -0
- pixeltable/metadata/converters/util.py +54 -12
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +40 -21
- pixeltable/plan.py +149 -165
- pixeltable/py.typed +0 -0
- pixeltable/store.py +57 -37
- pixeltable/tool/create_test_db_dump.py +6 -6
- pixeltable/tool/create_test_video.py +1 -1
- pixeltable/tool/doc_plugins/griffe.py +3 -34
- pixeltable/tool/embed_udf.py +1 -1
- pixeltable/tool/mypy_plugin.py +55 -0
- pixeltable/type_system.py +260 -61
- pixeltable/utils/arrow.py +10 -9
- pixeltable/utils/coco.py +4 -4
- pixeltable/utils/documents.py +16 -2
- pixeltable/utils/filecache.py +9 -9
- pixeltable/utils/formatter.py +10 -11
- pixeltable/utils/http_server.py +2 -5
- pixeltable/utils/media_store.py +6 -6
- pixeltable/utils/pytorch.py +10 -11
- pixeltable/utils/sql.py +2 -1
- {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/METADATA +50 -13
- pixeltable-0.2.22.dist-info/RECORD +153 -0
- pixeltable/exec/media_validation_node.py +0 -43
- pixeltable/utils/help.py +0 -11
- pixeltable-0.2.20.dist-info/RECORD +0 -147
- {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/WHEEL +0 -0
- {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/entry_points.txt +0 -0
pixeltable/functions/whisper.py
CHANGED
|
@@ -14,27 +14,9 @@ from pixeltable.env import Env
|
|
|
14
14
|
if TYPE_CHECKING:
|
|
15
15
|
from whisper import Whisper # type: ignore[import-untyped]
|
|
16
16
|
|
|
17
|
-
|
|
18
|
-
@pxt.udf(
|
|
19
|
-
param_types=[
|
|
20
|
-
pxt.AudioType(),
|
|
21
|
-
pxt.StringType(),
|
|
22
|
-
pxt.JsonType(nullable=True),
|
|
23
|
-
pxt.FloatType(nullable=True),
|
|
24
|
-
pxt.FloatType(nullable=True),
|
|
25
|
-
pxt.FloatType(nullable=True),
|
|
26
|
-
pxt.BoolType(),
|
|
27
|
-
pxt.StringType(nullable=True),
|
|
28
|
-
pxt.BoolType(),
|
|
29
|
-
pxt.StringType(),
|
|
30
|
-
pxt.StringType(),
|
|
31
|
-
pxt.StringType(),
|
|
32
|
-
pxt.FloatType(nullable=True),
|
|
33
|
-
pxt.JsonType(nullable=True),
|
|
34
|
-
]
|
|
35
|
-
)
|
|
17
|
+
@pxt.udf
|
|
36
18
|
def transcribe(
|
|
37
|
-
audio:
|
|
19
|
+
audio: pxt.Audio,
|
|
38
20
|
*,
|
|
39
21
|
model: str,
|
|
40
22
|
temperature: Optional[list[float]] = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
|
pixeltable/globals.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import dataclasses
|
|
2
2
|
import logging
|
|
3
|
-
from typing import Any, Iterable, Optional, Union
|
|
3
|
+
from typing import Any, Iterable, Optional, Union, Literal
|
|
4
4
|
from uuid import UUID
|
|
5
5
|
|
|
6
6
|
import pandas as pd
|
|
@@ -33,6 +33,7 @@ def create_table(
|
|
|
33
33
|
primary_key: Optional[Union[str, list[str]]] = None,
|
|
34
34
|
num_retained_versions: int = 10,
|
|
35
35
|
comment: str = '',
|
|
36
|
+
media_validation: Literal['on_read', 'on_write'] = 'on_write'
|
|
36
37
|
) -> catalog.Table:
|
|
37
38
|
"""Create a new base table.
|
|
38
39
|
|
|
@@ -44,6 +45,9 @@ def create_table(
|
|
|
44
45
|
table.
|
|
45
46
|
num_retained_versions: Number of versions of the table to retain.
|
|
46
47
|
comment: An optional comment; its meaning is user-defined.
|
|
48
|
+
media_validation: Media validation policy for the table.
|
|
49
|
+
- `'on_read'`: validate media files at query time
|
|
50
|
+
- `'on_write'`: validate media files during insert/update operations
|
|
47
51
|
|
|
48
52
|
Returns:
|
|
49
53
|
A handle to the newly created [`Table`][pixeltable.Table].
|
|
@@ -54,11 +58,13 @@ def create_table(
|
|
|
54
58
|
Examples:
|
|
55
59
|
Create a table with an int and a string column:
|
|
56
60
|
|
|
57
|
-
>>>
|
|
61
|
+
>>> tbl = pxt.create_table('my_table', schema={'col1': pxt.Int, 'col2': pxt.String})
|
|
58
62
|
|
|
59
|
-
Create a table from a select statement over an existing table `
|
|
63
|
+
Create a table from a select statement over an existing table `orig_table` (this will create a new table
|
|
64
|
+
containing the exact contents of the query):
|
|
60
65
|
|
|
61
|
-
>>>
|
|
66
|
+
>>> tbl1 = pxt.get_table('orig_table')
|
|
67
|
+
... tbl2 = pxt.create_table('new_table', tbl1.where(tbl1.col1 < 10).select(tbl1.col2))
|
|
62
68
|
"""
|
|
63
69
|
path = catalog.Path(path_str)
|
|
64
70
|
Catalog.get().paths.check_is_valid(path, expected=None)
|
|
@@ -87,14 +93,8 @@ def create_table(
|
|
|
87
93
|
raise excs.Error('primary_key must be a single column name or a list of column names')
|
|
88
94
|
|
|
89
95
|
tbl = catalog.InsertableTable._create(
|
|
90
|
-
dir._id,
|
|
91
|
-
|
|
92
|
-
schema,
|
|
93
|
-
df,
|
|
94
|
-
primary_key=primary_key,
|
|
95
|
-
num_retained_versions=num_retained_versions,
|
|
96
|
-
comment=comment,
|
|
97
|
-
)
|
|
96
|
+
dir._id, path.name, schema, df, primary_key=primary_key, num_retained_versions=num_retained_versions,
|
|
97
|
+
comment=comment, media_validation=catalog.MediaValidation.validated(media_validation, 'media_validation'))
|
|
98
98
|
Catalog.get().paths[path] = tbl
|
|
99
99
|
|
|
100
100
|
_logger.info(f'Created table `{path_str}`.')
|
|
@@ -105,22 +105,24 @@ def create_view(
|
|
|
105
105
|
path_str: str,
|
|
106
106
|
base: Union[catalog.Table, DataFrame],
|
|
107
107
|
*,
|
|
108
|
-
|
|
109
|
-
filter: Optional[exprs.Expr] = None,
|
|
108
|
+
additional_columns: Optional[dict[str, Any]] = None,
|
|
110
109
|
is_snapshot: bool = False,
|
|
111
110
|
iterator: Optional[tuple[type[ComponentIterator], dict[str, Any]]] = None,
|
|
112
111
|
num_retained_versions: int = 10,
|
|
113
112
|
comment: str = '',
|
|
113
|
+
media_validation: Literal['on_read', 'on_write'] = 'on_write',
|
|
114
114
|
ignore_errors: bool = False,
|
|
115
115
|
) -> Optional[catalog.Table]:
|
|
116
116
|
"""Create a view of an existing table object (which itself can be a view or a snapshot or a base table).
|
|
117
117
|
|
|
118
118
|
Args:
|
|
119
|
-
path_str:
|
|
119
|
+
path_str: A name for the view; can be either a simple name such as `my_view`, or a pathname such as
|
|
120
|
+
`dir1.my_view`.
|
|
120
121
|
base: [`Table`][pixeltable.Table] (i.e., table or view or snapshot) or [`DataFrame`][pixeltable.DataFrame] to
|
|
121
122
|
base the view on.
|
|
122
|
-
|
|
123
|
-
|
|
123
|
+
additional_columns: If specified, will add these columns to the view once it is created. The format
|
|
124
|
+
of the `additional_columns` parameter is identical to the format of the `schema_or_df` parameter in
|
|
125
|
+
[`create_table`][pixeltable.create_table].
|
|
124
126
|
is_snapshot: Whether the view is a snapshot.
|
|
125
127
|
iterator: The iterator to use for this view. If specified, then this view will be a one-to-many view of
|
|
126
128
|
the base table.
|
|
@@ -130,36 +132,29 @@ def create_view(
|
|
|
130
132
|
|
|
131
133
|
Returns:
|
|
132
134
|
A handle to the [`Table`][pixeltable.Table] representing the newly created view. If the path already
|
|
133
|
-
|
|
135
|
+
exists or is invalid and `ignore_errors=True`, returns `None`.
|
|
134
136
|
|
|
135
137
|
Raises:
|
|
136
138
|
Error: if the path already exists or is invalid and `ignore_errors=False`.
|
|
137
139
|
|
|
138
140
|
Examples:
|
|
139
|
-
Create a view
|
|
140
|
-
|
|
141
|
-
>>> view = cl.create_view(
|
|
142
|
-
'my_view', base, schema={'col3': IntType(), 'col4': StringType()}, filter=base.col1 > 10)
|
|
141
|
+
Create a view `my_view` of an existing table `my_table`, filtering on rows where `col1` is greater than 10:
|
|
143
142
|
|
|
144
|
-
|
|
143
|
+
>>> tbl = pxt.get_table('my_table')
|
|
144
|
+
... view = pxt.create_view('my_view', tbl.where(tbl.col1 > 10))
|
|
145
145
|
|
|
146
|
-
|
|
146
|
+
Create a snapshot of `my_table`:
|
|
147
147
|
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
>>> snapshot_view = cl.create_view(
|
|
151
|
-
'my_snapshot', base, schema={'col3': base.col2 + 1}, filter=base.col1 > 10, is_snapshot=True)
|
|
148
|
+
>>> tbl = pxt.get_table('my_table')
|
|
149
|
+
... snapshot_view = pxt.create_view('my_snapshot_view', tbl, is_snapshot=True)
|
|
152
150
|
"""
|
|
151
|
+
where: Optional[exprs.Expr] = None
|
|
153
152
|
if isinstance(base, catalog.Table):
|
|
154
153
|
tbl_version_path = base._tbl_version_path
|
|
155
154
|
elif isinstance(base, DataFrame):
|
|
156
155
|
base._validate_mutable('create_view')
|
|
157
156
|
tbl_version_path = base.tbl
|
|
158
|
-
|
|
159
|
-
raise excs.Error(
|
|
160
|
-
'Cannot specify a `filter` directly if one is already declared in a `DataFrame.where` clause'
|
|
161
|
-
)
|
|
162
|
-
filter = base.where_clause
|
|
157
|
+
where = base.where_clause
|
|
163
158
|
else:
|
|
164
159
|
raise excs.Error('`base` must be an instance of `Table` or `DataFrame`')
|
|
165
160
|
assert isinstance(base, catalog.Table) or isinstance(base, DataFrame)
|
|
@@ -173,25 +168,18 @@ def create_view(
|
|
|
173
168
|
raise e
|
|
174
169
|
dir = Catalog.get().paths[path.parent]
|
|
175
170
|
|
|
176
|
-
if
|
|
177
|
-
|
|
171
|
+
if additional_columns is None:
|
|
172
|
+
additional_columns = {}
|
|
178
173
|
if iterator is None:
|
|
179
174
|
iterator_class, iterator_args = None, None
|
|
180
175
|
else:
|
|
181
176
|
iterator_class, iterator_args = iterator
|
|
182
177
|
|
|
183
178
|
view = catalog.View._create(
|
|
184
|
-
dir._id,
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
predicate=filter,
|
|
189
|
-
is_snapshot=is_snapshot,
|
|
190
|
-
iterator_cls=iterator_class,
|
|
191
|
-
iterator_args=iterator_args,
|
|
192
|
-
num_retained_versions=num_retained_versions,
|
|
193
|
-
comment=comment,
|
|
194
|
-
)
|
|
179
|
+
dir._id, path.name, base=tbl_version_path, additional_columns=additional_columns, predicate=where,
|
|
180
|
+
is_snapshot=is_snapshot, iterator_cls=iterator_class, iterator_args=iterator_args,
|
|
181
|
+
num_retained_versions=num_retained_versions, comment=comment,
|
|
182
|
+
media_validation=catalog.MediaValidation.validated(media_validation, 'media_validation'))
|
|
195
183
|
Catalog.get().paths[path] = view
|
|
196
184
|
_logger.info(f'Created view `{path_str}`.')
|
|
197
185
|
FileCache.get().emit_eviction_warnings()
|
|
@@ -199,7 +187,7 @@ def create_view(
|
|
|
199
187
|
|
|
200
188
|
|
|
201
189
|
def get_table(path: str) -> catalog.Table:
|
|
202
|
-
"""Get a handle to an existing table
|
|
190
|
+
"""Get a handle to an existing table, view, or snapshot.
|
|
203
191
|
|
|
204
192
|
Args:
|
|
205
193
|
path: Path to the table.
|
|
@@ -213,15 +201,15 @@ def get_table(path: str) -> catalog.Table:
|
|
|
213
201
|
Examples:
|
|
214
202
|
Get handle for a table in the top-level directory:
|
|
215
203
|
|
|
216
|
-
>>>
|
|
204
|
+
>>> tbl = pxt.get_table('my_table')
|
|
217
205
|
|
|
218
206
|
For a table in a subdirectory:
|
|
219
207
|
|
|
220
|
-
>>>
|
|
208
|
+
>>> tbl = pxt.get_table('subdir.my_table')
|
|
221
209
|
|
|
222
|
-
|
|
210
|
+
Handles to views and snapshots are retrieved in the same way:
|
|
223
211
|
|
|
224
|
-
>>>
|
|
212
|
+
>>> tbl = pxt.get_table('my_snapshot')
|
|
225
213
|
"""
|
|
226
214
|
p = catalog.Path(path)
|
|
227
215
|
Catalog.get().paths.check_is_valid(p, expected=catalog.Table)
|
|
@@ -243,11 +231,11 @@ def move(path: str, new_path: str) -> None:
|
|
|
243
231
|
Examples:
|
|
244
232
|
Move a table to a different directory:
|
|
245
233
|
|
|
246
|
-
>>>>
|
|
234
|
+
>>> pxt.move('dir1.my_table', 'dir2.my_table')
|
|
247
235
|
|
|
248
236
|
Rename a table:
|
|
249
237
|
|
|
250
|
-
>>>>
|
|
238
|
+
>>> pxt.move('dir1.my_table', 'dir1.new_name')
|
|
251
239
|
"""
|
|
252
240
|
p = catalog.Path(path)
|
|
253
241
|
Catalog.get().paths.check_is_valid(p, expected=catalog.SchemaObject)
|
|
@@ -260,18 +248,18 @@ def move(path: str, new_path: str) -> None:
|
|
|
260
248
|
|
|
261
249
|
|
|
262
250
|
def drop_table(path: str, force: bool = False, ignore_errors: bool = False) -> None:
|
|
263
|
-
"""Drop a table
|
|
251
|
+
"""Drop a table, view, or snapshot.
|
|
264
252
|
|
|
265
253
|
Args:
|
|
266
254
|
path: Path to the [`Table`][pixeltable.Table].
|
|
267
|
-
force: If `True`, will also drop all views
|
|
268
|
-
ignore_errors:
|
|
255
|
+
force: If `True`, will also drop all views and sub-views of this table.
|
|
256
|
+
ignore_errors: If `True`, return silently if the table does not exist (without throwing an exception).
|
|
269
257
|
|
|
270
258
|
Raises:
|
|
271
|
-
Error: If the path does not exist or does not designate a table object and ignore_errors
|
|
259
|
+
Error: If the path does not exist or does not designate a table object, and `ignore_errors=False`.
|
|
272
260
|
|
|
273
261
|
Examples:
|
|
274
|
-
>>>
|
|
262
|
+
>>> pxt.drop_table('my_table')
|
|
275
263
|
"""
|
|
276
264
|
cat = Catalog.get()
|
|
277
265
|
path_obj = catalog.Path(path)
|
|
@@ -302,7 +290,8 @@ def list_tables(dir_path: str = '', recursive: bool = True) -> list[str]:
|
|
|
302
290
|
|
|
303
291
|
Args:
|
|
304
292
|
dir_path: Path to the directory. Defaults to the root directory.
|
|
305
|
-
recursive:
|
|
293
|
+
recursive: If `False`, returns only those tables that are directly contained in the specified directory; if
|
|
294
|
+
`True`, returns all tables that are descendants of the specified directory, recursively.
|
|
306
295
|
|
|
307
296
|
Returns:
|
|
308
297
|
A list of [`Table`][pixeltable.Table] paths.
|
|
@@ -313,13 +302,11 @@ def list_tables(dir_path: str = '', recursive: bool = True) -> list[str]:
|
|
|
313
302
|
Examples:
|
|
314
303
|
List tables in top-level directory:
|
|
315
304
|
|
|
316
|
-
>>>
|
|
317
|
-
['my_table', ...]
|
|
305
|
+
>>> pxt.list_tables()
|
|
318
306
|
|
|
319
307
|
List tables in 'dir1':
|
|
320
308
|
|
|
321
|
-
>>>
|
|
322
|
-
[...]
|
|
309
|
+
>>> pxt.list_tables('dir1')
|
|
323
310
|
"""
|
|
324
311
|
assert dir_path is not None
|
|
325
312
|
path = catalog.Path(dir_path, empty_is_valid=True)
|
|
@@ -332,17 +319,17 @@ def create_dir(path_str: str, ignore_errors: bool = False) -> Optional[catalog.D
|
|
|
332
319
|
|
|
333
320
|
Args:
|
|
334
321
|
path_str: Path to the directory.
|
|
335
|
-
ignore_errors: if True
|
|
322
|
+
ignore_errors: if `True`, will return silently instead of throwing an exception if an error occurs.
|
|
336
323
|
|
|
337
324
|
Raises:
|
|
338
|
-
Error: If the path already exists or the parent is not a directory
|
|
325
|
+
Error: If the path already exists or the parent is not a directory, and `ignore_errors=False`.
|
|
339
326
|
|
|
340
327
|
Examples:
|
|
341
|
-
>>>
|
|
328
|
+
>>> pxt.create_dir('my_dir')
|
|
342
329
|
|
|
343
330
|
Create a subdirectory:
|
|
344
331
|
|
|
345
|
-
>>>
|
|
332
|
+
>>> pxt.create_dir('my_dir.sub_dir')
|
|
346
333
|
"""
|
|
347
334
|
try:
|
|
348
335
|
path = catalog.Path(path_str)
|
|
@@ -373,17 +360,21 @@ def drop_dir(path_str: str, force: bool = False, ignore_errors: bool = False) ->
|
|
|
373
360
|
"""Remove a directory.
|
|
374
361
|
|
|
375
362
|
Args:
|
|
376
|
-
path_str:
|
|
363
|
+
path_str: Name or path of the directory.
|
|
364
|
+
force: If `True`, will also drop all tables and subdirectories of this directory, recursively, along
|
|
365
|
+
with any views or snapshots that depend on any of the dropped tables.
|
|
366
|
+
ignore_errors: if `True`, will return silently instead of throwing an exception if the directory
|
|
367
|
+
does not exist.
|
|
377
368
|
|
|
378
369
|
Raises:
|
|
379
|
-
Error: If the path does not exist or does not designate a directory or if the directory is not empty.
|
|
370
|
+
Error: If the path does not exist or does not designate a directory, or if the directory is not empty.
|
|
380
371
|
|
|
381
372
|
Examples:
|
|
382
|
-
>>>
|
|
373
|
+
>>> pxt.drop_dir('my_dir')
|
|
383
374
|
|
|
384
375
|
Remove a subdirectory:
|
|
385
376
|
|
|
386
|
-
>>>
|
|
377
|
+
>>> pxt.drop_dir('my_dir.sub_dir')
|
|
387
378
|
"""
|
|
388
379
|
cat = Catalog.get()
|
|
389
380
|
path = catalog.Path(path_str)
|
|
@@ -428,14 +419,14 @@ def list_dirs(path_str: str = '', recursive: bool = True) -> list[str]:
|
|
|
428
419
|
"""List the directories in a directory.
|
|
429
420
|
|
|
430
421
|
Args:
|
|
431
|
-
path_str:
|
|
432
|
-
recursive:
|
|
422
|
+
path_str: Name or path of the directory.
|
|
423
|
+
recursive: If `True`, lists all descendants of this directory recursively.
|
|
433
424
|
|
|
434
425
|
Returns:
|
|
435
426
|
List of directory paths.
|
|
436
427
|
|
|
437
428
|
Raises:
|
|
438
|
-
Error: If
|
|
429
|
+
Error: If `path_str` does not exist or does not designate a directory.
|
|
439
430
|
|
|
440
431
|
Examples:
|
|
441
432
|
>>> pxt.list_dirs('my_dir', recursive=True)
|
pixeltable/index/base.py
CHANGED
|
@@ -5,7 +5,7 @@ from typing import Any
|
|
|
5
5
|
|
|
6
6
|
import sqlalchemy as sql
|
|
7
7
|
|
|
8
|
-
|
|
8
|
+
from pixeltable import catalog, exprs
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
class IndexBase(abc.ABC):
|
|
@@ -22,7 +22,7 @@ class IndexBase(abc.ABC):
|
|
|
22
22
|
pass
|
|
23
23
|
|
|
24
24
|
@abc.abstractmethod
|
|
25
|
-
def index_value_expr(self) ->
|
|
25
|
+
def index_value_expr(self) -> exprs.Expr:
|
|
26
26
|
"""Return expression that computes the value that goes into the index"""
|
|
27
27
|
pass
|
|
28
28
|
|
pixeltable/index/btree.py
CHANGED
|
@@ -1,13 +1,16 @@
|
|
|
1
|
-
from typing import Optional
|
|
1
|
+
from typing import Optional, TYPE_CHECKING
|
|
2
2
|
|
|
3
3
|
import sqlalchemy as sql
|
|
4
4
|
|
|
5
5
|
# TODO: why does this import result in a circular import, but the one in embedding_index.py doesn't?
|
|
6
|
-
#import pixeltable.catalog as catalog
|
|
6
|
+
# import pixeltable.catalog as catalog
|
|
7
7
|
import pixeltable.exceptions as excs
|
|
8
|
-
|
|
8
|
+
from pixeltable import catalog, exprs
|
|
9
|
+
from pixeltable.func.udf import udf
|
|
9
10
|
from .base import IndexBase
|
|
10
11
|
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
import pixeltable.exprs
|
|
11
14
|
|
|
12
15
|
class BtreeIndex(IndexBase):
|
|
13
16
|
"""
|
|
@@ -15,7 +18,10 @@ class BtreeIndex(IndexBase):
|
|
|
15
18
|
"""
|
|
16
19
|
MAX_STRING_LEN = 256
|
|
17
20
|
|
|
18
|
-
|
|
21
|
+
value_expr: 'pixeltable.exprs.Expr'
|
|
22
|
+
|
|
23
|
+
@staticmethod
|
|
24
|
+
@udf
|
|
19
25
|
def str_filter(s: Optional[str]) -> Optional[str]:
|
|
20
26
|
if s is None:
|
|
21
27
|
return None
|
|
@@ -24,10 +30,16 @@ class BtreeIndex(IndexBase):
|
|
|
24
30
|
def __init__(self, c: 'catalog.Column'):
|
|
25
31
|
if not c.col_type.is_scalar_type() and not c.col_type.is_media_type():
|
|
26
32
|
raise excs.Error(f'Index on column {c.name}: B-tree index requires scalar or media type, got {c.col_type}')
|
|
27
|
-
|
|
28
|
-
|
|
33
|
+
if c.col_type.is_media_type():
|
|
34
|
+
# an index on a media column is an index on the file url
|
|
35
|
+
# no validation for media columns: we're only interested in the string value
|
|
36
|
+
self.value_expr = exprs.ColumnRef(c, perform_validation=False)
|
|
37
|
+
else:
|
|
38
|
+
self.value_expr = (
|
|
39
|
+
BtreeIndex.str_filter(exprs.ColumnRef(c)) if c.col_type.is_string_type() else exprs.ColumnRef(c)
|
|
40
|
+
)
|
|
29
41
|
|
|
30
|
-
def index_value_expr(self) -> '
|
|
42
|
+
def index_value_expr(self) -> 'exprs.Expr':
|
|
31
43
|
return self.value_expr
|
|
32
44
|
|
|
33
45
|
def records_value_errors(self) -> bool:
|
|
@@ -52,3 +64,4 @@ class BtreeIndex(IndexBase):
|
|
|
52
64
|
@classmethod
|
|
53
65
|
def from_dict(cls, c: 'catalog.Column', d: dict) -> 'BtreeIndex':
|
|
54
66
|
return cls(c)
|
|
67
|
+
|
|
@@ -1,18 +1,17 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from typing import Optional, Any
|
|
4
3
|
import enum
|
|
4
|
+
from typing import Any, Optional
|
|
5
5
|
|
|
6
|
-
import PIL.Image
|
|
7
6
|
import numpy as np
|
|
8
|
-
import pgvector.sqlalchemy
|
|
7
|
+
import pgvector.sqlalchemy # type: ignore[import-untyped]
|
|
9
8
|
import PIL.Image
|
|
10
9
|
import sqlalchemy as sql
|
|
11
10
|
|
|
12
|
-
import pixeltable.catalog as catalog
|
|
13
11
|
import pixeltable.exceptions as excs
|
|
14
|
-
import pixeltable.func as func
|
|
15
12
|
import pixeltable.type_system as ts
|
|
13
|
+
from pixeltable import catalog, exprs, func
|
|
14
|
+
|
|
16
15
|
from .base import IndexBase
|
|
17
16
|
|
|
18
17
|
|
|
@@ -58,16 +57,15 @@ class EmbeddingIndex(IndexBase):
|
|
|
58
57
|
self._validate_embedding_fn(image_embed, 'image_embed', ts.ColumnType.Type.IMAGE)
|
|
59
58
|
|
|
60
59
|
self.metric = self.Metric[metric.upper()]
|
|
61
|
-
|
|
62
|
-
self.value_expr
|
|
63
|
-
assert self.value_expr.col_type.is_array_type()
|
|
60
|
+
self.value_expr = string_embed(exprs.ColumnRef(c)) if c.col_type.is_string_type() else image_embed(exprs.ColumnRef(c))
|
|
61
|
+
assert isinstance(self.value_expr.col_type, ts.ArrayType)
|
|
64
62
|
self.string_embed = string_embed
|
|
65
63
|
self.image_embed = image_embed
|
|
66
64
|
vector_size = self.value_expr.col_type.shape[0]
|
|
67
65
|
assert vector_size is not None
|
|
68
66
|
self.index_col_type = pgvector.sqlalchemy.Vector(vector_size)
|
|
69
67
|
|
|
70
|
-
def index_value_expr(self) ->
|
|
68
|
+
def index_value_expr(self) -> exprs.Expr:
|
|
71
69
|
"""Return expression that computes the value that goes into the index"""
|
|
72
70
|
return self.value_expr
|
|
73
71
|
|
|
@@ -88,8 +86,8 @@ class EmbeddingIndex(IndexBase):
|
|
|
88
86
|
)
|
|
89
87
|
idx.create(bind=conn)
|
|
90
88
|
|
|
91
|
-
def similarity_clause(self, val_column: catalog.Column, item: Any) -> sql.
|
|
92
|
-
"""Create a
|
|
89
|
+
def similarity_clause(self, val_column: catalog.Column, item: Any) -> sql.ColumnElement:
|
|
90
|
+
"""Create a ColumnElement that represents '<val_column> <op> <item>'"""
|
|
93
91
|
assert isinstance(item, (str, PIL.Image.Image))
|
|
94
92
|
if isinstance(item, str):
|
|
95
93
|
assert self.string_embed is not None
|
|
@@ -106,8 +104,8 @@ class EmbeddingIndex(IndexBase):
|
|
|
106
104
|
assert self.metric == self.Metric.L2
|
|
107
105
|
return val_column.sa_col.l2_distance(embedding)
|
|
108
106
|
|
|
109
|
-
def order_by_clause(self, val_column: catalog.Column, item: Any, is_asc: bool) -> sql.
|
|
110
|
-
"""Create a
|
|
107
|
+
def order_by_clause(self, val_column: catalog.Column, item: Any, is_asc: bool) -> sql.ColumnElement:
|
|
108
|
+
"""Create a ColumnElement that is used in an ORDER BY clause"""
|
|
111
109
|
assert isinstance(item, (str, PIL.Image.Image))
|
|
112
110
|
embedding: Optional[np.ndarray] = None
|
|
113
111
|
if isinstance(item, str):
|
|
@@ -151,7 +149,7 @@ class EmbeddingIndex(IndexBase):
|
|
|
151
149
|
img = PIL.Image.new('RGB', (512, 512))
|
|
152
150
|
return_type = embed_fn.call_return_type({param_name: img})
|
|
153
151
|
assert return_type is not None
|
|
154
|
-
if not return_type.
|
|
152
|
+
if not isinstance(return_type, ts.ArrayType):
|
|
155
153
|
raise excs.Error(f'{name} must return an array, but returns {return_type}')
|
|
156
154
|
else:
|
|
157
155
|
shape = return_type.shape
|
pixeltable/io/__init__.py
CHANGED
|
@@ -1,10 +1,9 @@
|
|
|
1
1
|
from .external_store import ExternalStore, SyncStatus
|
|
2
|
-
from .globals import create_label_studio_project,
|
|
2
|
+
from .globals import create_label_studio_project, export_images_as_fo_dataset, import_json, import_rows
|
|
3
3
|
from .hf_datasets import import_huggingface_dataset
|
|
4
4
|
from .pandas import import_csv, import_excel, import_pandas
|
|
5
5
|
from .parquet import import_parquet
|
|
6
6
|
|
|
7
|
-
|
|
8
7
|
__default_dir = set(symbol for symbol in dir() if not symbol.startswith('_'))
|
|
9
8
|
__removed_symbols = {'globals', 'hf_datasets', 'pandas', 'parquet'}
|
|
10
9
|
__all__ = sorted(list(__default_dir - __removed_symbols))
|
pixeltable/io/external_store.py
CHANGED
|
@@ -69,6 +69,9 @@ class Project(ExternalStore, abc.ABC):
|
|
|
69
69
|
An `ExternalStore` that represents a labeling project. Extends `ExternalStore` with a few
|
|
70
70
|
additional capabilities specific to such projects.
|
|
71
71
|
"""
|
|
72
|
+
|
|
73
|
+
stored_proxies: dict[Column, Column]
|
|
74
|
+
|
|
72
75
|
def __init__(self, name: str, col_mapping: dict[Column, str], stored_proxies: Optional[dict[Column, Column]]):
|
|
73
76
|
super().__init__(name)
|
|
74
77
|
self._col_mapping = col_mapping
|
|
@@ -116,7 +119,7 @@ class Project(ExternalStore, abc.ABC):
|
|
|
116
119
|
tbl_version.schema_version = tbl_version.version
|
|
117
120
|
proxy_cols = [self.create_stored_proxy(tbl_version, col) for col in stored_proxies_needed]
|
|
118
121
|
# Add the columns; this will also update table metadata.
|
|
119
|
-
tbl_version._add_columns(proxy_cols, conn)
|
|
122
|
+
tbl_version._add_columns(proxy_cols, conn, print_stats=False, on_error='ignore')
|
|
120
123
|
# We don't need to retain `UpdateStatus` since the stored proxies are intended to be
|
|
121
124
|
# invisible to the user.
|
|
122
125
|
tbl_version._update_md(time.time(), conn, preceding_schema_version=preceding_schema_version)
|
|
@@ -126,7 +129,7 @@ class Project(ExternalStore, abc.ABC):
|
|
|
126
129
|
# any *other* external store for this table.)
|
|
127
130
|
deletions_needed: set[Column] = set(self.stored_proxies.values())
|
|
128
131
|
for name, store in tbl_version.external_stores.items():
|
|
129
|
-
if name != self.name:
|
|
132
|
+
if isinstance(store, Project) and name != self.name:
|
|
130
133
|
deletions_needed = deletions_needed.difference(set(store.stored_proxies.values()))
|
|
131
134
|
if len(deletions_needed) > 0:
|
|
132
135
|
_logger.info(f'Removing stored proxies for columns: {[col.name for col in deletions_needed]}')
|
|
@@ -210,6 +213,8 @@ class Project(ExternalStore, abc.ABC):
|
|
|
210
213
|
If validation fails, an exception will be raised. If validation succeeds, a new mapping will be returned
|
|
211
214
|
in which the Pixeltable column names are resolved to the corresponding `Column` objects.
|
|
212
215
|
"""
|
|
216
|
+
from pixeltable import exprs
|
|
217
|
+
|
|
213
218
|
is_user_specified_col_mapping = col_mapping is not None
|
|
214
219
|
if col_mapping is None:
|
|
215
220
|
col_mapping = {col: col for col in itertools.chain(export_cols.keys(), import_cols.keys())}
|
|
@@ -235,8 +240,9 @@ class Project(ExternalStore, abc.ABC):
|
|
|
235
240
|
f'Column name `{ext_col}` appears as a value in `col_mapping`, but the external store '
|
|
236
241
|
f'configuration has no column `{ext_col}`.'
|
|
237
242
|
)
|
|
238
|
-
|
|
239
|
-
|
|
243
|
+
col_ref = table[t_col]
|
|
244
|
+
assert isinstance(col_ref, exprs.ColumnRef)
|
|
245
|
+
resolved_col_mapping[col_ref.col] = ext_col
|
|
240
246
|
# Validate column specs
|
|
241
247
|
t_col_types = table._schema
|
|
242
248
|
for t_col, ext_col in col_mapping.items():
|
|
@@ -329,7 +335,7 @@ class MockProject(Project):
|
|
|
329
335
|
def get_import_columns(self) -> dict[str, ts.ColumnType]:
|
|
330
336
|
return self.import_cols
|
|
331
337
|
|
|
332
|
-
def sync(self, t: Table, export_data: bool, import_data: bool) ->
|
|
338
|
+
def sync(self, t: Table, export_data: bool, import_data: bool) -> SyncStatus:
|
|
333
339
|
raise NotImplementedError()
|
|
334
340
|
|
|
335
341
|
def delete(self) -> None:
|