pixeltable 0.2.27__py3-none-any.whl → 0.2.29__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +1 -1
- pixeltable/catalog/dir.py +6 -0
- pixeltable/catalog/globals.py +13 -0
- pixeltable/catalog/named_function.py +4 -0
- pixeltable/catalog/path_dict.py +37 -11
- pixeltable/catalog/schema_object.py +6 -0
- pixeltable/catalog/table.py +22 -5
- pixeltable/catalog/table_version.py +22 -8
- pixeltable/dataframe.py +201 -3
- pixeltable/env.py +9 -3
- pixeltable/exec/expr_eval_node.py +1 -1
- pixeltable/exec/sql_node.py +2 -2
- pixeltable/exprs/expr.py +1 -0
- pixeltable/exprs/function_call.py +134 -24
- pixeltable/exprs/inline_expr.py +22 -2
- pixeltable/exprs/row_builder.py +1 -1
- pixeltable/exprs/similarity_expr.py +9 -2
- pixeltable/func/aggregate_function.py +148 -68
- pixeltable/func/callable_function.py +49 -13
- pixeltable/func/expr_template_function.py +55 -24
- pixeltable/func/function.py +183 -22
- pixeltable/func/function_registry.py +2 -1
- pixeltable/func/query_template_function.py +11 -6
- pixeltable/func/signature.py +64 -7
- pixeltable/func/udf.py +57 -35
- pixeltable/functions/globals.py +54 -34
- pixeltable/functions/json.py +3 -8
- pixeltable/functions/ollama.py +4 -4
- pixeltable/functions/timestamp.py +1 -1
- pixeltable/functions/video.py +3 -9
- pixeltable/functions/vision.py +1 -1
- pixeltable/globals.py +218 -59
- pixeltable/index/embedding_index.py +44 -24
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_16.py +2 -1
- pixeltable/metadata/converters/convert_17.py +2 -1
- pixeltable/metadata/converters/convert_23.py +35 -0
- pixeltable/metadata/converters/convert_24.py +47 -0
- pixeltable/metadata/converters/util.py +4 -2
- pixeltable/metadata/notes.py +2 -0
- pixeltable/metadata/schema.py +1 -0
- pixeltable/tool/create_test_db_dump.py +11 -0
- pixeltable/tool/doc_plugins/griffe.py +4 -3
- pixeltable/type_system.py +182 -47
- {pixeltable-0.2.27.dist-info → pixeltable-0.2.29.dist-info}/METADATA +3 -2
- {pixeltable-0.2.27.dist-info → pixeltable-0.2.29.dist-info}/RECORD +50 -48
- {pixeltable-0.2.27.dist-info → pixeltable-0.2.29.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.27.dist-info → pixeltable-0.2.29.dist-info}/WHEEL +0 -0
- {pixeltable-0.2.27.dist-info → pixeltable-0.2.29.dist-info}/entry_points.txt +0 -0
pixeltable/globals.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import dataclasses
|
|
2
2
|
import logging
|
|
3
|
-
from typing import Any, Iterable, Optional, Union, Literal
|
|
3
|
+
from typing import Any, Iterable, Optional, Union, Literal, Type
|
|
4
4
|
from uuid import UUID
|
|
5
5
|
|
|
6
6
|
import pandas as pd
|
|
@@ -20,11 +20,68 @@ from pixeltable.utils.filecache import FileCache
|
|
|
20
20
|
|
|
21
21
|
_logger = logging.getLogger('pixeltable')
|
|
22
22
|
|
|
23
|
-
|
|
24
23
|
def init() -> None:
|
|
25
24
|
"""Initializes the Pixeltable environment."""
|
|
26
25
|
_ = Catalog.get()
|
|
27
26
|
|
|
27
|
+
def _get_or_drop_existing_path(
|
|
28
|
+
path_str: str,
|
|
29
|
+
expected_obj_type: Type[catalog.SchemaObject],
|
|
30
|
+
expected_snapshot: bool,
|
|
31
|
+
if_exists: catalog.IfExistsParam
|
|
32
|
+
) -> Optional[catalog.SchemaObject]:
|
|
33
|
+
"""Handle schema object path collision during creation according to the if_exists parameter.
|
|
34
|
+
|
|
35
|
+
Args:
|
|
36
|
+
path_str: An existing and valid path to the dir, table, view, or snapshot.
|
|
37
|
+
expected_obj_type: The type of object (dir, table, or view) that the caller of this function is creating at the existing path.
|
|
38
|
+
expected_snapshot: Whether the caller of this function is creating a snapshot at the existing path.
|
|
39
|
+
if_exists: Directive regarding how to handle the existing path.
|
|
40
|
+
|
|
41
|
+
Returns:
|
|
42
|
+
A handle to the existing dir, table, view, or snapshot, if `if_exists='ignore'`, otherwise `None`.
|
|
43
|
+
|
|
44
|
+
Raises:
|
|
45
|
+
Error: If the existing path is not of the expected type, or if the existing path has dependents and
|
|
46
|
+
`if_exists='replace'`, or if `if_exists='error'`.
|
|
47
|
+
"""
|
|
48
|
+
cat = Catalog.get()
|
|
49
|
+
path = catalog.Path(path_str)
|
|
50
|
+
assert cat.paths.get_object(path) is not None
|
|
51
|
+
|
|
52
|
+
if if_exists == catalog.IfExistsParam.ERROR:
|
|
53
|
+
raise excs.Error(f'Path `{path_str}` already exists.')
|
|
54
|
+
|
|
55
|
+
existing_path = cat.paths[path]
|
|
56
|
+
existing_path_is_snapshot = 'is_snapshot' in existing_path.get_metadata() and existing_path.get_metadata()['is_snapshot']
|
|
57
|
+
obj_type_str = 'Snapshot' if expected_snapshot else expected_obj_type._display_name().capitalize()
|
|
58
|
+
# Check if the existing path is of expected type.
|
|
59
|
+
if (not isinstance(existing_path, expected_obj_type)
|
|
60
|
+
or (expected_snapshot and not existing_path_is_snapshot)):
|
|
61
|
+
raise excs.Error(f'Path `{path_str}` already exists but is not a {obj_type_str}. Cannot {if_exists.name.lower()} it.')
|
|
62
|
+
|
|
63
|
+
# if_exists='ignore' return the handle to the existing object.
|
|
64
|
+
assert isinstance(existing_path, expected_obj_type)
|
|
65
|
+
if if_exists == catalog.IfExistsParam.IGNORE:
|
|
66
|
+
return existing_path
|
|
67
|
+
|
|
68
|
+
# Check if the existing object has dependents. If so, cannot replace it
|
|
69
|
+
# unless if_exists='replace_force'.
|
|
70
|
+
has_dependents = existing_path._has_dependents
|
|
71
|
+
if if_exists == catalog.IfExistsParam.REPLACE and has_dependents:
|
|
72
|
+
raise excs.Error(f"{obj_type_str} `{path_str}` already exists and has dependents. Use `if_exists='replace_force'` to replace it.")
|
|
73
|
+
else:
|
|
74
|
+
assert if_exists == catalog.IfExistsParam.REPLACE_FORCE or not has_dependents
|
|
75
|
+
# Drop the existing path so it can be replaced.
|
|
76
|
+
# Any errors during drop will be raised.
|
|
77
|
+
_logger.info(f"Dropping {obj_type_str} `{path_str}` to replace it.")
|
|
78
|
+
if isinstance(existing_path, catalog.Dir):
|
|
79
|
+
drop_dir(path_str, force=True, ignore_errors=False)
|
|
80
|
+
else:
|
|
81
|
+
drop_table(path_str, force=True, ignore_errors=False)
|
|
82
|
+
assert cat.paths.get_object(path) is None
|
|
83
|
+
|
|
84
|
+
return None
|
|
28
85
|
|
|
29
86
|
def create_table(
|
|
30
87
|
path_str: str,
|
|
@@ -33,7 +90,8 @@ def create_table(
|
|
|
33
90
|
primary_key: Optional[Union[str, list[str]]] = None,
|
|
34
91
|
num_retained_versions: int = 10,
|
|
35
92
|
comment: str = '',
|
|
36
|
-
media_validation: Literal['on_read', 'on_write'] = 'on_write'
|
|
93
|
+
media_validation: Literal['on_read', 'on_write'] = 'on_write',
|
|
94
|
+
if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error'
|
|
37
95
|
) -> catalog.Table:
|
|
38
96
|
"""Create a new base table.
|
|
39
97
|
|
|
@@ -46,15 +104,25 @@ def create_table(
|
|
|
46
104
|
num_retained_versions: Number of versions of the table to retain.
|
|
47
105
|
comment: An optional comment; its meaning is user-defined.
|
|
48
106
|
media_validation: Media validation policy for the table.
|
|
49
|
-
|
|
50
107
|
- `'on_read'`: validate media files at query time
|
|
51
108
|
- `'on_write'`: validate media files during insert/update operations
|
|
109
|
+
if_exists: Directive regarding how to handle if the path already exists.
|
|
110
|
+
Must be one of the following:
|
|
111
|
+
- `'error'`: raise an error
|
|
112
|
+
- `'ignore'`: do nothing and return the existing table handle
|
|
113
|
+
- `'replace'`: if the existing table has no views, drop and replace it with a new one
|
|
114
|
+
- `'replace_force'`: drop the existing table and all its views, and create a new one
|
|
115
|
+
Default is `'error'`.
|
|
52
116
|
|
|
53
117
|
Returns:
|
|
54
|
-
A handle to the newly created
|
|
118
|
+
A handle to the newly created table, or to an already existing table at the path when `if_exists='ignore'`.
|
|
119
|
+
Please note the schema of the existing table may not match the schema provided in the call.
|
|
55
120
|
|
|
56
121
|
Raises:
|
|
57
|
-
Error: if the path
|
|
122
|
+
Error: if the path is invalid,
|
|
123
|
+
or if the path already exists and `if_exists='error'`,
|
|
124
|
+
or if the path already exists and is not a table,
|
|
125
|
+
or an error occurs while attempting to create the table.
|
|
58
126
|
|
|
59
127
|
Examples:
|
|
60
128
|
Create a table with an int and a string column:
|
|
@@ -66,10 +134,27 @@ def create_table(
|
|
|
66
134
|
|
|
67
135
|
>>> tbl1 = pxt.get_table('orig_table')
|
|
68
136
|
... tbl2 = pxt.create_table('new_table', tbl1.where(tbl1.col1 < 10).select(tbl1.col2))
|
|
137
|
+
|
|
138
|
+
Create a table if it does not already exist, otherwise get the existing table:
|
|
139
|
+
|
|
140
|
+
>>> tbl = pxt.create_table('my_table', schema={'col1': pxt.Int, 'col2': pxt.String}, if_exists='ignore')
|
|
141
|
+
|
|
142
|
+
Create a table with an int and a float column, and replace any existing table:
|
|
143
|
+
|
|
144
|
+
>>> tbl = pxt.create_table('my_table', schema={'col1': pxt.Int, 'col2': pxt.Float}, if_exists='replace')
|
|
69
145
|
"""
|
|
70
146
|
path = catalog.Path(path_str)
|
|
71
|
-
Catalog.get()
|
|
72
|
-
|
|
147
|
+
cat = Catalog.get()
|
|
148
|
+
|
|
149
|
+
if cat.paths.get_object(path) is not None:
|
|
150
|
+
# The table already exists. Handle it as per user directive.
|
|
151
|
+
_if_exists = catalog.IfExistsParam.validated(if_exists, 'if_exists')
|
|
152
|
+
existing_table = _get_or_drop_existing_path(path_str, catalog.InsertableTable, False, _if_exists)
|
|
153
|
+
if existing_table is not None:
|
|
154
|
+
assert isinstance(existing_table, catalog.Table)
|
|
155
|
+
return existing_table
|
|
156
|
+
|
|
157
|
+
dir = cat.paths[path.parent]
|
|
73
158
|
|
|
74
159
|
df: Optional[DataFrame] = None
|
|
75
160
|
if isinstance(schema_or_df, dict):
|
|
@@ -96,7 +181,7 @@ def create_table(
|
|
|
96
181
|
tbl = catalog.InsertableTable._create(
|
|
97
182
|
dir._id, path.name, schema, df, primary_key=primary_key, num_retained_versions=num_retained_versions,
|
|
98
183
|
comment=comment, media_validation=catalog.MediaValidation.validated(media_validation, 'media_validation'))
|
|
99
|
-
|
|
184
|
+
cat.paths[path] = tbl
|
|
100
185
|
|
|
101
186
|
_logger.info(f'Created table `{path_str}`.')
|
|
102
187
|
return tbl
|
|
@@ -112,7 +197,7 @@ def create_view(
|
|
|
112
197
|
num_retained_versions: int = 10,
|
|
113
198
|
comment: str = '',
|
|
114
199
|
media_validation: Literal['on_read', 'on_write'] = 'on_write',
|
|
115
|
-
|
|
200
|
+
if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error',
|
|
116
201
|
) -> Optional[catalog.Table]:
|
|
117
202
|
"""Create a view of an existing table object (which itself can be a view or a snapshot or a base table).
|
|
118
203
|
|
|
@@ -130,20 +215,45 @@ def create_view(
|
|
|
130
215
|
the base table.
|
|
131
216
|
num_retained_versions: Number of versions of the view to retain.
|
|
132
217
|
comment: Optional comment for the view.
|
|
133
|
-
|
|
218
|
+
media_validation: Media validation policy for the view.
|
|
219
|
+
- `'on_read'`: validate media files at query time
|
|
220
|
+
- `'on_write'`: validate media files during insert/update operations
|
|
221
|
+
if_exists: Directive regarding how to handle if the path already exists.
|
|
222
|
+
Must be one of the following:
|
|
223
|
+
- `'error'`: raise an error
|
|
224
|
+
- `'ignore'`: do nothing and return the existing view handle
|
|
225
|
+
- `'replace'`: if the existing view has no dependents, drop and replace it with a new one
|
|
226
|
+
- `'replace_force'`: drop the existing view and all its dependents, and create a new one
|
|
227
|
+
Default is `'error'`.
|
|
134
228
|
|
|
135
229
|
Returns:
|
|
136
230
|
A handle to the [`Table`][pixeltable.Table] representing the newly created view. If the path already
|
|
137
|
-
exists
|
|
231
|
+
exists and `if_exists='ignore'`, returns a handle to the existing view. Please note the schema
|
|
232
|
+
or the base of the existing view may not match those provided in the call.
|
|
138
233
|
|
|
139
234
|
Raises:
|
|
140
|
-
Error: if the path
|
|
235
|
+
Error: if the path is invalid,
|
|
236
|
+
or if the path already exists and `if_exists='error'`,
|
|
237
|
+
or if the path already exists and is not a view,
|
|
238
|
+
or an error occurs while attempting to create the view.
|
|
141
239
|
|
|
142
240
|
Examples:
|
|
143
241
|
Create a view `my_view` of an existing table `my_table`, filtering on rows where `col1` is greater than 10:
|
|
144
242
|
|
|
145
243
|
>>> tbl = pxt.get_table('my_table')
|
|
146
244
|
... view = pxt.create_view('my_view', tbl.where(tbl.col1 > 10))
|
|
245
|
+
|
|
246
|
+
Create a view `my_view` of an existing table `my_table`, filtering on rows where `col1` is greater than 10,
|
|
247
|
+
if it does not already exist. Otherwise, get the existing view named `my_view`:
|
|
248
|
+
|
|
249
|
+
>>> tbl = pxt.get_table('my_table')
|
|
250
|
+
... view = pxt.create_view('my_view', tbl.where(tbl.col1 > 10), if_exists='ignore')
|
|
251
|
+
|
|
252
|
+
Create a view `my_view` of an existing table `my_table`, filtering on rows where `col1` is greater than 100,
|
|
253
|
+
and replace any existing view named `my_view`:
|
|
254
|
+
|
|
255
|
+
>>> tbl = pxt.get_table('my_table')
|
|
256
|
+
... view = pxt.create_view('my_view', tbl.where(tbl.col1 > 100), if_exists='replace_force')
|
|
147
257
|
"""
|
|
148
258
|
where: Optional[exprs.Expr] = None
|
|
149
259
|
if isinstance(base, catalog.Table):
|
|
@@ -157,15 +267,19 @@ def create_view(
|
|
|
157
267
|
else:
|
|
158
268
|
raise excs.Error('`base` must be an instance of `Table` or `DataFrame`')
|
|
159
269
|
assert isinstance(base, catalog.Table) or isinstance(base, DataFrame)
|
|
270
|
+
|
|
160
271
|
path = catalog.Path(path_str)
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
272
|
+
cat = Catalog.get()
|
|
273
|
+
|
|
274
|
+
if cat.paths.get_object(path) is not None:
|
|
275
|
+
# The view already exists. Handle it as per user directive.
|
|
276
|
+
_if_exists = catalog.IfExistsParam.validated(if_exists, 'if_exists')
|
|
277
|
+
existing_path = _get_or_drop_existing_path(path_str, catalog.View, is_snapshot, _if_exists)
|
|
278
|
+
if existing_path is not None:
|
|
279
|
+
assert isinstance(existing_path, catalog.View)
|
|
280
|
+
return existing_path
|
|
281
|
+
|
|
282
|
+
dir = cat.paths[path.parent]
|
|
169
283
|
|
|
170
284
|
if additional_columns is None:
|
|
171
285
|
additional_columns = {}
|
|
@@ -179,7 +293,7 @@ def create_view(
|
|
|
179
293
|
is_snapshot=is_snapshot, iterator_cls=iterator_class, iterator_args=iterator_args,
|
|
180
294
|
num_retained_versions=num_retained_versions, comment=comment,
|
|
181
295
|
media_validation=catalog.MediaValidation.validated(media_validation, 'media_validation'))
|
|
182
|
-
|
|
296
|
+
cat.paths[path] = view
|
|
183
297
|
_logger.info(f'Created view `{path_str}`.')
|
|
184
298
|
FileCache.get().emit_eviction_warnings()
|
|
185
299
|
return view
|
|
@@ -194,7 +308,7 @@ def create_snapshot(
|
|
|
194
308
|
num_retained_versions: int = 10,
|
|
195
309
|
comment: str = '',
|
|
196
310
|
media_validation: Literal['on_read', 'on_write'] = 'on_write',
|
|
197
|
-
|
|
311
|
+
if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error',
|
|
198
312
|
) -> Optional[catalog.Table]:
|
|
199
313
|
"""Create a snapshot of an existing table object (which itself can be a view or a snapshot or a base table).
|
|
200
314
|
|
|
@@ -209,21 +323,44 @@ def create_snapshot(
|
|
|
209
323
|
iterator: The iterator to use for this snapshot. If specified, then this snapshot will be a one-to-many view of
|
|
210
324
|
the base table.
|
|
211
325
|
num_retained_versions: Number of versions of the view to retain.
|
|
212
|
-
comment: Optional comment for the
|
|
213
|
-
|
|
326
|
+
comment: Optional comment for the snapshot.
|
|
327
|
+
media_validation: Media validation policy for the snapshot.
|
|
328
|
+
- `'on_read'`: validate media files at query time
|
|
329
|
+
- `'on_write'`: validate media files during insert/update operations
|
|
330
|
+
if_exists: Directive regarding how to handle if the path already exists.
|
|
331
|
+
Must be one of the following:
|
|
332
|
+
- `'error'`: raise an error
|
|
333
|
+
- `'ignore'`: do nothing and return the existing snapshot handle
|
|
334
|
+
- `'replace'`: if the existing snapshot has no dependents, drop and replace it with a new one
|
|
335
|
+
- `'replace_force'`: drop the existing snapshot and all its dependents, and create a new one
|
|
336
|
+
Default is `'error'`.
|
|
214
337
|
|
|
215
338
|
Returns:
|
|
216
|
-
A handle to the [`Table`][pixeltable.Table] representing the newly created snapshot.
|
|
217
|
-
|
|
339
|
+
A handle to the [`Table`][pixeltable.Table] representing the newly created snapshot.
|
|
340
|
+
If the path already exists and `if_exists='ignore'`, returns a handle to the existing snapshot; note that its schema or base may not match those provided in the call.
|
|
218
341
|
|
|
219
342
|
Raises:
|
|
220
|
-
Error: if the path
|
|
343
|
+
Error: if the path is invalid,
|
|
344
|
+
or if the path already exists and `if_exists='error'`,
|
|
345
|
+
or if the path already exists and is not a snapshot,
|
|
346
|
+
or an error occurs while attempting to create the snapshot.
|
|
221
347
|
|
|
222
348
|
Examples:
|
|
223
|
-
Create a snapshot of `my_table`:
|
|
349
|
+
Create a snapshot `my_snapshot` of a table `my_table`:
|
|
224
350
|
|
|
225
351
|
>>> tbl = pxt.get_table('my_table')
|
|
226
352
|
... snapshot = pxt.create_snapshot('my_snapshot', tbl)
|
|
353
|
+
|
|
354
|
+
Create a snapshot `my_snapshot` of a view `my_view` with additional int column `col3`,
|
|
355
|
+
if `my_snapshot` does not already exist:
|
|
356
|
+
|
|
357
|
+
>>> view = pxt.get_table('my_view')
|
|
358
|
+
... snapshot = pxt.create_snapshot('my_snapshot', view, additional_columns={'col3': pxt.Int}, if_exists='ignore')
|
|
359
|
+
|
|
360
|
+
Create a snapshot `my_snapshot` on a table `my_table`, and replace any existing snapshot named `my_snapshot`:
|
|
361
|
+
|
|
362
|
+
>>> tbl = pxt.get_table('my_table')
|
|
363
|
+
... snapshot = pxt.create_snapshot('my_snapshot', tbl, if_exists='replace_force')
|
|
227
364
|
"""
|
|
228
365
|
return create_view(
|
|
229
366
|
path_str,
|
|
@@ -234,7 +371,7 @@ def create_snapshot(
|
|
|
234
371
|
num_retained_versions=num_retained_versions,
|
|
235
372
|
comment=comment,
|
|
236
373
|
media_validation=media_validation,
|
|
237
|
-
|
|
374
|
+
if_exists=if_exists,
|
|
238
375
|
)
|
|
239
376
|
|
|
240
377
|
|
|
@@ -376,16 +513,28 @@ def list_tables(dir_path: str = '', recursive: bool = True) -> list[str]:
|
|
|
376
513
|
Catalog.get().paths.check_is_valid(path, expected=catalog.Dir)
|
|
377
514
|
return [str(p) for p in Catalog.get().paths.get_children(path, child_type=catalog.Table, recursive=recursive)]
|
|
378
515
|
|
|
379
|
-
|
|
380
|
-
def create_dir(path_str: str, ignore_errors: bool = False) -> Optional[catalog.Dir]:
|
|
516
|
+
def create_dir(path_str: str, if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error') -> Optional[catalog.Dir]:
|
|
381
517
|
"""Create a directory.
|
|
382
518
|
|
|
383
519
|
Args:
|
|
384
520
|
path_str: Path to the directory.
|
|
385
|
-
|
|
521
|
+
if_exists: Directive regarding how to handle if the path already exists.
|
|
522
|
+
Must be one of the following:
|
|
523
|
+
- `'error'`: raise an error
|
|
524
|
+
- `'ignore'`: do nothing and return the existing directory handle
|
|
525
|
+
- `'replace'`: if the existing directory is empty, drop it and create a new one
|
|
526
|
+
- `'replace_force'`: drop the existing directory and all its children, and create a new one
|
|
527
|
+
Default is `'error'`.
|
|
528
|
+
|
|
529
|
+
Returns:
|
|
530
|
+
A handle to the newly created directory, or to an already existing directory at the path when `if_exists='ignore'`.
|
|
531
|
+
Please note the existing directory may not be empty.
|
|
386
532
|
|
|
387
533
|
Raises:
|
|
388
|
-
Error: If the path
|
|
534
|
+
Error: If the path is invalid,
|
|
535
|
+
or if the path already exists and `if_exists='error'`,
|
|
536
|
+
or if the path already exists and is not a directory,
|
|
537
|
+
or an error occurs while attempting to create the directory.
|
|
389
538
|
|
|
390
539
|
Examples:
|
|
391
540
|
>>> pxt.create_dir('my_dir')
|
|
@@ -393,31 +542,41 @@ def create_dir(path_str: str, ignore_errors: bool = False) -> Optional[catalog.D
|
|
|
393
542
|
Create a subdirectory:
|
|
394
543
|
|
|
395
544
|
>>> pxt.create_dir('my_dir.sub_dir')
|
|
545
|
+
|
|
546
|
+
Create a subdirectory only if it does not already exist, otherwise do nothing:
|
|
547
|
+
|
|
548
|
+
>>> pxt.create_dir('my_dir.sub_dir', if_exists='ignore')
|
|
549
|
+
|
|
550
|
+
Create a directory, replacing it if it already exists:
|
|
551
|
+
|
|
552
|
+
>>> pxt.create_dir('my_dir', if_exists='replace_force')
|
|
396
553
|
"""
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
Catalog.get().paths.check_is_valid(path, expected=None)
|
|
400
|
-
parent = Catalog.get().paths[path.parent]
|
|
401
|
-
assert parent is not None
|
|
402
|
-
with orm.Session(Env.get().engine, future=True) as session:
|
|
403
|
-
dir_md = schema.DirMd(name=path.name)
|
|
404
|
-
dir_record = schema.Dir(parent_id=parent._id, md=dataclasses.asdict(dir_md))
|
|
405
|
-
session.add(dir_record)
|
|
406
|
-
session.flush()
|
|
407
|
-
assert dir_record.id is not None
|
|
408
|
-
assert isinstance(dir_record.id, UUID)
|
|
409
|
-
dir = catalog.Dir(dir_record.id, parent._id, path.name)
|
|
410
|
-
Catalog.get().paths[path] = dir
|
|
411
|
-
session.commit()
|
|
412
|
-
_logger.info(f'Created directory `{path_str}`.')
|
|
413
|
-
print(f'Created directory `{path_str}`.')
|
|
414
|
-
return dir
|
|
415
|
-
except excs.Error as e:
|
|
416
|
-
if ignore_errors:
|
|
417
|
-
return None
|
|
418
|
-
else:
|
|
419
|
-
raise e
|
|
554
|
+
path = catalog.Path(path_str)
|
|
555
|
+
cat = Catalog.get()
|
|
420
556
|
|
|
557
|
+
if cat.paths.get_object(path):
|
|
558
|
+
# The directory already exists. Handle it as per user directive.
|
|
559
|
+
_if_exists = catalog.IfExistsParam.validated(if_exists, 'if_exists')
|
|
560
|
+
existing_path = _get_or_drop_existing_path(path_str, catalog.Dir, False, _if_exists)
|
|
561
|
+
if existing_path is not None:
|
|
562
|
+
assert isinstance(existing_path, catalog.Dir)
|
|
563
|
+
return existing_path
|
|
564
|
+
|
|
565
|
+
parent = cat.paths[path.parent]
|
|
566
|
+
assert parent is not None
|
|
567
|
+
with orm.Session(Env.get().engine, future=True) as session:
|
|
568
|
+
dir_md = schema.DirMd(name=path.name)
|
|
569
|
+
dir_record = schema.Dir(parent_id=parent._id, md=dataclasses.asdict(dir_md))
|
|
570
|
+
session.add(dir_record)
|
|
571
|
+
session.flush()
|
|
572
|
+
assert dir_record.id is not None
|
|
573
|
+
assert isinstance(dir_record.id, UUID)
|
|
574
|
+
dir = catalog.Dir(dir_record.id, parent._id, path.name)
|
|
575
|
+
cat.paths[path] = dir
|
|
576
|
+
session.commit()
|
|
577
|
+
_logger.info(f'Created directory `{path_str}`.')
|
|
578
|
+
print(f'Created directory `{path_str}`.')
|
|
579
|
+
return dir
|
|
421
580
|
|
|
422
581
|
def drop_dir(path_str: str, force: bool = False, ignore_errors: bool = False) -> None:
|
|
423
582
|
"""Remove a directory.
|
|
@@ -510,7 +669,7 @@ def list_functions() -> Styler:
|
|
|
510
669
|
paths = ['.'.join(f.self_path.split('.')[:-1]) for f in functions]
|
|
511
670
|
names = [f.name for f in functions]
|
|
512
671
|
params = [
|
|
513
|
-
', '.join([param_name + ': ' + str(param_type) for param_name, param_type in f.
|
|
672
|
+
', '.join([param_name + ': ' + str(param_type) for param_name, param_type in f.signatures[0].parameters.items()])
|
|
514
673
|
for f in functions
|
|
515
674
|
]
|
|
516
675
|
pd_df = pd.DataFrame(
|
|
@@ -518,7 +677,7 @@ def list_functions() -> Styler:
|
|
|
518
677
|
'Path': paths,
|
|
519
678
|
'Function Name': names,
|
|
520
679
|
'Parameters': params,
|
|
521
|
-
'Return Type': [str(f.
|
|
680
|
+
'Return Type': [str(f.signatures[0].get_return_type()) for f in functions],
|
|
522
681
|
}
|
|
523
682
|
)
|
|
524
683
|
pd_df = pd_df.style.set_properties(None, **{'text-align': 'left'}).set_table_styles(
|
|
@@ -37,6 +37,14 @@ class EmbeddingIndex(IndexBase):
|
|
|
37
37
|
Metric.L2: 'vector_l2_ops'
|
|
38
38
|
}
|
|
39
39
|
|
|
40
|
+
metric: Metric
|
|
41
|
+
value_expr: exprs.FunctionCall
|
|
42
|
+
string_embed: Optional[func.Function]
|
|
43
|
+
image_embed: Optional[func.Function]
|
|
44
|
+
string_embed_signature_idx: int
|
|
45
|
+
image_embed_signature_idx: int
|
|
46
|
+
index_col_type: pgvector.sqlalchemy.Vector
|
|
47
|
+
|
|
40
48
|
def __init__(
|
|
41
49
|
self, c: catalog.Column, metric: str, string_embed: Optional[func.Function] = None,
|
|
42
50
|
image_embed: Optional[func.Function] = None):
|
|
@@ -49,18 +57,22 @@ class EmbeddingIndex(IndexBase):
|
|
|
49
57
|
raise excs.Error(f"Text embedding function is required for column {c.name} (parameter 'string_embed')")
|
|
50
58
|
if c.col_type.is_image_type() and image_embed is None:
|
|
51
59
|
raise excs.Error(f"Image embedding function is required for column {c.name} (parameter 'image_embed')")
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
self.
|
|
55
|
-
|
|
56
|
-
# verify signature
|
|
57
|
-
self._validate_embedding_fn(
|
|
60
|
+
|
|
61
|
+
if string_embed is None:
|
|
62
|
+
self.string_embed = None
|
|
63
|
+
else:
|
|
64
|
+
# verify signature and convert to a monomorphic function
|
|
65
|
+
self.string_embed = self._validate_embedding_fn(string_embed, 'string_embed', ts.ColumnType.Type.STRING)
|
|
66
|
+
|
|
67
|
+
if image_embed is None:
|
|
68
|
+
self.image_embed = None
|
|
69
|
+
else:
|
|
70
|
+
# verify signature and convert to a monomorphic function
|
|
71
|
+
self.image_embed = self._validate_embedding_fn(image_embed, 'image_embed', ts.ColumnType.Type.IMAGE)
|
|
58
72
|
|
|
59
73
|
self.metric = self.Metric[metric.upper()]
|
|
60
74
|
self.value_expr = string_embed(exprs.ColumnRef(c)) if c.col_type.is_string_type() else image_embed(exprs.ColumnRef(c))
|
|
61
75
|
assert isinstance(self.value_expr.col_type, ts.ArrayType)
|
|
62
|
-
self.string_embed = string_embed
|
|
63
|
-
self.image_embed = image_embed
|
|
64
76
|
vector_size = self.value_expr.col_type.shape[0]
|
|
65
77
|
assert vector_size is not None
|
|
66
78
|
self.index_col_type = pgvector.sqlalchemy.Vector(vector_size)
|
|
@@ -91,10 +103,10 @@ class EmbeddingIndex(IndexBase):
|
|
|
91
103
|
assert isinstance(item, (str, PIL.Image.Image))
|
|
92
104
|
if isinstance(item, str):
|
|
93
105
|
assert self.string_embed is not None
|
|
94
|
-
embedding = self.string_embed.exec(item)
|
|
106
|
+
embedding = self.string_embed.exec([item], {})
|
|
95
107
|
if isinstance(item, PIL.Image.Image):
|
|
96
108
|
assert self.image_embed is not None
|
|
97
|
-
embedding = self.image_embed.exec(item)
|
|
109
|
+
embedding = self.image_embed.exec([item], {})
|
|
98
110
|
|
|
99
111
|
if self.metric == self.Metric.COSINE:
|
|
100
112
|
return val_column.sa_col.cosine_distance(embedding) * -1 + 1
|
|
@@ -110,10 +122,10 @@ class EmbeddingIndex(IndexBase):
|
|
|
110
122
|
embedding: Optional[np.ndarray] = None
|
|
111
123
|
if isinstance(item, str):
|
|
112
124
|
assert self.string_embed is not None
|
|
113
|
-
embedding = self.string_embed.exec(item)
|
|
125
|
+
embedding = self.string_embed.exec([item], {})
|
|
114
126
|
if isinstance(item, PIL.Image.Image):
|
|
115
127
|
assert self.image_embed is not None
|
|
116
|
-
embedding = self.image_embed.exec(item)
|
|
128
|
+
embedding = self.image_embed.exec([item], {})
|
|
117
129
|
assert embedding is not None
|
|
118
130
|
|
|
119
131
|
if self.metric == self.Metric.COSINE:
|
|
@@ -132,27 +144,33 @@ class EmbeddingIndex(IndexBase):
|
|
|
132
144
|
return 'embedding'
|
|
133
145
|
|
|
134
146
|
@classmethod
|
|
135
|
-
def _validate_embedding_fn(cls, embed_fn: func.Function, name: str, expected_type: ts.ColumnType.Type) ->
|
|
136
|
-
"""Validate the signature"""
|
|
147
|
+
def _validate_embedding_fn(cls, embed_fn: func.Function, name: str, expected_type: ts.ColumnType.Type) -> func.Function:
|
|
148
|
+
"""Validate that the Function has a matching signature, and return the corresponding monomorphic function."""
|
|
137
149
|
assert isinstance(embed_fn, func.Function)
|
|
138
|
-
sig = embed_fn.signature
|
|
139
150
|
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
151
|
+
signature_idx: int = -1
|
|
152
|
+
for idx, sig in enumerate(embed_fn.signatures):
|
|
153
|
+
# The embedding function must be a 1-ary function of the correct type. But it's ok if the function signature
|
|
154
|
+
# has more than one parameter, as long as it has at most one *required* parameter.
|
|
155
|
+
if (len(sig.parameters) >= 1
|
|
156
|
+
and len(sig.required_parameters) <= 1
|
|
157
|
+
and sig.parameters_by_pos[0].col_type.type_enum == expected_type):
|
|
158
|
+
signature_idx = idx
|
|
159
|
+
break
|
|
160
|
+
|
|
161
|
+
if signature_idx == -1:
|
|
162
|
+
raise excs.Error(f'{name} must take a single {expected_type.name.lower()} parameter')
|
|
163
|
+
|
|
164
|
+
resolved_fn = embed_fn._resolved_fns[signature_idx]
|
|
147
165
|
|
|
148
166
|
# validate return type
|
|
149
167
|
param_name = sig.parameters_by_pos[0].name
|
|
150
168
|
if expected_type == ts.ColumnType.Type.STRING:
|
|
151
|
-
return_type =
|
|
169
|
+
return_type = resolved_fn.call_return_type([], {param_name: 'dummy'})
|
|
152
170
|
else:
|
|
153
171
|
assert expected_type == ts.ColumnType.Type.IMAGE
|
|
154
172
|
img = PIL.Image.new('RGB', (512, 512))
|
|
155
|
-
return_type =
|
|
173
|
+
return_type = resolved_fn.call_return_type([], {param_name: img})
|
|
156
174
|
assert return_type is not None
|
|
157
175
|
if not isinstance(return_type, ts.ArrayType):
|
|
158
176
|
raise excs.Error(f'{name} must return an array, but returns {return_type}')
|
|
@@ -161,6 +179,8 @@ class EmbeddingIndex(IndexBase):
|
|
|
161
179
|
if len(shape) != 1 or shape[0] == None:
|
|
162
180
|
raise excs.Error(f'{name} must return a 1D array of a specific length, but returns {return_type}')
|
|
163
181
|
|
|
182
|
+
return resolved_fn
|
|
183
|
+
|
|
164
184
|
def as_dict(self) -> dict:
|
|
165
185
|
return {
|
|
166
186
|
'metric': self.metric.name.lower(),
|
pixeltable/metadata/__init__.py
CHANGED
|
@@ -10,7 +10,7 @@ import sqlalchemy.orm as orm
|
|
|
10
10
|
from .schema import SystemInfo, SystemInfoMd
|
|
11
11
|
|
|
12
12
|
# current version of the metadata; this is incremented whenever the metadata schema changes
|
|
13
|
-
VERSION =
|
|
13
|
+
VERSION = 25
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
def create_system_info(engine: sql.engine.Engine) -> None:
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
from uuid import UUID
|
|
1
2
|
import sqlalchemy as sql
|
|
2
3
|
|
|
3
4
|
from pixeltable.metadata import register_converter
|
|
@@ -12,7 +13,7 @@ def _(engine: sql.engine.Engine) -> None:
|
|
|
12
13
|
)
|
|
13
14
|
|
|
14
15
|
|
|
15
|
-
def __update_table_md(table_md: dict) -> None:
|
|
16
|
+
def __update_table_md(table_md: dict, table_id: UUID) -> None:
|
|
16
17
|
# External stores are not migratable; just drop them
|
|
17
18
|
del table_md['remotes']
|
|
18
19
|
table_md['external_stores'] = {}
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
from uuid import UUID
|
|
1
2
|
import sqlalchemy as sql
|
|
2
3
|
|
|
3
4
|
from pixeltable.metadata import register_converter
|
|
@@ -12,7 +13,7 @@ def _(engine: sql.engine.Engine) -> None:
|
|
|
12
13
|
)
|
|
13
14
|
|
|
14
15
|
|
|
15
|
-
def __update_table_md(table_md: dict) -> None:
|
|
16
|
+
def __update_table_md(table_md: dict, table_id: UUID) -> None:
|
|
16
17
|
# key changes in IndexMd.init_args: img_embed -> image_embed, txt_embed -> string_embed
|
|
17
18
|
if len(table_md['index_md']) == 0:
|
|
18
19
|
return
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Any, Optional
|
|
3
|
+
from uuid import UUID
|
|
4
|
+
import sqlalchemy as sql
|
|
5
|
+
|
|
6
|
+
from pixeltable.metadata import register_converter
|
|
7
|
+
from pixeltable.metadata.converters.util import convert_table_md
|
|
8
|
+
from pixeltable.metadata.schema import Table
|
|
9
|
+
|
|
10
|
+
_logger = logging.getLogger('pixeltable')
|
|
11
|
+
|
|
12
|
+
@register_converter(version=23)
|
|
13
|
+
def _(engine: sql.engine.Engine) -> None:
|
|
14
|
+
convert_table_md(
|
|
15
|
+
engine,
|
|
16
|
+
table_md_updater=__update_table_md
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
def __update_table_md(table_md: dict, table_id: UUID) -> None:
|
|
20
|
+
"""update the index metadata to add indexed_col_tbl_id column if it is missing
|
|
21
|
+
|
|
22
|
+
Args:
|
|
23
|
+
table_md (dict): copy of the original table metadata. this gets updated in place.
|
|
24
|
+
table_id (UUID): the table id
|
|
25
|
+
|
|
26
|
+
"""
|
|
27
|
+
if len(table_md['index_md']) == 0:
|
|
28
|
+
return
|
|
29
|
+
for idx_md in table_md['index_md'].values():
|
|
30
|
+
if 'indexed_col_tbl_id' not in idx_md:
|
|
31
|
+
# index metadata is missing indexed_col_tbl_id
|
|
32
|
+
# assume that the indexed column is in the same table
|
|
33
|
+
# and update the index metadata.
|
|
34
|
+
_logger.info(f'Updating index metadata for table: {table_id} index: {idx_md["id"]}')
|
|
35
|
+
idx_md['indexed_col_tbl_id'] = str(table_id)
|