pixeltable 0.2.27__py3-none-any.whl → 0.2.29__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (50) hide show
  1. pixeltable/__version__.py +2 -2
  2. pixeltable/catalog/__init__.py +1 -1
  3. pixeltable/catalog/dir.py +6 -0
  4. pixeltable/catalog/globals.py +13 -0
  5. pixeltable/catalog/named_function.py +4 -0
  6. pixeltable/catalog/path_dict.py +37 -11
  7. pixeltable/catalog/schema_object.py +6 -0
  8. pixeltable/catalog/table.py +22 -5
  9. pixeltable/catalog/table_version.py +22 -8
  10. pixeltable/dataframe.py +201 -3
  11. pixeltable/env.py +9 -3
  12. pixeltable/exec/expr_eval_node.py +1 -1
  13. pixeltable/exec/sql_node.py +2 -2
  14. pixeltable/exprs/expr.py +1 -0
  15. pixeltable/exprs/function_call.py +134 -24
  16. pixeltable/exprs/inline_expr.py +22 -2
  17. pixeltable/exprs/row_builder.py +1 -1
  18. pixeltable/exprs/similarity_expr.py +9 -2
  19. pixeltable/func/aggregate_function.py +148 -68
  20. pixeltable/func/callable_function.py +49 -13
  21. pixeltable/func/expr_template_function.py +55 -24
  22. pixeltable/func/function.py +183 -22
  23. pixeltable/func/function_registry.py +2 -1
  24. pixeltable/func/query_template_function.py +11 -6
  25. pixeltable/func/signature.py +64 -7
  26. pixeltable/func/udf.py +57 -35
  27. pixeltable/functions/globals.py +54 -34
  28. pixeltable/functions/json.py +3 -8
  29. pixeltable/functions/ollama.py +4 -4
  30. pixeltable/functions/timestamp.py +1 -1
  31. pixeltable/functions/video.py +3 -9
  32. pixeltable/functions/vision.py +1 -1
  33. pixeltable/globals.py +218 -59
  34. pixeltable/index/embedding_index.py +44 -24
  35. pixeltable/metadata/__init__.py +1 -1
  36. pixeltable/metadata/converters/convert_16.py +2 -1
  37. pixeltable/metadata/converters/convert_17.py +2 -1
  38. pixeltable/metadata/converters/convert_23.py +35 -0
  39. pixeltable/metadata/converters/convert_24.py +47 -0
  40. pixeltable/metadata/converters/util.py +4 -2
  41. pixeltable/metadata/notes.py +2 -0
  42. pixeltable/metadata/schema.py +1 -0
  43. pixeltable/tool/create_test_db_dump.py +11 -0
  44. pixeltable/tool/doc_plugins/griffe.py +4 -3
  45. pixeltable/type_system.py +182 -47
  46. {pixeltable-0.2.27.dist-info → pixeltable-0.2.29.dist-info}/METADATA +3 -2
  47. {pixeltable-0.2.27.dist-info → pixeltable-0.2.29.dist-info}/RECORD +50 -48
  48. {pixeltable-0.2.27.dist-info → pixeltable-0.2.29.dist-info}/LICENSE +0 -0
  49. {pixeltable-0.2.27.dist-info → pixeltable-0.2.29.dist-info}/WHEEL +0 -0
  50. {pixeltable-0.2.27.dist-info → pixeltable-0.2.29.dist-info}/entry_points.txt +0 -0
pixeltable/globals.py CHANGED
@@ -1,6 +1,6 @@
1
1
  import dataclasses
2
2
  import logging
3
- from typing import Any, Iterable, Optional, Union, Literal
3
+ from typing import Any, Iterable, Optional, Union, Literal, Type
4
4
  from uuid import UUID
5
5
 
6
6
  import pandas as pd
@@ -20,11 +20,68 @@ from pixeltable.utils.filecache import FileCache
20
20
 
21
21
  _logger = logging.getLogger('pixeltable')
22
22
 
23
-
24
23
  def init() -> None:
25
24
  """Initializes the Pixeltable environment."""
26
25
  _ = Catalog.get()
27
26
 
27
+ def _get_or_drop_existing_path(
28
+ path_str: str,
29
+ expected_obj_type: Type[catalog.SchemaObject],
30
+ expected_snapshot: bool,
31
+ if_exists: catalog.IfExistsParam
32
+ ) -> Optional[catalog.SchemaObject]:
33
+ """Handle schema object path collision during creation according to the if_exists parameter.
34
+
35
+ Args:
36
+ path_str: An existing and valid path to the dir, table, view, or snapshot.
37
+ expected_obj_type: Whether the caller of this function is creating a dir, table, or view at the existing path.
38
+ expected_snapshot: Whether the caller of this function is creating a snapshot at the existing path.
39
+ if_exists: Directive regarding how to handle the existing path.
40
+
41
+ Returns:
42
+ A handle to the existing dir, table, view, or snapshot, if `if_exists='ignore'`, otherwise `None`.
43
+
44
+ Raises:
45
+ Error: If the existing path is not of the expected type, or if the existing path has dependents and
46
+ `if_exists='replace'` or `if_exists='replace_force'`.
47
+ """
48
+ cat = Catalog.get()
49
+ path = catalog.Path(path_str)
50
+ assert cat.paths.get_object(path) is not None
51
+
52
+ if if_exists == catalog.IfExistsParam.ERROR:
53
+ raise excs.Error(f'Path `{path_str}` already exists.')
54
+
55
+ existing_path = cat.paths[path]
56
+ existing_path_is_snapshot = 'is_snapshot' in existing_path.get_metadata() and existing_path.get_metadata()['is_snapshot']
57
+ obj_type_str = 'Snapshot' if expected_snapshot else expected_obj_type._display_name().capitalize()
58
+ # Check if the existing path is of expected type.
59
+ if (not isinstance(existing_path, expected_obj_type)
60
+ or (expected_snapshot and not existing_path_is_snapshot)):
61
+ raise excs.Error(f'Path `{path_str}` already exists but is not a {obj_type_str}. Cannot {if_exists.name.lower()} it.')
62
+
63
+ # if_exists='ignore' return the handle to the existing object.
64
+ assert isinstance(existing_path, expected_obj_type)
65
+ if if_exists == catalog.IfExistsParam.IGNORE:
66
+ return existing_path
67
+
68
+ # Check if the existing object has dependents. If so, cannot replace it
69
+ # unless if_exists='replace_force'.
70
+ has_dependents = existing_path._has_dependents
71
+ if if_exists == catalog.IfExistsParam.REPLACE and has_dependents:
72
+ raise excs.Error(f"{obj_type_str} `{path_str}` already exists and has dependents. Use `if_exists='replace_force'` to replace it.")
73
+ else:
74
+ assert if_exists == catalog.IfExistsParam.REPLACE_FORCE or not has_dependents
75
+ # Drop the existing path so it can be replaced.
76
+ # Any errors during drop will be raised.
77
+ _logger.info(f"Dropping {obj_type_str} `{path_str}` to replace it.")
78
+ if isinstance(existing_path, catalog.Dir):
79
+ drop_dir(path_str, force=True, ignore_errors=False)
80
+ else:
81
+ drop_table(path_str, force=True, ignore_errors=False)
82
+ assert cat.paths.get_object(path) is None
83
+
84
+ return None
28
85
 
29
86
  def create_table(
30
87
  path_str: str,
@@ -33,7 +90,8 @@ def create_table(
33
90
  primary_key: Optional[Union[str, list[str]]] = None,
34
91
  num_retained_versions: int = 10,
35
92
  comment: str = '',
36
- media_validation: Literal['on_read', 'on_write'] = 'on_write'
93
+ media_validation: Literal['on_read', 'on_write'] = 'on_write',
94
+ if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error'
37
95
  ) -> catalog.Table:
38
96
  """Create a new base table.
39
97
 
@@ -46,15 +104,25 @@ def create_table(
46
104
  num_retained_versions: Number of versions of the table to retain.
47
105
  comment: An optional comment; its meaning is user-defined.
48
106
  media_validation: Media validation policy for the table.
49
-
50
107
  - `'on_read'`: validate media files at query time
51
108
  - `'on_write'`: validate media files during insert/update operations
109
+ if_exists: Directive regarding how to handle if the path already exists.
110
+ Must be one of the following:
111
+ - `'error'`: raise an error
112
+ - `'ignore'`: do nothing and return the existing table handle
113
+ - `'replace'`: if the existing table has no views, drop and replace it with a new one
114
+ - `'replace_force'`: drop the existing table and all its views, and create a new one
115
+ Default is `'error'`.
52
116
 
53
117
  Returns:
54
- A handle to the newly created [`Table`][pixeltable.Table].
118
+ A handle to the newly created table, or to an already existing table at the path when `if_exists='ignore'`.
119
+ Please note the schema of the existing table may not match the schema provided in the call.
55
120
 
56
121
  Raises:
57
- Error: if the path already exists or is invalid.
122
+ Error: if the path is invalid,
123
+ or if the path already exists and `if_exists='error'`,
124
+ or if the path already exists and is not a table,
125
+ or an error occurs while attempting to create the table.
58
126
 
59
127
  Examples:
60
128
  Create a table with an int and a string column:
@@ -66,10 +134,27 @@ def create_table(
66
134
 
67
135
  >>> tbl1 = pxt.get_table('orig_table')
68
136
  ... tbl2 = pxt.create_table('new_table', tbl1.where(tbl1.col1 < 10).select(tbl1.col2))
137
+
138
+ Create a table if it does not already exist, otherwise get the existing table:
139
+
140
+ >>> tbl = pxt.create_table('my_table', schema={'col1': pxt.Int, 'col2': pxt.String}, if_exists='ignore')
141
+
142
+ Create a table with an int and a float column, and replace any existing table:
143
+
144
+ >>> tbl = pxt.create_table('my_table', schema={'col1': pxt.Int, 'col2': pxt.Float}, if_exists='replace')
69
145
  """
70
146
  path = catalog.Path(path_str)
71
- Catalog.get().paths.check_is_valid(path, expected=None)
72
- dir = Catalog.get().paths[path.parent]
147
+ cat = Catalog.get()
148
+
149
+ if cat.paths.get_object(path) is not None:
150
+ # The table already exists. Handle it as per user directive.
151
+ _if_exists = catalog.IfExistsParam.validated(if_exists, 'if_exists')
152
+ existing_table = _get_or_drop_existing_path(path_str, catalog.InsertableTable, False, _if_exists)
153
+ if existing_table is not None:
154
+ assert isinstance(existing_table, catalog.Table)
155
+ return existing_table
156
+
157
+ dir = cat.paths[path.parent]
73
158
 
74
159
  df: Optional[DataFrame] = None
75
160
  if isinstance(schema_or_df, dict):
@@ -96,7 +181,7 @@ def create_table(
96
181
  tbl = catalog.InsertableTable._create(
97
182
  dir._id, path.name, schema, df, primary_key=primary_key, num_retained_versions=num_retained_versions,
98
183
  comment=comment, media_validation=catalog.MediaValidation.validated(media_validation, 'media_validation'))
99
- Catalog.get().paths[path] = tbl
184
+ cat.paths[path] = tbl
100
185
 
101
186
  _logger.info(f'Created table `{path_str}`.')
102
187
  return tbl
@@ -112,7 +197,7 @@ def create_view(
112
197
  num_retained_versions: int = 10,
113
198
  comment: str = '',
114
199
  media_validation: Literal['on_read', 'on_write'] = 'on_write',
115
- ignore_errors: bool = False,
200
+ if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error',
116
201
  ) -> Optional[catalog.Table]:
117
202
  """Create a view of an existing table object (which itself can be a view or a snapshot or a base table).
118
203
 
@@ -130,20 +215,45 @@ def create_view(
130
215
  the base table.
131
216
  num_retained_versions: Number of versions of the view to retain.
132
217
  comment: Optional comment for the view.
133
- ignore_errors: if True, fail silently if the path already exists or is invalid.
218
+ media_validation: Media validation policy for the view.
219
+ - `'on_read'`: validate media files at query time
220
+ - `'on_write'`: validate media files during insert/update operations
221
+ if_exists: Directive regarding how to handle if the path already exists.
222
+ Must be one of the following:
223
+ - `'error'`: raise an error
224
+ - `'ignore'`: do nothing and return the existing view handle
225
+ - `'replace'`: if the existing view has no dependents, drop and replace it with a new one
226
+ - `'replace_force'`: drop the existing view and all its dependents, and create a new one
227
+ Default is `'error'`.
134
228
 
135
229
  Returns:
136
230
  A handle to the [`Table`][pixeltable.Table] representing the newly created view. If the path already
137
- exists or is invalid and `ignore_errors=True`, returns `None`.
231
+ exists and `if_exists='ignore'`, returns a handle to the existing view. Please note the schema
232
+ or the base of the existing view may not match those provided in the call.
138
233
 
139
234
  Raises:
140
- Error: if the path already exists or is invalid and `ignore_errors=False`.
235
+ Error: if the path is invalid,
236
+ or if the path already exists and `if_exists='error'`,
237
+ or if the path already exists and is not a view,
238
+ or an error occurs while attempting to create the view.
141
239
 
142
240
  Examples:
143
241
  Create a view `my_view` of an existing table `my_table`, filtering on rows where `col1` is greater than 10:
144
242
 
145
243
  >>> tbl = pxt.get_table('my_table')
146
244
  ... view = pxt.create_view('my_view', tbl.where(tbl.col1 > 10))
245
+
246
+ Create a view `my_view` of an existing table `my_table`, filtering on rows where `col1` is greater than 10,
247
+ and if it does not already exist. Otherwise, get the existing view named `my_view`:
248
+
249
+ >>> tbl = pxt.get_table('my_table')
250
+ ... view = pxt.create_view('my_view', tbl.where(tbl.col1 > 10), if_exists='ignore')
251
+
252
+ Create a view `my_view` of an existing table `my_table`, filtering on rows where `col1` is greater than 100,
253
+ and replace any existing view named `my_view`:
254
+
255
+ >>> tbl = pxt.get_table('my_table')
256
+ ... view = pxt.create_view('my_view', tbl.where(tbl.col1 > 100), if_exists='replace_force')
147
257
  """
148
258
  where: Optional[exprs.Expr] = None
149
259
  if isinstance(base, catalog.Table):
@@ -157,15 +267,19 @@ def create_view(
157
267
  else:
158
268
  raise excs.Error('`base` must be an instance of `Table` or `DataFrame`')
159
269
  assert isinstance(base, catalog.Table) or isinstance(base, DataFrame)
270
+
160
271
  path = catalog.Path(path_str)
161
- try:
162
- Catalog.get().paths.check_is_valid(path, expected=None)
163
- except Exception as e:
164
- if ignore_errors:
165
- return None
166
- else:
167
- raise e
168
- dir = Catalog.get().paths[path.parent]
272
+ cat = Catalog.get()
273
+
274
+ if cat.paths.get_object(path) is not None:
275
+ # The view already exists. Handle it as per user directive.
276
+ _if_exists = catalog.IfExistsParam.validated(if_exists, 'if_exists')
277
+ existing_path = _get_or_drop_existing_path(path_str, catalog.View, is_snapshot, _if_exists)
278
+ if existing_path is not None:
279
+ assert isinstance(existing_path, catalog.View)
280
+ return existing_path
281
+
282
+ dir = cat.paths[path.parent]
169
283
 
170
284
  if additional_columns is None:
171
285
  additional_columns = {}
@@ -179,7 +293,7 @@ def create_view(
179
293
  is_snapshot=is_snapshot, iterator_cls=iterator_class, iterator_args=iterator_args,
180
294
  num_retained_versions=num_retained_versions, comment=comment,
181
295
  media_validation=catalog.MediaValidation.validated(media_validation, 'media_validation'))
182
- Catalog.get().paths[path] = view
296
+ cat.paths[path] = view
183
297
  _logger.info(f'Created view `{path_str}`.')
184
298
  FileCache.get().emit_eviction_warnings()
185
299
  return view
@@ -194,7 +308,7 @@ def create_snapshot(
194
308
  num_retained_versions: int = 10,
195
309
  comment: str = '',
196
310
  media_validation: Literal['on_read', 'on_write'] = 'on_write',
197
- ignore_errors: bool = False,
311
+ if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error',
198
312
  ) -> Optional[catalog.Table]:
199
313
  """Create a snapshot of an existing table object (which itself can be a view or a snapshot or a base table).
200
314
 
@@ -209,21 +323,44 @@ def create_snapshot(
209
323
  iterator: The iterator to use for this snapshot. If specified, then this snapshot will be a one-to-many view of
210
324
  the base table.
211
325
  num_retained_versions: Number of versions of the view to retain.
212
- comment: Optional comment for the view.
213
- ignore_errors: if True, fail silently if the path already exists or is invalid.
326
+ comment: Optional comment for the snapshot.
327
+ media_validation: Media validation policy for the snapshot.
328
+ - `'on_read'`: validate media files at query time
329
+ - `'on_write'`: validate media files during insert/update operations
330
+ if_exists: Directive regarding how to handle if the path already exists.
331
+ Must be one of the following:
332
+ - `'error'`: raise an error
333
+ - `'ignore'`: do nothing and return the existing snapshot handle
334
+ - `'replace'`: if the existing snapshot has no dependents, drop and replace it with a new one
335
+ - `'replace_force'`: drop the existing snapshot and all its dependents, and create a new one
336
+ Default is `'error'`.
214
337
 
215
338
  Returns:
216
- A handle to the [`Table`][pixeltable.Table] representing the newly created snapshot. If the path already
217
- exists or is invalid and `ignore_errors=True`, returns `None`.
339
+ A handle to the [`Table`][pixeltable.Table] representing the newly created snapshot, or to an already existing snapshot at the path when `if_exists='ignore'`.
340
+ Please note the schema or base of the existing snapshot may not match those provided in the call.
218
341
 
219
342
  Raises:
220
- Error: if the path already exists or is invalid and `ignore_errors=False`.
343
+ Error: if the path is invalid,
344
+ or if the path already exists and `if_exists='error'`,
345
+ or if the path already exists and is not a snapshot,
346
+ or an error occurs while attempting to create the snapshot.
221
347
 
222
348
  Examples:
223
- Create a snapshot of `my_table`:
349
+ Create a snapshot `my_snapshot` of a table `my_table`:
224
350
 
225
351
  >>> tbl = pxt.get_table('my_table')
226
352
  ... snapshot = pxt.create_snapshot('my_snapshot', tbl)
353
+
354
+ Create a snapshot `my_snapshot` of a view `my_view` with additional int column `col3`,
355
+ if `my_snapshot` does not already exist:
356
+
357
+ >>> view = pxt.get_table('my_view')
358
+ ... snapshot = pxt.create_snapshot('my_snapshot', view, additional_columns={'col3': pxt.Int}, if_exists='ignore')
359
+
360
+ Create a snapshot `my_snapshot` of a table `my_table`, and replace any existing snapshot named `my_snapshot`:
361
+
362
+ >>> tbl = pxt.get_table('my_table')
363
+ ... snapshot = pxt.create_snapshot('my_snapshot', tbl, if_exists='replace_force')
227
364
  """
228
365
  return create_view(
229
366
  path_str,
@@ -234,7 +371,7 @@ def create_snapshot(
234
371
  num_retained_versions=num_retained_versions,
235
372
  comment=comment,
236
373
  media_validation=media_validation,
237
- ignore_errors=ignore_errors,
374
+ if_exists=if_exists,
238
375
  )
239
376
 
240
377
 
@@ -376,16 +513,28 @@ def list_tables(dir_path: str = '', recursive: bool = True) -> list[str]:
376
513
  Catalog.get().paths.check_is_valid(path, expected=catalog.Dir)
377
514
  return [str(p) for p in Catalog.get().paths.get_children(path, child_type=catalog.Table, recursive=recursive)]
378
515
 
379
-
380
- def create_dir(path_str: str, ignore_errors: bool = False) -> Optional[catalog.Dir]:
516
+ def create_dir(path_str: str, if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error') -> Optional[catalog.Dir]:
381
517
  """Create a directory.
382
518
 
383
519
  Args:
384
520
  path_str: Path to the directory.
385
- ignore_errors: if `True`, will return silently instead of throwing an exception if an error occurs.
521
+ if_exists: Directive regarding how to handle if the path already exists.
522
+ Must be one of the following:
523
+ - `'error'`: raise an error
524
+ - `'ignore'`: do nothing and return the existing directory handle
525
+ - `'replace'`: if the existing directory is empty, drop it and create a new one
526
+ - `'replace_force'`: drop the existing directory and all its children, and create a new one
527
+ Default is `'error'`.
528
+
529
+ Returns:
530
+ A handle to the newly created directory, or to an already existing directory at the path when `if_exists='ignore'`.
531
+ Please note the existing directory may not be empty.
386
532
 
387
533
  Raises:
388
- Error: If the path already exists or the parent is not a directory, and `ignore_errors=False`.
534
+ Error: If the path is invalid,
535
+ or if the path already exists and `if_exists='error'`,
536
+ or if the path already exists and is not a directory,
537
+ or an error occurs while attempting to create the directory.
389
538
 
390
539
  Examples:
391
540
  >>> pxt.create_dir('my_dir')
@@ -393,31 +542,41 @@ def create_dir(path_str: str, ignore_errors: bool = False) -> Optional[catalog.D
393
542
  Create a subdirectory:
394
543
 
395
544
  >>> pxt.create_dir('my_dir.sub_dir')
545
+
546
+ Create a subdirectory only if it does not already exist, otherwise do nothing:
547
+
548
+ >>> pxt.create_dir('my_dir.sub_dir', if_exists='ignore')
549
+
550
+ Create a directory and replace if it already exists:
551
+
552
+ >>> pxt.create_dir('my_dir', if_exists='replace_force')
396
553
  """
397
- try:
398
- path = catalog.Path(path_str)
399
- Catalog.get().paths.check_is_valid(path, expected=None)
400
- parent = Catalog.get().paths[path.parent]
401
- assert parent is not None
402
- with orm.Session(Env.get().engine, future=True) as session:
403
- dir_md = schema.DirMd(name=path.name)
404
- dir_record = schema.Dir(parent_id=parent._id, md=dataclasses.asdict(dir_md))
405
- session.add(dir_record)
406
- session.flush()
407
- assert dir_record.id is not None
408
- assert isinstance(dir_record.id, UUID)
409
- dir = catalog.Dir(dir_record.id, parent._id, path.name)
410
- Catalog.get().paths[path] = dir
411
- session.commit()
412
- _logger.info(f'Created directory `{path_str}`.')
413
- print(f'Created directory `{path_str}`.')
414
- return dir
415
- except excs.Error as e:
416
- if ignore_errors:
417
- return None
418
- else:
419
- raise e
554
+ path = catalog.Path(path_str)
555
+ cat = Catalog.get()
420
556
 
557
+ if cat.paths.get_object(path):
558
+ # The directory already exists. Handle it as per user directive.
559
+ _if_exists = catalog.IfExistsParam.validated(if_exists, 'if_exists')
560
+ existing_path = _get_or_drop_existing_path(path_str, catalog.Dir, False, _if_exists)
561
+ if existing_path is not None:
562
+ assert isinstance(existing_path, catalog.Dir)
563
+ return existing_path
564
+
565
+ parent = cat.paths[path.parent]
566
+ assert parent is not None
567
+ with orm.Session(Env.get().engine, future=True) as session:
568
+ dir_md = schema.DirMd(name=path.name)
569
+ dir_record = schema.Dir(parent_id=parent._id, md=dataclasses.asdict(dir_md))
570
+ session.add(dir_record)
571
+ session.flush()
572
+ assert dir_record.id is not None
573
+ assert isinstance(dir_record.id, UUID)
574
+ dir = catalog.Dir(dir_record.id, parent._id, path.name)
575
+ cat.paths[path] = dir
576
+ session.commit()
577
+ _logger.info(f'Created directory `{path_str}`.')
578
+ print(f'Created directory `{path_str}`.')
579
+ return dir
421
580
 
422
581
  def drop_dir(path_str: str, force: bool = False, ignore_errors: bool = False) -> None:
423
582
  """Remove a directory.
@@ -510,7 +669,7 @@ def list_functions() -> Styler:
510
669
  paths = ['.'.join(f.self_path.split('.')[:-1]) for f in functions]
511
670
  names = [f.name for f in functions]
512
671
  params = [
513
- ', '.join([param_name + ': ' + str(param_type) for param_name, param_type in f.signature.parameters.items()])
672
+ ', '.join([param_name + ': ' + str(param_type) for param_name, param_type in f.signatures[0].parameters.items()])
514
673
  for f in functions
515
674
  ]
516
675
  pd_df = pd.DataFrame(
@@ -518,7 +677,7 @@ def list_functions() -> Styler:
518
677
  'Path': paths,
519
678
  'Function Name': names,
520
679
  'Parameters': params,
521
- 'Return Type': [str(f.signature.get_return_type()) for f in functions],
680
+ 'Return Type': [str(f.signatures[0].get_return_type()) for f in functions],
522
681
  }
523
682
  )
524
683
  pd_df = pd_df.style.set_properties(None, **{'text-align': 'left'}).set_table_styles(
@@ -37,6 +37,14 @@ class EmbeddingIndex(IndexBase):
37
37
  Metric.L2: 'vector_l2_ops'
38
38
  }
39
39
 
40
+ metric: Metric
41
+ value_expr: exprs.FunctionCall
42
+ string_embed: Optional[func.Function]
43
+ image_embed: Optional[func.Function]
44
+ string_embed_signature_idx: int
45
+ image_embed_signature_idx: int
46
+ index_col_type: pgvector.sqlalchemy.Vector
47
+
40
48
  def __init__(
41
49
  self, c: catalog.Column, metric: str, string_embed: Optional[func.Function] = None,
42
50
  image_embed: Optional[func.Function] = None):
@@ -49,18 +57,22 @@ class EmbeddingIndex(IndexBase):
49
57
  raise excs.Error(f"Text embedding function is required for column {c.name} (parameter 'string_embed')")
50
58
  if c.col_type.is_image_type() and image_embed is None:
51
59
  raise excs.Error(f"Image embedding function is required for column {c.name} (parameter 'image_embed')")
52
- if string_embed is not None:
53
- # verify signature
54
- self._validate_embedding_fn(string_embed, 'string_embed', ts.ColumnType.Type.STRING)
55
- if image_embed is not None:
56
- # verify signature
57
- self._validate_embedding_fn(image_embed, 'image_embed', ts.ColumnType.Type.IMAGE)
60
+
61
+ if string_embed is None:
62
+ self.string_embed = None
63
+ else:
64
+ # verify signature and convert to a monomorphic function
65
+ self.string_embed = self._validate_embedding_fn(string_embed, 'string_embed', ts.ColumnType.Type.STRING)
66
+
67
+ if image_embed is None:
68
+ self.image_embed = None
69
+ else:
70
+ # verify signature and convert to a monomorphic function
71
+ self.image_embed = self._validate_embedding_fn(image_embed, 'image_embed', ts.ColumnType.Type.IMAGE)
58
72
 
59
73
  self.metric = self.Metric[metric.upper()]
60
74
  self.value_expr = string_embed(exprs.ColumnRef(c)) if c.col_type.is_string_type() else image_embed(exprs.ColumnRef(c))
61
75
  assert isinstance(self.value_expr.col_type, ts.ArrayType)
62
- self.string_embed = string_embed
63
- self.image_embed = image_embed
64
76
  vector_size = self.value_expr.col_type.shape[0]
65
77
  assert vector_size is not None
66
78
  self.index_col_type = pgvector.sqlalchemy.Vector(vector_size)
@@ -91,10 +103,10 @@ class EmbeddingIndex(IndexBase):
91
103
  assert isinstance(item, (str, PIL.Image.Image))
92
104
  if isinstance(item, str):
93
105
  assert self.string_embed is not None
94
- embedding = self.string_embed.exec(item)
106
+ embedding = self.string_embed.exec([item], {})
95
107
  if isinstance(item, PIL.Image.Image):
96
108
  assert self.image_embed is not None
97
- embedding = self.image_embed.exec(item)
109
+ embedding = self.image_embed.exec([item], {})
98
110
 
99
111
  if self.metric == self.Metric.COSINE:
100
112
  return val_column.sa_col.cosine_distance(embedding) * -1 + 1
@@ -110,10 +122,10 @@ class EmbeddingIndex(IndexBase):
110
122
  embedding: Optional[np.ndarray] = None
111
123
  if isinstance(item, str):
112
124
  assert self.string_embed is not None
113
- embedding = self.string_embed.exec(item)
125
+ embedding = self.string_embed.exec([item], {})
114
126
  if isinstance(item, PIL.Image.Image):
115
127
  assert self.image_embed is not None
116
- embedding = self.image_embed.exec(item)
128
+ embedding = self.image_embed.exec([item], {})
117
129
  assert embedding is not None
118
130
 
119
131
  if self.metric == self.Metric.COSINE:
@@ -132,27 +144,33 @@ class EmbeddingIndex(IndexBase):
132
144
  return 'embedding'
133
145
 
134
146
  @classmethod
135
- def _validate_embedding_fn(cls, embed_fn: func.Function, name: str, expected_type: ts.ColumnType.Type) -> None:
136
- """Validate the signature"""
147
+ def _validate_embedding_fn(cls, embed_fn: func.Function, name: str, expected_type: ts.ColumnType.Type) -> func.Function:
148
+ """Validate that the Function has a matching signature, and return the corresponding monomorphic function."""
137
149
  assert isinstance(embed_fn, func.Function)
138
- sig = embed_fn.signature
139
150
 
140
- # The embedding function must be a 1-ary function of the correct type. But it's ok if the function signature
141
- # has more than one parameter, as long as it has at most one *required* parameter.
142
- if (len(sig.parameters) == 0
143
- or len(sig.required_parameters) > 1
144
- or sig.parameters_by_pos[0].col_type.type_enum != expected_type):
145
- raise excs.Error(
146
- f'{name} must take a single {expected_type.name.lower()} parameter, but has signature {sig}')
151
+ signature_idx: int = -1
152
+ for idx, sig in enumerate(embed_fn.signatures):
153
+ # The embedding function must be a 1-ary function of the correct type. But it's ok if the function signature
154
+ # has more than one parameter, as long as it has at most one *required* parameter.
155
+ if (len(sig.parameters) >= 1
156
+ and len(sig.required_parameters) <= 1
157
+ and sig.parameters_by_pos[0].col_type.type_enum == expected_type):
158
+ signature_idx = idx
159
+ break
160
+
161
+ if signature_idx == -1:
162
+ raise excs.Error(f'{name} must take a single {expected_type.name.lower()} parameter')
163
+
164
+ resolved_fn = embed_fn._resolved_fns[signature_idx]
147
165
 
148
166
  # validate return type
149
167
  param_name = sig.parameters_by_pos[0].name
150
168
  if expected_type == ts.ColumnType.Type.STRING:
151
- return_type = embed_fn.call_return_type({param_name: 'dummy'})
169
+ return_type = resolved_fn.call_return_type([], {param_name: 'dummy'})
152
170
  else:
153
171
  assert expected_type == ts.ColumnType.Type.IMAGE
154
172
  img = PIL.Image.new('RGB', (512, 512))
155
- return_type = embed_fn.call_return_type({param_name: img})
173
+ return_type = resolved_fn.call_return_type([], {param_name: img})
156
174
  assert return_type is not None
157
175
  if not isinstance(return_type, ts.ArrayType):
158
176
  raise excs.Error(f'{name} must return an array, but returns {return_type}')
@@ -161,6 +179,8 @@ class EmbeddingIndex(IndexBase):
161
179
  if len(shape) != 1 or shape[0] == None:
162
180
  raise excs.Error(f'{name} must return a 1D array of a specific length, but returns {return_type}')
163
181
 
182
+ return resolved_fn
183
+
164
184
  def as_dict(self) -> dict:
165
185
  return {
166
186
  'metric': self.metric.name.lower(),
@@ -10,7 +10,7 @@ import sqlalchemy.orm as orm
10
10
  from .schema import SystemInfo, SystemInfoMd
11
11
 
12
12
  # current version of the metadata; this is incremented whenever the metadata schema changes
13
- VERSION = 23
13
+ VERSION = 25
14
14
 
15
15
 
16
16
  def create_system_info(engine: sql.engine.Engine) -> None:
@@ -1,3 +1,4 @@
1
+ from uuid import UUID
1
2
  import sqlalchemy as sql
2
3
 
3
4
  from pixeltable.metadata import register_converter
@@ -12,7 +13,7 @@ def _(engine: sql.engine.Engine) -> None:
12
13
  )
13
14
 
14
15
 
15
- def __update_table_md(table_md: dict) -> None:
16
+ def __update_table_md(table_md: dict, table_id: UUID) -> None:
16
17
  # External stores are not migratable; just drop them
17
18
  del table_md['remotes']
18
19
  table_md['external_stores'] = {}
@@ -1,3 +1,4 @@
1
+ from uuid import UUID
1
2
  import sqlalchemy as sql
2
3
 
3
4
  from pixeltable.metadata import register_converter
@@ -12,7 +13,7 @@ def _(engine: sql.engine.Engine) -> None:
12
13
  )
13
14
 
14
15
 
15
- def __update_table_md(table_md: dict) -> None:
16
+ def __update_table_md(table_md: dict, table_id: UUID) -> None:
16
17
  # key changes in IndexMd.init_args: img_embed -> image_embed, txt_embed -> string_embed
17
18
  if len(table_md['index_md']) == 0:
18
19
  return
@@ -0,0 +1,35 @@
1
+ import logging
2
+ from typing import Any, Optional
3
+ from uuid import UUID
4
+ import sqlalchemy as sql
5
+
6
+ from pixeltable.metadata import register_converter
7
+ from pixeltable.metadata.converters.util import convert_table_md
8
+ from pixeltable.metadata.schema import Table
9
+
10
+ _logger = logging.getLogger('pixeltable')
11
+
12
+ @register_converter(version=23)
13
+ def _(engine: sql.engine.Engine) -> None:
14
+ convert_table_md(
15
+ engine,
16
+ table_md_updater=__update_table_md
17
+ )
18
+
19
+ def __update_table_md(table_md: dict, table_id: UUID) -> None:
20
+ """update the index metadata to add indexed_col_tbl_id column if it is missing
21
+
22
+ Args:
23
+ table_md (dict): copy of the original table metadata. this gets updated in place.
24
+ table_id (UUID): the table id
25
+
26
+ """
27
+ if len(table_md['index_md']) == 0:
28
+ return
29
+ for idx_md in table_md['index_md'].values():
30
+ if 'indexed_col_tbl_id' not in idx_md:
31
+ # index metadata is missing indexed_col_tbl_id
32
+ # assume that the indexed column is in the same table
33
+ # and update the index metadata.
34
+ _logger.info(f'Updating index metadata for table: {table_id} index: {idx_md["id"]}')
35
+ idx_md['indexed_col_tbl_id'] = str(table_id)