pixeltable 0.2.20__py3-none-any.whl → 0.2.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable has been flagged as potentially problematic. See the package registry's advisory page for more details.

Files changed (120)
  1. pixeltable/__init__.py +7 -19
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +7 -7
  4. pixeltable/catalog/column.py +37 -11
  5. pixeltable/catalog/globals.py +21 -0
  6. pixeltable/catalog/insertable_table.py +6 -4
  7. pixeltable/catalog/table.py +227 -148
  8. pixeltable/catalog/table_version.py +66 -28
  9. pixeltable/catalog/table_version_path.py +0 -8
  10. pixeltable/catalog/view.py +18 -19
  11. pixeltable/dataframe.py +16 -32
  12. pixeltable/env.py +6 -1
  13. pixeltable/exec/__init__.py +1 -2
  14. pixeltable/exec/aggregation_node.py +27 -17
  15. pixeltable/exec/cache_prefetch_node.py +1 -1
  16. pixeltable/exec/data_row_batch.py +9 -26
  17. pixeltable/exec/exec_node.py +36 -7
  18. pixeltable/exec/expr_eval_node.py +19 -11
  19. pixeltable/exec/in_memory_data_node.py +14 -11
  20. pixeltable/exec/sql_node.py +266 -138
  21. pixeltable/exprs/__init__.py +1 -0
  22. pixeltable/exprs/arithmetic_expr.py +3 -1
  23. pixeltable/exprs/array_slice.py +7 -7
  24. pixeltable/exprs/column_property_ref.py +37 -10
  25. pixeltable/exprs/column_ref.py +93 -14
  26. pixeltable/exprs/comparison.py +5 -5
  27. pixeltable/exprs/compound_predicate.py +8 -7
  28. pixeltable/exprs/data_row.py +56 -36
  29. pixeltable/exprs/expr.py +65 -63
  30. pixeltable/exprs/expr_dict.py +55 -0
  31. pixeltable/exprs/expr_set.py +26 -15
  32. pixeltable/exprs/function_call.py +53 -24
  33. pixeltable/exprs/globals.py +4 -1
  34. pixeltable/exprs/in_predicate.py +8 -7
  35. pixeltable/exprs/inline_expr.py +4 -4
  36. pixeltable/exprs/is_null.py +4 -4
  37. pixeltable/exprs/json_mapper.py +11 -12
  38. pixeltable/exprs/json_path.py +5 -10
  39. pixeltable/exprs/literal.py +5 -5
  40. pixeltable/exprs/method_ref.py +5 -4
  41. pixeltable/exprs/object_ref.py +2 -1
  42. pixeltable/exprs/row_builder.py +88 -36
  43. pixeltable/exprs/rowid_ref.py +14 -13
  44. pixeltable/exprs/similarity_expr.py +12 -7
  45. pixeltable/exprs/sql_element_cache.py +12 -6
  46. pixeltable/exprs/type_cast.py +8 -6
  47. pixeltable/exprs/variable.py +5 -4
  48. pixeltable/ext/functions/whisperx.py +7 -2
  49. pixeltable/func/aggregate_function.py +1 -1
  50. pixeltable/func/callable_function.py +2 -2
  51. pixeltable/func/function.py +11 -10
  52. pixeltable/func/function_registry.py +6 -7
  53. pixeltable/func/query_template_function.py +11 -12
  54. pixeltable/func/signature.py +17 -15
  55. pixeltable/func/udf.py +0 -4
  56. pixeltable/functions/__init__.py +2 -2
  57. pixeltable/functions/audio.py +4 -6
  58. pixeltable/functions/globals.py +84 -42
  59. pixeltable/functions/huggingface.py +31 -34
  60. pixeltable/functions/image.py +59 -45
  61. pixeltable/functions/json.py +0 -1
  62. pixeltable/functions/llama_cpp.py +106 -0
  63. pixeltable/functions/mistralai.py +2 -2
  64. pixeltable/functions/ollama.py +147 -0
  65. pixeltable/functions/openai.py +22 -25
  66. pixeltable/functions/replicate.py +72 -0
  67. pixeltable/functions/string.py +59 -50
  68. pixeltable/functions/timestamp.py +20 -20
  69. pixeltable/functions/together.py +2 -2
  70. pixeltable/functions/video.py +11 -20
  71. pixeltable/functions/whisper.py +2 -20
  72. pixeltable/globals.py +65 -74
  73. pixeltable/index/base.py +2 -2
  74. pixeltable/index/btree.py +20 -7
  75. pixeltable/index/embedding_index.py +12 -14
  76. pixeltable/io/__init__.py +1 -2
  77. pixeltable/io/external_store.py +11 -5
  78. pixeltable/io/fiftyone.py +178 -0
  79. pixeltable/io/globals.py +98 -2
  80. pixeltable/io/hf_datasets.py +1 -1
  81. pixeltable/io/label_studio.py +6 -6
  82. pixeltable/io/parquet.py +14 -13
  83. pixeltable/iterators/base.py +3 -2
  84. pixeltable/iterators/document.py +10 -8
  85. pixeltable/iterators/video.py +126 -60
  86. pixeltable/metadata/__init__.py +4 -3
  87. pixeltable/metadata/converters/convert_14.py +4 -2
  88. pixeltable/metadata/converters/convert_15.py +1 -1
  89. pixeltable/metadata/converters/convert_19.py +1 -0
  90. pixeltable/metadata/converters/convert_20.py +1 -1
  91. pixeltable/metadata/converters/convert_21.py +34 -0
  92. pixeltable/metadata/converters/util.py +54 -12
  93. pixeltable/metadata/notes.py +1 -0
  94. pixeltable/metadata/schema.py +40 -21
  95. pixeltable/plan.py +149 -165
  96. pixeltable/py.typed +0 -0
  97. pixeltable/store.py +57 -37
  98. pixeltable/tool/create_test_db_dump.py +6 -6
  99. pixeltable/tool/create_test_video.py +1 -1
  100. pixeltable/tool/doc_plugins/griffe.py +3 -34
  101. pixeltable/tool/embed_udf.py +1 -1
  102. pixeltable/tool/mypy_plugin.py +55 -0
  103. pixeltable/type_system.py +260 -61
  104. pixeltable/utils/arrow.py +10 -9
  105. pixeltable/utils/coco.py +4 -4
  106. pixeltable/utils/documents.py +16 -2
  107. pixeltable/utils/filecache.py +9 -9
  108. pixeltable/utils/formatter.py +10 -11
  109. pixeltable/utils/http_server.py +2 -5
  110. pixeltable/utils/media_store.py +6 -6
  111. pixeltable/utils/pytorch.py +10 -11
  112. pixeltable/utils/sql.py +2 -1
  113. {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/METADATA +50 -13
  114. pixeltable-0.2.22.dist-info/RECORD +153 -0
  115. pixeltable/exec/media_validation_node.py +0 -43
  116. pixeltable/utils/help.py +0 -11
  117. pixeltable-0.2.20.dist-info/RECORD +0 -147
  118. {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/LICENSE +0 -0
  119. {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/WHEEL +0 -0
  120. {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/entry_points.txt +0 -0
@@ -14,27 +14,9 @@ from pixeltable.env import Env
14
14
  if TYPE_CHECKING:
15
15
  from whisper import Whisper # type: ignore[import-untyped]
16
16
 
17
-
18
- @pxt.udf(
19
- param_types=[
20
- pxt.AudioType(),
21
- pxt.StringType(),
22
- pxt.JsonType(nullable=True),
23
- pxt.FloatType(nullable=True),
24
- pxt.FloatType(nullable=True),
25
- pxt.FloatType(nullable=True),
26
- pxt.BoolType(),
27
- pxt.StringType(nullable=True),
28
- pxt.BoolType(),
29
- pxt.StringType(),
30
- pxt.StringType(),
31
- pxt.StringType(),
32
- pxt.FloatType(nullable=True),
33
- pxt.JsonType(nullable=True),
34
- ]
35
- )
17
+ @pxt.udf
36
18
  def transcribe(
37
- audio: str,
19
+ audio: pxt.Audio,
38
20
  *,
39
21
  model: str,
40
22
  temperature: Optional[list[float]] = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
pixeltable/globals.py CHANGED
@@ -1,6 +1,6 @@
1
1
  import dataclasses
2
2
  import logging
3
- from typing import Any, Iterable, Optional, Union
3
+ from typing import Any, Iterable, Optional, Union, Literal
4
4
  from uuid import UUID
5
5
 
6
6
  import pandas as pd
@@ -33,6 +33,7 @@ def create_table(
33
33
  primary_key: Optional[Union[str, list[str]]] = None,
34
34
  num_retained_versions: int = 10,
35
35
  comment: str = '',
36
+ media_validation: Literal['on_read', 'on_write'] = 'on_write'
36
37
  ) -> catalog.Table:
37
38
  """Create a new base table.
38
39
 
@@ -44,6 +45,9 @@ def create_table(
44
45
  table.
45
46
  num_retained_versions: Number of versions of the table to retain.
46
47
  comment: An optional comment; its meaning is user-defined.
48
+ media_validation: Media validation policy for the table.
49
+ - `'on_read'`: validate media files at query time
50
+ - `'on_write'`: validate media files during insert/update operations
47
51
 
48
52
  Returns:
49
53
  A handle to the newly created [`Table`][pixeltable.Table].
@@ -54,11 +58,13 @@ def create_table(
54
58
  Examples:
55
59
  Create a table with an int and a string column:
56
60
 
57
- >>> table = pxt.create_table('my_table', schema={'col1': IntType(), 'col2': StringType()})
61
+ >>> tbl = pxt.create_table('my_table', schema={'col1': pxt.Int, 'col2': pxt.String})
58
62
 
59
- Create a table from a select statement over an existing table `tbl`:
63
+ Create a table from a select statement over an existing table `orig_table` (this will create a new table
64
+ containing the exact contents of the query):
60
65
 
61
- >>> table = pxt.create_table('my_table', tbl.where(tbl.col1 < 10).select(tbl.col2))
66
+ >>> tbl1 = pxt.get_table('orig_table')
67
+ ... tbl2 = pxt.create_table('new_table', tbl1.where(tbl1.col1 < 10).select(tbl1.col2))
62
68
  """
63
69
  path = catalog.Path(path_str)
64
70
  Catalog.get().paths.check_is_valid(path, expected=None)
@@ -87,14 +93,8 @@ def create_table(
87
93
  raise excs.Error('primary_key must be a single column name or a list of column names')
88
94
 
89
95
  tbl = catalog.InsertableTable._create(
90
- dir._id,
91
- path.name,
92
- schema,
93
- df,
94
- primary_key=primary_key,
95
- num_retained_versions=num_retained_versions,
96
- comment=comment,
97
- )
96
+ dir._id, path.name, schema, df, primary_key=primary_key, num_retained_versions=num_retained_versions,
97
+ comment=comment, media_validation=catalog.MediaValidation.validated(media_validation, 'media_validation'))
98
98
  Catalog.get().paths[path] = tbl
99
99
 
100
100
  _logger.info(f'Created table `{path_str}`.')
@@ -105,22 +105,24 @@ def create_view(
105
105
  path_str: str,
106
106
  base: Union[catalog.Table, DataFrame],
107
107
  *,
108
- schema: Optional[dict[str, Any]] = None,
109
- filter: Optional[exprs.Expr] = None,
108
+ additional_columns: Optional[dict[str, Any]] = None,
110
109
  is_snapshot: bool = False,
111
110
  iterator: Optional[tuple[type[ComponentIterator], dict[str, Any]]] = None,
112
111
  num_retained_versions: int = 10,
113
112
  comment: str = '',
113
+ media_validation: Literal['on_read', 'on_write'] = 'on_write',
114
114
  ignore_errors: bool = False,
115
115
  ) -> Optional[catalog.Table]:
116
116
  """Create a view of an existing table object (which itself can be a view or a snapshot or a base table).
117
117
 
118
118
  Args:
119
- path_str: Path to the view.
119
+ path_str: A name for the view; can be either a simple name such as `my_view`, or a pathname such as
120
+ `dir1.my_view`.
120
121
  base: [`Table`][pixeltable.Table] (i.e., table or view or snapshot) or [`DataFrame`][pixeltable.DataFrame] to
121
122
  base the view on.
122
- schema: dictionary mapping column names to column types, value expressions, or to column specifications.
123
- filter: predicate to filter rows of the base table.
123
+ additional_columns: If specified, will add these columns to the view once it is created. The format
124
+ of the `additional_columns` parameter is identical to the format of the `schema_or_df` parameter in
125
+ [`create_table`][pixeltable.create_table].
124
126
  is_snapshot: Whether the view is a snapshot.
125
127
  iterator: The iterator to use for this view. If specified, then this view will be a one-to-many view of
126
128
  the base table.
@@ -130,36 +132,29 @@ def create_view(
130
132
 
131
133
  Returns:
132
134
  A handle to the [`Table`][pixeltable.Table] representing the newly created view. If the path already
133
- exists or is invalid and `ignore_errors=True`, returns `None`.
135
+ exists or is invalid and `ignore_errors=True`, returns `None`.
134
136
 
135
137
  Raises:
136
138
  Error: if the path already exists or is invalid and `ignore_errors=False`.
137
139
 
138
140
  Examples:
139
- Create a view with an additional int and a string column and a filter:
140
-
141
- >>> view = cl.create_view(
142
- 'my_view', base, schema={'col3': IntType(), 'col4': StringType()}, filter=base.col1 > 10)
141
+ Create a view `my_view` of an existing table `my_table`, filtering on rows where `col1` is greater than 10:
143
142
 
144
- Create a table snapshot:
143
+ >>> tbl = pxt.get_table('my_table')
144
+ ... view = pxt.create_view('my_view', tbl.where(tbl.col1 > 10))
145
145
 
146
- >>> snapshot_view = cl.create_view('my_snapshot_view', base, is_snapshot=True)
146
+ Create a snapshot of `my_table`:
147
147
 
148
- Create an immutable view with additional computed columns and a filter:
149
-
150
- >>> snapshot_view = cl.create_view(
151
- 'my_snapshot', base, schema={'col3': base.col2 + 1}, filter=base.col1 > 10, is_snapshot=True)
148
+ >>> tbl = pxt.get_table('my_table')
149
+ ... snapshot_view = pxt.create_view('my_snapshot_view', tbl, is_snapshot=True)
152
150
  """
151
+ where: Optional[exprs.Expr] = None
153
152
  if isinstance(base, catalog.Table):
154
153
  tbl_version_path = base._tbl_version_path
155
154
  elif isinstance(base, DataFrame):
156
155
  base._validate_mutable('create_view')
157
156
  tbl_version_path = base.tbl
158
- if base.where_clause is not None and filter is not None:
159
- raise excs.Error(
160
- 'Cannot specify a `filter` directly if one is already declared in a `DataFrame.where` clause'
161
- )
162
- filter = base.where_clause
157
+ where = base.where_clause
163
158
  else:
164
159
  raise excs.Error('`base` must be an instance of `Table` or `DataFrame`')
165
160
  assert isinstance(base, catalog.Table) or isinstance(base, DataFrame)
@@ -173,25 +168,18 @@ def create_view(
173
168
  raise e
174
169
  dir = Catalog.get().paths[path.parent]
175
170
 
176
- if schema is None:
177
- schema = {}
171
+ if additional_columns is None:
172
+ additional_columns = {}
178
173
  if iterator is None:
179
174
  iterator_class, iterator_args = None, None
180
175
  else:
181
176
  iterator_class, iterator_args = iterator
182
177
 
183
178
  view = catalog.View._create(
184
- dir._id,
185
- path.name,
186
- base=tbl_version_path,
187
- schema=schema,
188
- predicate=filter,
189
- is_snapshot=is_snapshot,
190
- iterator_cls=iterator_class,
191
- iterator_args=iterator_args,
192
- num_retained_versions=num_retained_versions,
193
- comment=comment,
194
- )
179
+ dir._id, path.name, base=tbl_version_path, additional_columns=additional_columns, predicate=where,
180
+ is_snapshot=is_snapshot, iterator_cls=iterator_class, iterator_args=iterator_args,
181
+ num_retained_versions=num_retained_versions, comment=comment,
182
+ media_validation=catalog.MediaValidation.validated(media_validation, 'media_validation'))
195
183
  Catalog.get().paths[path] = view
196
184
  _logger.info(f'Created view `{path_str}`.')
197
185
  FileCache.get().emit_eviction_warnings()
@@ -199,7 +187,7 @@ def create_view(
199
187
 
200
188
 
201
189
  def get_table(path: str) -> catalog.Table:
202
- """Get a handle to an existing table or view or snapshot.
190
+ """Get a handle to an existing table, view, or snapshot.
203
191
 
204
192
  Args:
205
193
  path: Path to the table.
@@ -213,15 +201,15 @@ def get_table(path: str) -> catalog.Table:
213
201
  Examples:
214
202
  Get handle for a table in the top-level directory:
215
203
 
216
- >>> table = cl.get_table('my_table')
204
+ >>> tbl = pxt.get_table('my_table')
217
205
 
218
206
  For a table in a subdirectory:
219
207
 
220
- >>> table = cl.get_table('subdir.my_table')
208
+ >>> tbl = pxt.get_table('subdir.my_table')
221
209
 
222
- For a snapshot in the top-level directory:
210
+ Handles to views and snapshots are retrieved in the same way:
223
211
 
224
- >>> table = cl.get_table('my_snapshot')
212
+ >>> tbl = pxt.get_table('my_snapshot')
225
213
  """
226
214
  p = catalog.Path(path)
227
215
  Catalog.get().paths.check_is_valid(p, expected=catalog.Table)
@@ -243,11 +231,11 @@ def move(path: str, new_path: str) -> None:
243
231
  Examples:
244
232
  Move a table to a different directory:
245
233
 
246
- >>>> cl.move('dir1.my_table', 'dir2.my_table')
234
+ >>>> pxt.move('dir1.my_table', 'dir2.my_table')
247
235
 
248
236
  Rename a table:
249
237
 
250
- >>>> cl.move('dir1.my_table', 'dir1.new_name')
238
+ >>>> pxt.move('dir1.my_table', 'dir1.new_name')
251
239
  """
252
240
  p = catalog.Path(path)
253
241
  Catalog.get().paths.check_is_valid(p, expected=catalog.SchemaObject)
@@ -260,18 +248,18 @@ def move(path: str, new_path: str) -> None:
260
248
 
261
249
 
262
250
  def drop_table(path: str, force: bool = False, ignore_errors: bool = False) -> None:
263
- """Drop a table or view or snapshot.
251
+ """Drop a table, view, or snapshot.
264
252
 
265
253
  Args:
266
254
  path: Path to the [`Table`][pixeltable.Table].
267
- force: If `True`, will also drop all views or sub-views of this table.
268
- ignore_errors: Whether to ignore errors if the table does not exist.
255
+ force: If `True`, will also drop all views and sub-views of this table.
256
+ ignore_errors: If `True`, return silently if the table does not exist (without throwing an exception).
269
257
 
270
258
  Raises:
271
- Error: If the path does not exist or does not designate a table object and ignore_errors is False.
259
+ Error: If the path does not exist or does not designate a table object, and `ignore_errors=False`.
272
260
 
273
261
  Examples:
274
- >>> cl.drop_table('my_table')
262
+ >>> pxt.drop_table('my_table')
275
263
  """
276
264
  cat = Catalog.get()
277
265
  path_obj = catalog.Path(path)
@@ -302,7 +290,8 @@ def list_tables(dir_path: str = '', recursive: bool = True) -> list[str]:
302
290
 
303
291
  Args:
304
292
  dir_path: Path to the directory. Defaults to the root directory.
305
- recursive: Whether to list tables in subdirectories as well.
293
+ recursive: If `False`, returns only those tables that are directly contained in specified directory; if
294
+ `True`, returns all tables that are descendants of the specified directory, recursively.
306
295
 
307
296
  Returns:
308
297
  A list of [`Table`][pixeltable.Table] paths.
@@ -313,13 +302,11 @@ def list_tables(dir_path: str = '', recursive: bool = True) -> list[str]:
313
302
  Examples:
314
303
  List tables in top-level directory:
315
304
 
316
- >>> cl.list_tables()
317
- ['my_table', ...]
305
+ >>> pxt.list_tables()
318
306
 
319
307
  List tables in 'dir1':
320
308
 
321
- >>> cl.list_tables('dir1')
322
- [...]
309
+ >>> pxt.list_tables('dir1')
323
310
  """
324
311
  assert dir_path is not None
325
312
  path = catalog.Path(dir_path, empty_is_valid=True)
@@ -332,17 +319,17 @@ def create_dir(path_str: str, ignore_errors: bool = False) -> Optional[catalog.D
332
319
 
333
320
  Args:
334
321
  path_str: Path to the directory.
335
- ignore_errors: if True, silently returns on error
322
+ ignore_errors: if `True`, will return silently instead of throwing an exception if an error occurs.
336
323
 
337
324
  Raises:
338
- Error: If the path already exists or the parent is not a directory.
325
+ Error: If the path already exists or the parent is not a directory, and `ignore_errors=False`.
339
326
 
340
327
  Examples:
341
- >>> cl.create_dir('my_dir')
328
+ >>> pxt.create_dir('my_dir')
342
329
 
343
330
  Create a subdirectory:
344
331
 
345
- >>> cl.create_dir('my_dir.sub_dir')
332
+ >>> pxt.create_dir('my_dir.sub_dir')
346
333
  """
347
334
  try:
348
335
  path = catalog.Path(path_str)
@@ -373,17 +360,21 @@ def drop_dir(path_str: str, force: bool = False, ignore_errors: bool = False) ->
373
360
  """Remove a directory.
374
361
 
375
362
  Args:
376
- path_str: Path to the directory.
363
+ path_str: Name or path of the directory.
364
+ force: If `True`, will also drop all tables and subdirectories of this directory, recursively, along
365
+ with any views or snapshots that depend on any of the dropped tables.
366
+ ignore_errors: if `True`, will return silently instead of throwing an exception if the directory
367
+ does not exist.
377
368
 
378
369
  Raises:
379
- Error: If the path does not exist or does not designate a directory or if the directory is not empty.
370
+ Error: If the path does not exist or does not designate a directory, or if the directory is not empty.
380
371
 
381
372
  Examples:
382
- >>> cl.drop_dir('my_dir')
373
+ >>> pxt.drop_dir('my_dir')
383
374
 
384
375
  Remove a subdirectory:
385
376
 
386
- >>> cl.drop_dir('my_dir.sub_dir')
377
+ >>> pxt.drop_dir('my_dir.sub_dir')
387
378
  """
388
379
  cat = Catalog.get()
389
380
  path = catalog.Path(path_str)
@@ -428,14 +419,14 @@ def list_dirs(path_str: str = '', recursive: bool = True) -> list[str]:
428
419
  """List the directories in a directory.
429
420
 
430
421
  Args:
431
- path_str: Path to the directory.
432
- recursive: Whether to list subdirectories recursively.
422
+ path_str: Name or path of the directory.
423
+ recursive: If `True`, lists all descendants of this directory recursively.
433
424
 
434
425
  Returns:
435
426
  List of directory paths.
436
427
 
437
428
  Raises:
438
- Error: If the path does not exist or does not designate a directory.
429
+ Error: If `path_str` does not exist or does not designate a directory.
439
430
 
440
431
  Examples:
441
432
  >>> cl.list_dirs('my_dir', recursive=True)
pixeltable/index/base.py CHANGED
@@ -5,7 +5,7 @@ from typing import Any
5
5
 
6
6
  import sqlalchemy as sql
7
7
 
8
- import pixeltable.catalog as catalog
8
+ from pixeltable import catalog, exprs
9
9
 
10
10
 
11
11
  class IndexBase(abc.ABC):
@@ -22,7 +22,7 @@ class IndexBase(abc.ABC):
22
22
  pass
23
23
 
24
24
  @abc.abstractmethod
25
- def index_value_expr(self) -> 'pixeltable.exprs.Expr':
25
+ def index_value_expr(self) -> exprs.Expr:
26
26
  """Return expression that computes the value that goes into the index"""
27
27
  pass
28
28
 
pixeltable/index/btree.py CHANGED
@@ -1,13 +1,16 @@
1
- from typing import Optional
1
+ from typing import Optional, TYPE_CHECKING
2
2
 
3
3
  import sqlalchemy as sql
4
4
 
5
5
  # TODO: why does this import result in a circular import, but the one im embedding_index.py doesn't?
6
- #import pixeltable.catalog as catalog
6
+ # import pixeltable.catalog as catalog
7
7
  import pixeltable.exceptions as excs
8
- import pixeltable.func as func
8
+ from pixeltable import catalog, exprs
9
+ from pixeltable.func.udf import udf
9
10
  from .base import IndexBase
10
11
 
12
+ if TYPE_CHECKING:
13
+ import pixeltable.exprs
11
14
 
12
15
  class BtreeIndex(IndexBase):
13
16
  """
@@ -15,7 +18,10 @@ class BtreeIndex(IndexBase):
15
18
  """
16
19
  MAX_STRING_LEN = 256
17
20
 
18
- @func.udf
21
+ value_expr: 'pixeltable.exprs.Expr'
22
+
23
+ @staticmethod
24
+ @udf
19
25
  def str_filter(s: Optional[str]) -> Optional[str]:
20
26
  if s is None:
21
27
  return None
@@ -24,10 +30,16 @@ class BtreeIndex(IndexBase):
24
30
  def __init__(self, c: 'catalog.Column'):
25
31
  if not c.col_type.is_scalar_type() and not c.col_type.is_media_type():
26
32
  raise excs.Error(f'Index on column {c.name}: B-tree index requires scalar or media type, got {c.col_type}')
27
- from pixeltable.exprs import ColumnRef
28
- self.value_expr = self.str_filter(ColumnRef(c)) if c.col_type.is_string_type() else ColumnRef(c)
33
+ if c.col_type.is_media_type():
34
+ # an index on a media column is an index on the file url
35
+ # no validation for media columns: we're only interested in the string value
36
+ self.value_expr = exprs.ColumnRef(c, perform_validation=False)
37
+ else:
38
+ self.value_expr = (
39
+ BtreeIndex.str_filter(exprs.ColumnRef(c)) if c.col_type.is_string_type() else exprs.ColumnRef(c)
40
+ )
29
41
 
30
- def index_value_expr(self) -> 'pixeltable.exprs.Expr':
42
+ def index_value_expr(self) -> 'exprs.Expr':
31
43
  return self.value_expr
32
44
 
33
45
  def records_value_errors(self) -> bool:
@@ -52,3 +64,4 @@ class BtreeIndex(IndexBase):
52
64
  @classmethod
53
65
  def from_dict(cls, c: 'catalog.Column', d: dict) -> 'BtreeIndex':
54
66
  return cls(c)
67
+
@@ -1,18 +1,17 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import Optional, Any
4
3
  import enum
4
+ from typing import Any, Optional
5
5
 
6
- import PIL.Image
7
6
  import numpy as np
8
- import pgvector.sqlalchemy
7
+ import pgvector.sqlalchemy # type: ignore[import-untyped]
9
8
  import PIL.Image
10
9
  import sqlalchemy as sql
11
10
 
12
- import pixeltable.catalog as catalog
13
11
  import pixeltable.exceptions as excs
14
- import pixeltable.func as func
15
12
  import pixeltable.type_system as ts
13
+ from pixeltable import catalog, exprs, func
14
+
16
15
  from .base import IndexBase
17
16
 
18
17
 
@@ -58,16 +57,15 @@ class EmbeddingIndex(IndexBase):
58
57
  self._validate_embedding_fn(image_embed, 'image_embed', ts.ColumnType.Type.IMAGE)
59
58
 
60
59
  self.metric = self.Metric[metric.upper()]
61
- from pixeltable.exprs import ColumnRef
62
- self.value_expr = string_embed(ColumnRef(c)) if c.col_type.is_string_type() else image_embed(ColumnRef(c))
63
- assert self.value_expr.col_type.is_array_type()
60
+ self.value_expr = string_embed(exprs.ColumnRef(c)) if c.col_type.is_string_type() else image_embed(exprs.ColumnRef(c))
61
+ assert isinstance(self.value_expr.col_type, ts.ArrayType)
64
62
  self.string_embed = string_embed
65
63
  self.image_embed = image_embed
66
64
  vector_size = self.value_expr.col_type.shape[0]
67
65
  assert vector_size is not None
68
66
  self.index_col_type = pgvector.sqlalchemy.Vector(vector_size)
69
67
 
70
- def index_value_expr(self) -> 'pixeltable.exprs.Expr':
68
+ def index_value_expr(self) -> exprs.Expr:
71
69
  """Return expression that computes the value that goes into the index"""
72
70
  return self.value_expr
73
71
 
@@ -88,8 +86,8 @@ class EmbeddingIndex(IndexBase):
88
86
  )
89
87
  idx.create(bind=conn)
90
88
 
91
- def similarity_clause(self, val_column: catalog.Column, item: Any) -> sql.ClauseElement:
92
- """Create a ClauseElement that represents '<val_column> <op> <item>'"""
89
+ def similarity_clause(self, val_column: catalog.Column, item: Any) -> sql.ColumnElement:
90
+ """Create a ColumnElement that represents '<val_column> <op> <item>'"""
93
91
  assert isinstance(item, (str, PIL.Image.Image))
94
92
  if isinstance(item, str):
95
93
  assert self.string_embed is not None
@@ -106,8 +104,8 @@ class EmbeddingIndex(IndexBase):
106
104
  assert self.metric == self.Metric.L2
107
105
  return val_column.sa_col.l2_distance(embedding)
108
106
 
109
- def order_by_clause(self, val_column: catalog.Column, item: Any, is_asc: bool) -> sql.ClauseElement:
110
- """Create a ClauseElement that is used in an ORDER BY clause"""
107
+ def order_by_clause(self, val_column: catalog.Column, item: Any, is_asc: bool) -> sql.ColumnElement:
108
+ """Create a ColumnElement that is used in an ORDER BY clause"""
111
109
  assert isinstance(item, (str, PIL.Image.Image))
112
110
  embedding: Optional[np.ndarray] = None
113
111
  if isinstance(item, str):
@@ -151,7 +149,7 @@ class EmbeddingIndex(IndexBase):
151
149
  img = PIL.Image.new('RGB', (512, 512))
152
150
  return_type = embed_fn.call_return_type({param_name: img})
153
151
  assert return_type is not None
154
- if not return_type.is_array_type():
152
+ if not isinstance(return_type, ts.ArrayType):
155
153
  raise excs.Error(f'{name} must return an array, but returns {return_type}')
156
154
  else:
157
155
  shape = return_type.shape
pixeltable/io/__init__.py CHANGED
@@ -1,10 +1,9 @@
1
1
  from .external_store import ExternalStore, SyncStatus
2
- from .globals import create_label_studio_project, import_rows, import_json
2
+ from .globals import create_label_studio_project, export_images_as_fo_dataset, import_json, import_rows
3
3
  from .hf_datasets import import_huggingface_dataset
4
4
  from .pandas import import_csv, import_excel, import_pandas
5
5
  from .parquet import import_parquet
6
6
 
7
-
8
7
  __default_dir = set(symbol for symbol in dir() if not symbol.startswith('_'))
9
8
  __removed_symbols = {'globals', 'hf_datasets', 'pandas', 'parquet'}
10
9
  __all__ = sorted(list(__default_dir - __removed_symbols))
@@ -69,6 +69,9 @@ class Project(ExternalStore, abc.ABC):
69
69
  An `ExternalStore` that represents a labeling project. Extends `ExternalStore` with a few
70
70
  additional capabilities specific to such projects.
71
71
  """
72
+
73
+ stored_proxies: dict[Column, Column]
74
+
72
75
  def __init__(self, name: str, col_mapping: dict[Column, str], stored_proxies: Optional[dict[Column, Column]]):
73
76
  super().__init__(name)
74
77
  self._col_mapping = col_mapping
@@ -116,7 +119,7 @@ class Project(ExternalStore, abc.ABC):
116
119
  tbl_version.schema_version = tbl_version.version
117
120
  proxy_cols = [self.create_stored_proxy(tbl_version, col) for col in stored_proxies_needed]
118
121
  # Add the columns; this will also update table metadata.
119
- tbl_version._add_columns(proxy_cols, conn)
122
+ tbl_version._add_columns(proxy_cols, conn, print_stats=False, on_error='ignore')
120
123
  # We don't need to retain `UpdateStatus` since the stored proxies are intended to be
121
124
  # invisible to the user.
122
125
  tbl_version._update_md(time.time(), conn, preceding_schema_version=preceding_schema_version)
@@ -126,7 +129,7 @@ class Project(ExternalStore, abc.ABC):
126
129
  # any *other* external store for this table.)
127
130
  deletions_needed: set[Column] = set(self.stored_proxies.values())
128
131
  for name, store in tbl_version.external_stores.items():
129
- if name != self.name:
132
+ if isinstance(store, Project) and name != self.name:
130
133
  deletions_needed = deletions_needed.difference(set(store.stored_proxies.values()))
131
134
  if len(deletions_needed) > 0:
132
135
  _logger.info(f'Removing stored proxies for columns: {[col.name for col in deletions_needed]}')
@@ -210,6 +213,8 @@ class Project(ExternalStore, abc.ABC):
210
213
  If validation fails, an exception will be raised. If validation succeeds, a new mapping will be returned
211
214
  in which the Pixeltable column names are resolved to the corresponding `Column` objects.
212
215
  """
216
+ from pixeltable import exprs
217
+
213
218
  is_user_specified_col_mapping = col_mapping is not None
214
219
  if col_mapping is None:
215
220
  col_mapping = {col: col for col in itertools.chain(export_cols.keys(), import_cols.keys())}
@@ -235,8 +240,9 @@ class Project(ExternalStore, abc.ABC):
235
240
  f'Column name `{ext_col}` appears as a value in `col_mapping`, but the external store '
236
241
  f'configuration has no column `{ext_col}`.'
237
242
  )
238
- col = table[t_col].col
239
- resolved_col_mapping[col] = ext_col
243
+ col_ref = table[t_col]
244
+ assert isinstance(col_ref, exprs.ColumnRef)
245
+ resolved_col_mapping[col_ref.col] = ext_col
240
246
  # Validate column specs
241
247
  t_col_types = table._schema
242
248
  for t_col, ext_col in col_mapping.items():
@@ -329,7 +335,7 @@ class MockProject(Project):
329
335
  def get_import_columns(self) -> dict[str, ts.ColumnType]:
330
336
  return self.import_cols
331
337
 
332
- def sync(self, t: Table, export_data: bool, import_data: bool) -> NotImplemented:
338
+ def sync(self, t: Table, export_data: bool, import_data: bool) -> SyncStatus:
333
339
  raise NotImplementedError()
334
340
 
335
341
  def delete(self) -> None: