pixeltable 0.2.21__py3-none-any.whl → 0.2.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (82)
  1. pixeltable/__version__.py +2 -2
  2. pixeltable/catalog/__init__.py +1 -1
  3. pixeltable/catalog/column.py +37 -11
  4. pixeltable/catalog/globals.py +18 -0
  5. pixeltable/catalog/insertable_table.py +6 -4
  6. pixeltable/catalog/table.py +19 -3
  7. pixeltable/catalog/table_version.py +34 -14
  8. pixeltable/catalog/view.py +16 -17
  9. pixeltable/dataframe.py +7 -8
  10. pixeltable/env.py +5 -0
  11. pixeltable/exec/__init__.py +0 -1
  12. pixeltable/exec/aggregation_node.py +6 -3
  13. pixeltable/exec/cache_prefetch_node.py +1 -1
  14. pixeltable/exec/data_row_batch.py +2 -19
  15. pixeltable/exec/exec_node.py +2 -1
  16. pixeltable/exec/expr_eval_node.py +17 -10
  17. pixeltable/exec/in_memory_data_node.py +6 -3
  18. pixeltable/exec/sql_node.py +24 -25
  19. pixeltable/exprs/arithmetic_expr.py +3 -1
  20. pixeltable/exprs/array_slice.py +7 -7
  21. pixeltable/exprs/column_property_ref.py +37 -10
  22. pixeltable/exprs/column_ref.py +93 -14
  23. pixeltable/exprs/comparison.py +5 -5
  24. pixeltable/exprs/compound_predicate.py +8 -7
  25. pixeltable/exprs/data_row.py +27 -18
  26. pixeltable/exprs/expr.py +53 -52
  27. pixeltable/exprs/expr_set.py +5 -0
  28. pixeltable/exprs/function_call.py +32 -16
  29. pixeltable/exprs/globals.py +4 -1
  30. pixeltable/exprs/in_predicate.py +8 -7
  31. pixeltable/exprs/inline_expr.py +4 -4
  32. pixeltable/exprs/is_null.py +4 -4
  33. pixeltable/exprs/json_mapper.py +11 -12
  34. pixeltable/exprs/json_path.py +5 -10
  35. pixeltable/exprs/literal.py +5 -5
  36. pixeltable/exprs/method_ref.py +5 -4
  37. pixeltable/exprs/object_ref.py +2 -1
  38. pixeltable/exprs/row_builder.py +88 -36
  39. pixeltable/exprs/rowid_ref.py +12 -11
  40. pixeltable/exprs/similarity_expr.py +12 -7
  41. pixeltable/exprs/sql_element_cache.py +7 -5
  42. pixeltable/exprs/type_cast.py +8 -6
  43. pixeltable/exprs/variable.py +5 -4
  44. pixeltable/func/aggregate_function.py +1 -1
  45. pixeltable/func/function.py +11 -10
  46. pixeltable/functions/__init__.py +2 -2
  47. pixeltable/functions/globals.py +5 -7
  48. pixeltable/functions/huggingface.py +19 -20
  49. pixeltable/functions/llama_cpp.py +106 -0
  50. pixeltable/functions/ollama.py +147 -0
  51. pixeltable/functions/replicate.py +72 -0
  52. pixeltable/functions/string.py +9 -0
  53. pixeltable/globals.py +12 -20
  54. pixeltable/index/btree.py +16 -3
  55. pixeltable/index/embedding_index.py +4 -4
  56. pixeltable/io/__init__.py +1 -2
  57. pixeltable/io/fiftyone.py +178 -0
  58. pixeltable/io/globals.py +96 -2
  59. pixeltable/iterators/base.py +3 -2
  60. pixeltable/iterators/document.py +1 -1
  61. pixeltable/iterators/video.py +120 -63
  62. pixeltable/metadata/__init__.py +1 -1
  63. pixeltable/metadata/converters/convert_21.py +34 -0
  64. pixeltable/metadata/converters/util.py +45 -4
  65. pixeltable/metadata/notes.py +1 -0
  66. pixeltable/metadata/schema.py +8 -0
  67. pixeltable/plan.py +16 -14
  68. pixeltable/py.typed +0 -0
  69. pixeltable/store.py +7 -2
  70. pixeltable/tool/create_test_video.py +1 -1
  71. pixeltable/tool/embed_udf.py +1 -1
  72. pixeltable/tool/mypy_plugin.py +28 -5
  73. pixeltable/type_system.py +17 -1
  74. pixeltable/utils/documents.py +15 -1
  75. pixeltable/utils/formatter.py +9 -10
  76. {pixeltable-0.2.21.dist-info → pixeltable-0.2.22.dist-info}/METADATA +46 -10
  77. pixeltable-0.2.22.dist-info/RECORD +153 -0
  78. pixeltable/exec/media_validation_node.py +0 -43
  79. pixeltable-0.2.21.dist-info/RECORD +0 -148
  80. {pixeltable-0.2.21.dist-info → pixeltable-0.2.22.dist-info}/LICENSE +0 -0
  81. {pixeltable-0.2.21.dist-info → pixeltable-0.2.22.dist-info}/WHEEL +0 -0
  82. {pixeltable-0.2.21.dist-info → pixeltable-0.2.22.dist-info}/entry_points.txt +0 -0
pixeltable/__version__.py CHANGED
@@ -1,3 +1,3 @@
1
1
  # These version placeholders will be replaced during build.
2
- __version__ = "0.2.21"
3
- __version_tuple__ = (0, 2, 21)
2
+ __version__ = "0.2.22"
3
+ __version_tuple__ = (0, 2, 22)
pixeltable/catalog/__init__.py CHANGED
@@ -1,7 +1,7 @@
1
1
  from .catalog import Catalog
2
2
  from .column import Column
3
3
  from .dir import Dir
4
- from .globals import UpdateStatus, is_valid_identifier, is_valid_path
4
+ from .globals import UpdateStatus, is_valid_identifier, is_valid_path, MediaValidation
5
5
  from .insertable_table import InsertableTable
6
6
  from .named_function import NamedFunction
7
7
  from .path import Path
pixeltable/catalog/column.py CHANGED
@@ -8,24 +8,43 @@ import sqlalchemy as sql
8
8
  import pixeltable.exceptions as excs
9
9
  import pixeltable.type_system as ts
10
10
  from pixeltable import exprs
11
-
12
- from .globals import is_valid_identifier
11
+ from .globals import is_valid_identifier, MediaValidation
13
12
 
14
13
  if TYPE_CHECKING:
15
14
  from .table_version import TableVersion
16
15
 
17
16
  _logger = logging.getLogger('pixeltable')
18
17
 
18
+
19
19
  class Column:
20
20
  """Representation of a column in the schema of a Table/DataFrame.
21
21
 
22
22
  A Column contains all the metadata necessary for executing queries and updates against a particular version of a
23
23
  table/view.
24
24
  """
25
+ name: str
26
+ id: Optional[int]
27
+ col_type: ts.ColumnType
28
+ stored: bool
29
+ is_pk: bool
30
+ _media_validation: Optional[MediaValidation] # if not set, TableVersion.media_validation applies
31
+ schema_version_add: Optional[int]
32
+ schema_version_drop: Optional[int]
33
+ _records_errors: Optional[bool]
34
+ sa_col: Optional[sql.schema.Column]
35
+ sa_col_type: Optional[sql.sqltypes.TypeEngine]
36
+ sa_errormsg_col: Optional[sql.schema.Column]
37
+ sa_errortype_col: Optional[sql.schema.Column]
38
+ compute_func: Optional[Callable]
39
+ _value_expr: Optional[exprs.Expr]
40
+ value_expr_dict: Optional[dict[str, Any]]
41
+ dependent_cols: set[Column]
42
+ tbl: Optional[TableVersion]
43
+
25
44
  def __init__(
26
45
  self, name: Optional[str], col_type: Optional[ts.ColumnType] = None,
27
46
  computed_with: Optional[Union[exprs.Expr, Callable]] = None,
28
- is_pk: bool = False, stored: bool = True,
47
+ is_pk: bool = False, stored: bool = True, media_validation: Optional[MediaValidation] = None,
29
48
  col_id: Optional[int] = None, schema_version_add: Optional[int] = None,
30
49
  schema_version_drop: Optional[int] = None, sa_col_type: Optional[sql.sqltypes.TypeEngine] = None,
31
50
  records_errors: Optional[bool] = None, value_expr_dict: Optional[dict[str, Any]] = None,
@@ -61,8 +80,8 @@ class Column:
61
80
  if col_type is None and computed_with is None:
62
81
  raise excs.Error(f'Column `{name}`: col_type is required if computed_with is not specified')
63
82
 
64
- self._value_expr: Optional[exprs.Expr] = None
65
- self.compute_func: Optional[Callable] = None
83
+ self._value_expr = None
84
+ self.compute_func = None
66
85
  self.value_expr_dict = value_expr_dict
67
86
  if computed_with is not None:
68
87
  value_expr = exprs.Expr.from_object(computed_with)
@@ -86,24 +105,24 @@ class Column:
86
105
  assert self.col_type is not None
87
106
 
88
107
  self.stored = stored
89
- self.dependent_cols: set[Column] = set() # cols with value_exprs that reference us; set by TableVersion
108
+ self.dependent_cols = set() # cols with value_exprs that reference us; set by TableVersion
90
109
  self.id = col_id
91
110
  self.is_pk = is_pk
111
+ self._media_validation = media_validation
92
112
  self.schema_version_add = schema_version_add
93
113
  self.schema_version_drop = schema_version_drop
94
114
 
95
115
  self._records_errors = records_errors
96
116
 
97
117
  # column in the stored table for the values of this Column
98
- self.sa_col: Optional[sql.schema.Column] = None
118
+ self.sa_col = None
99
119
  self.sa_col_type = sa_col_type
100
120
 
101
121
  # computed cols also have storage columns for the exception string and type
102
- self.sa_errormsg_col: Optional[sql.schema.Column] = None
103
- self.sa_errortype_col: Optional[sql.schema.Column] = None
122
+ self.sa_errormsg_col = None
123
+ self.sa_errortype_col = None
104
124
 
105
- from .table_version import TableVersion
106
- self.tbl: Optional[TableVersion] = None # set by owning TableVersion
125
+ self.tbl = None # set by owning TableVersion
107
126
 
108
127
  @property
109
128
  def value_expr(self) -> Optional[exprs.Expr]:
@@ -160,6 +179,13 @@ class Column:
160
179
  assert self.tbl is not None
161
180
  return f'{self.tbl.name}.{self.name}'
162
181
 
182
+ @property
183
+ def media_validation(self) -> MediaValidation:
184
+ if self._media_validation is not None:
185
+ return self._media_validation
186
+ assert self.tbl is not None
187
+ return self.tbl.media_validation
188
+
163
189
  def source(self) -> None:
164
190
  """
165
191
  If this is a computed col and the top-level expr is a function call, print the source, if possible.
pixeltable/catalog/globals.py CHANGED
@@ -1,8 +1,12 @@
1
+ from __future__ import annotations
1
2
  import dataclasses
3
+ import enum
2
4
  import itertools
3
5
  import logging
4
6
  from typing import Optional
5
7
 
8
+ import pixeltable.exceptions as excs
9
+
6
10
  _logger = logging.getLogger('pixeltable')
7
11
 
8
12
  # name of the position column in a component view
@@ -34,6 +38,20 @@ class UpdateStatus:
34
38
  self.cols_with_excs = list(dict.fromkeys(self.cols_with_excs + other.cols_with_excs))
35
39
  return self
36
40
 
41
+
42
+ class MediaValidation(enum.Enum):
43
+ ON_READ = 0
44
+ ON_WRITE = 1
45
+
46
+ @classmethod
47
+ def validated(cls, name: str, error_prefix: str) -> MediaValidation:
48
+ try:
49
+ return cls[name.upper()]
50
+ except KeyError:
51
+ val_strs = ', '.join(f'{s.lower()!r}' for s in cls.__members__.keys())
52
+ raise excs.Error(f'{error_prefix} must be one of: [{val_strs}]')
53
+
54
+
37
55
  def is_valid_identifier(name: str) -> bool:
38
56
  return name.isidentifier() and not name.startswith('_')
39
57
 
pixeltable/catalog/insertable_table.py CHANGED
@@ -13,7 +13,7 @@ from pixeltable.env import Env
13
13
  from pixeltable.utils.filecache import FileCache
14
14
 
15
15
  from .catalog import Catalog
16
- from .globals import UpdateStatus
16
+ from .globals import UpdateStatus, MediaValidation
17
17
  from .table import Table
18
18
  from .table_version import TableVersion
19
19
  from .table_version_path import TableVersionPath
@@ -35,8 +35,8 @@ class InsertableTable(Table):
35
35
  # MODULE-LOCAL, NOT PUBLIC
36
36
  @classmethod
37
37
  def _create(
38
- cls, dir_id: UUID, name: str, schema: dict[str, ts.ColumnType], df: Optional[pxt.DataFrame], primary_key: List[str],
39
- num_retained_versions: int, comment: str
38
+ cls, dir_id: UUID, name: str, schema: dict[str, ts.ColumnType], df: Optional[pxt.DataFrame],
39
+ primary_key: List[str], num_retained_versions: int, comment: str, media_validation: MediaValidation
40
40
  ) -> InsertableTable:
41
41
  columns = cls._create_columns(schema)
42
42
  cls._verify_schema(columns)
@@ -50,7 +50,9 @@ class InsertableTable(Table):
50
50
  col.is_pk = True
51
51
 
52
52
  with orm.Session(Env.get().engine, future=True) as session:
53
- _, tbl_version = TableVersion.create(session, dir_id, name, columns, num_retained_versions, comment)
53
+ _, tbl_version = TableVersion.create(
54
+ session, dir_id, name, columns, num_retained_versions=num_retained_versions, comment=comment,
55
+ media_validation=media_validation)
54
56
  tbl = cls(dir_id, tbl_version)
55
57
  # TODO We need to commit before doing the insertion, in order to avoid a primary key (version) collision
56
58
  # when the table metadata gets updated. Once we have a notion of user-defined transactions in
pixeltable/catalog/table.py CHANGED
@@ -24,7 +24,7 @@ import pixeltable.type_system as ts
24
24
  from pixeltable.utils.filecache import FileCache
25
25
 
26
26
  from .column import Column
27
- from .globals import _ROWID_COLUMN_NAME, UpdateStatus, is_system_column_name, is_valid_identifier
27
+ from .globals import _ROWID_COLUMN_NAME, UpdateStatus, is_system_column_name, is_valid_identifier, MediaValidation
28
28
  from .schema_object import SchemaObject
29
29
  from .table_version import TableVersion
30
30
  from .table_version_path import TableVersionPath
@@ -91,6 +91,7 @@ class Table(SchemaObject):
91
91
  'num_retained_versions': 10,
92
92
  'is_view': False,
93
93
  'is_snapshot': False,
94
+ 'media_validation': 'on_write',
94
95
  }
95
96
  ```
96
97
  """
@@ -101,6 +102,7 @@ class Table(SchemaObject):
101
102
  md['schema_version'] = self._tbl_version.schema_version
102
103
  md['comment'] = self._comment
103
104
  md['num_retained_versions'] = self._num_retained_versions
105
+ md['media_validation'] = self._media_validation.name.lower()
104
106
  return md
105
107
 
106
108
  @property
@@ -244,6 +246,10 @@ class Table(SchemaObject):
244
246
  def _num_retained_versions(self):
245
247
  return self._tbl_version.num_retained_versions
246
248
 
249
+ @property
250
+ def _media_validation(self) -> MediaValidation:
251
+ return self._tbl_version.media_validation
252
+
247
253
  def _description(self) -> pd.DataFrame:
248
254
  cols = self._tbl_version_path.columns()
249
255
  df = pd.DataFrame({
@@ -422,7 +428,7 @@ class Table(SchemaObject):
422
428
  (on account of containing Python Callables or Exprs).
423
429
  """
424
430
  assert isinstance(spec, dict)
425
- valid_keys = {'type', 'value', 'stored'}
431
+ valid_keys = {'type', 'value', 'stored', 'media_validation'}
426
432
  has_type = False
427
433
  for k in spec.keys():
428
434
  if k not in valid_keys:
@@ -449,6 +455,9 @@ class Table(SchemaObject):
449
455
  if 'type' in spec:
450
456
  raise excs.Error(f'Column {name}: "type" is redundant if value is a Pixeltable expression')
451
457
 
458
+ if 'media_validation' in spec:
459
+ _ = catalog.MediaValidation.validated(spec['media_validation'], f'Column {name}: media_validation')
460
+
452
461
  if 'stored' in spec and not isinstance(spec['stored'], bool):
453
462
  raise excs.Error(f'Column {name}: "stored" must be a bool, got {spec["stored"]}')
454
463
  if not has_type:
@@ -462,6 +471,7 @@ class Table(SchemaObject):
462
471
  col_type: Optional[ts.ColumnType] = None
463
472
  value_expr: Optional[exprs.Expr] = None
464
473
  primary_key: Optional[bool] = None
474
+ media_validation: Optional[catalog.MediaValidation] = None
465
475
  stored = True
466
476
 
467
477
  if isinstance(spec, (ts.ColumnType, type, _GenericAlias)):
@@ -484,9 +494,15 @@ class Table(SchemaObject):
484
494
  value_expr = value_expr.copy()
485
495
  stored = spec.get('stored', True)
486
496
  primary_key = spec.get('primary_key')
497
+ media_validation_str = spec.get('media_validation')
498
+ media_validation = (
499
+ catalog.MediaValidation[media_validation_str.upper()] if media_validation_str is not None
500
+ else None
501
+ )
487
502
 
488
503
  column = Column(
489
- name, col_type=col_type, computed_with=value_expr, stored=stored, is_pk=primary_key)
504
+ name, col_type=col_type, computed_with=value_expr, stored=stored, is_pk=primary_key,
505
+ media_validation=media_validation)
490
506
  columns.append(column)
491
507
  return columns
492
508
 
pixeltable/catalog/table_version.py CHANGED
@@ -26,7 +26,7 @@ from pixeltable.utils.media_store import MediaStore
26
26
 
27
27
  from ..func.globals import resolve_symbol
28
28
  from .column import Column
29
- from .globals import _POS_COLUMN_NAME, _ROWID_COLUMN_NAME, UpdateStatus, is_valid_identifier
29
+ from .globals import _POS_COLUMN_NAME, _ROWID_COLUMN_NAME, UpdateStatus, is_valid_identifier, MediaValidation
30
30
 
31
31
  if TYPE_CHECKING:
32
32
  from pixeltable import exec, store
@@ -53,6 +53,7 @@ class TableVersion:
53
53
  name: str
54
54
  version: int
55
55
  comment: str
56
+ media_validation: MediaValidation
56
57
  num_retained_versions: int
57
58
  schema_version: int
58
59
  view_md: Optional[schema.ViewMd]
@@ -109,6 +110,7 @@ class TableVersion:
109
110
  self.view_md = tbl_md.view_md # save this as-is, it's needed for _create_md()
110
111
  is_view = tbl_md.view_md is not None
111
112
  self.is_snapshot = (is_view and tbl_md.view_md.is_snapshot) or bool(is_snapshot)
113
+ self.media_validation = MediaValidation[schema_version_md.media_validation.upper()]
112
114
  # a mutable TableVersion doesn't have a static version
113
115
  self.effective_version = self.version if self.is_snapshot else None
114
116
 
@@ -182,7 +184,7 @@ class TableVersion:
182
184
  @classmethod
183
185
  def create(
184
186
  cls, session: orm.Session, dir_id: UUID, name: str, cols: list[Column], num_retained_versions: int,
185
- comment: str, base_path: Optional[pxt.catalog.TableVersionPath] = None,
187
+ comment: str, media_validation: MediaValidation, base_path: Optional[pxt.catalog.TableVersionPath] = None,
186
188
  view_md: Optional[schema.ViewMd] = None
187
189
  ) -> tuple[UUID, Optional[TableVersion]]:
188
190
  # assign ids
@@ -214,11 +216,17 @@ class TableVersion:
214
216
  tbl_id=tbl_record.id, version=0, md=dataclasses.asdict(table_version_md))
215
217
 
216
218
  # create schema.TableSchemaVersion
217
- schema_col_md = {col.id: schema.SchemaColumn(pos=pos, name=col.name) for pos, col in enumerate(cols)}
219
+ schema_col_md: dict[int, schema.SchemaColumn] = {}
220
+ for pos, col in enumerate(cols):
221
+ md = schema.SchemaColumn(
222
+ pos=pos, name=col.name,
223
+ media_validation=col._media_validation.name.lower() if col._media_validation is not None else None)
224
+ schema_col_md[col.id] = md
218
225
 
219
226
  schema_version_md = schema.TableSchemaVersionMd(
220
227
  schema_version=0, preceding_schema_version=None, columns=schema_col_md,
221
- num_retained_versions=num_retained_versions, comment=comment)
228
+ num_retained_versions=num_retained_versions, comment=comment,
229
+ media_validation=media_validation.name.lower())
222
230
  schema_version_record = schema.TableSchemaVersion(
223
231
  tbl_id=tbl_record.id, schema_version=0, md=dataclasses.asdict(schema_version_md))
224
232
 
@@ -285,10 +293,15 @@ class TableVersion:
285
293
  self.cols_by_name = {}
286
294
  self.cols_by_id = {}
287
295
  for col_md in tbl_md.column_md.values():
288
- col_name = schema_version_md.columns[col_md.id].name if col_md.id in schema_version_md.columns else None
296
+ schema_col_md = schema_version_md.columns[col_md.id] if col_md.id in schema_version_md.columns else None
297
+ col_name = schema_col_md.name if schema_col_md is not None else None
298
+ media_val = (
299
+ MediaValidation[schema_col_md.media_validation.upper()]
300
+ if schema_col_md is not None and schema_col_md.media_validation is not None else None
301
+ )
289
302
  col = Column(
290
303
  col_id=col_md.id, name=col_name, col_type=ts.ColumnType.from_dict(col_md.col_type),
291
- is_pk=col_md.is_pk, stored=col_md.stored,
304
+ is_pk=col_md.is_pk, stored=col_md.stored, media_validation=media_val,
292
305
  schema_version_add=col_md.schema_version_add, schema_version_drop=col_md.schema_version_drop,
293
306
  value_expr_dict=col_md.value_expr)
294
307
  col.tbl = self
@@ -349,7 +362,8 @@ class TableVersion:
349
362
  self.store_tbl = StoreTable(self)
350
363
 
351
364
  def _update_md(
352
- self, timestamp: float, conn: sql.engine.Connection, update_tbl_version: bool = True, preceding_schema_version: Optional[int] = None
365
+ self, timestamp: float, conn: sql.engine.Connection, update_tbl_version: bool = True,
366
+ preceding_schema_version: Optional[int] = None
353
367
  ) -> None:
354
368
  """Writes table metadata to the database.
355
369
 
@@ -710,20 +724,22 @@ class TableVersion:
710
724
 
711
725
  if conn is None:
712
726
  with Env.get().engine.begin() as conn:
713
- return self._insert(plan, conn, time.time(), print_stats=print_stats, rowids=rowids())
727
+ return self._insert(
728
+ plan, conn, time.time(), print_stats=print_stats, rowids=rowids(), abort_on_exc=fail_on_exception)
714
729
  else:
715
- return self._insert(plan, conn, time.time(), print_stats=print_stats, rowids=rowids())
730
+ return self._insert(
731
+ plan, conn, time.time(), print_stats=print_stats, rowids=rowids(), abort_on_exc=fail_on_exception)
716
732
 
717
733
  def _insert(
718
734
  self, exec_plan: 'exec.ExecNode', conn: sql.engine.Connection, timestamp: float, *,
719
- rowids: Optional[Iterator[int]] = None, print_stats: bool = False,
735
+ rowids: Optional[Iterator[int]] = None, print_stats: bool = False, abort_on_exc: bool = False
720
736
  ) -> UpdateStatus:
721
737
  """Insert rows produced by exec_plan and propagate to views"""
722
738
  # we're creating a new version
723
739
  self.version += 1
724
740
  result = UpdateStatus()
725
741
  num_rows, num_excs, cols_with_excs = self.store_tbl.insert_rows(
726
- exec_plan, conn, v_min=self.version, rowids=rowids)
742
+ exec_plan, conn, v_min=self.version, rowids=rowids, abort_on_exc=abort_on_exc)
727
743
  result.num_rows = num_rows
728
744
  result.num_excs = num_excs
729
745
  result.num_computed_values += exec_plan.ctx.num_computed_exprs * num_rows
@@ -1203,7 +1219,8 @@ class TableVersion:
1203
1219
  name=self.name, current_version=self.version, current_schema_version=self.schema_version,
1204
1220
  next_col_id=self.next_col_id, next_idx_id=self.next_idx_id, next_row_id=self.next_rowid,
1205
1221
  column_md=self._create_column_md(self.cols), index_md=self.idx_md,
1206
- external_stores=self._create_stores_md(self.external_stores.values()), view_md=self.view_md)
1222
+ external_stores=self._create_stores_md(self.external_stores.values()), view_md=self.view_md,
1223
+ )
1207
1224
 
1208
1225
  def _create_version_md(self, timestamp: float) -> schema.TableVersionMd:
1209
1226
  return schema.TableVersionMd(created_at=timestamp, version=self.version, schema_version=self.schema_version)
@@ -1211,11 +1228,14 @@ class TableVersion:
1211
1228
  def _create_schema_version_md(self, preceding_schema_version: int) -> schema.TableSchemaVersionMd:
1212
1229
  column_md: dict[int, schema.SchemaColumn] = {}
1213
1230
  for pos, col in enumerate(self.cols_by_name.values()):
1214
- column_md[col.id] = schema.SchemaColumn(pos=pos, name=col.name)
1231
+ column_md[col.id] = schema.SchemaColumn(
1232
+ pos=pos, name=col.name,
1233
+ media_validation=col._media_validation.name.lower() if col._media_validation is not None else None)
1215
1234
  # preceding_schema_version to be set by the caller
1216
1235
  return schema.TableSchemaVersionMd(
1217
1236
  schema_version=self.schema_version, preceding_schema_version=preceding_schema_version,
1218
- columns=column_md, num_retained_versions=self.num_retained_versions, comment=self.comment)
1237
+ columns=column_md, num_retained_versions=self.num_retained_versions, comment=self.comment,
1238
+ media_validation=self.media_validation.name.lower())
1219
1239
 
1220
1240
  def as_dict(self) -> dict:
1221
1241
  return {'id': str(self.id), 'effective_version': self.effective_version}
pixeltable/catalog/view.py CHANGED
@@ -2,24 +2,21 @@ from __future__ import annotations
2
2
 
3
3
  import inspect
4
4
  import logging
5
- from typing import TYPE_CHECKING, Any, Dict, Iterable, Optional, Set, Type
5
+ from typing import TYPE_CHECKING, Any, Iterable, Optional
6
6
  from uuid import UUID
7
7
 
8
8
  import sqlalchemy.orm as orm
9
9
 
10
- import pixeltable.catalog as catalog
11
10
  import pixeltable.exceptions as excs
12
- import pixeltable.exprs as exprs
13
- import pixeltable.func as func
14
11
  import pixeltable.metadata.schema as md_schema
12
+ import pixeltable.type_system as ts
13
+ from pixeltable import catalog, exprs, func
15
14
  from pixeltable.env import Env
16
- from pixeltable.exceptions import Error
17
15
  from pixeltable.iterators import ComponentIterator
18
- from pixeltable.type_system import IntType, InvalidType
19
16
 
20
17
  from .catalog import Catalog
21
18
  from .column import Column
22
- from .globals import _POS_COLUMN_NAME, UpdateStatus
19
+ from .globals import _POS_COLUMN_NAME, UpdateStatus, MediaValidation
23
20
  from .table import Table
24
21
  from .table_version import TableVersion
25
22
  from .table_version_path import TableVersionPath
@@ -52,9 +49,10 @@ class View(Table):
52
49
 
53
50
  @classmethod
54
51
  def _create(
55
- cls, dir_id: UUID, name: str, base: TableVersionPath, additional_columns: Dict[str, Any],
52
+ cls, dir_id: UUID, name: str, base: TableVersionPath, additional_columns: dict[str, Any],
56
53
  predicate: Optional['pxt.exprs.Expr'], is_snapshot: bool, num_retained_versions: int, comment: str,
57
- iterator_cls: Optional[Type[ComponentIterator]], iterator_args: Optional[Dict]
54
+ media_validation: MediaValidation,
55
+ iterator_cls: Optional[type[ComponentIterator]], iterator_args: Optional[dict]
58
56
  ) -> View:
59
57
  columns = cls._create_columns(additional_columns)
60
58
  cls._verify_schema(columns)
@@ -92,17 +90,17 @@ class View(Table):
92
90
  func.Parameter(param_name, param_type, kind=inspect.Parameter.POSITIONAL_OR_KEYWORD)
93
91
  for param_name, param_type in iterator_cls.input_schema().items()
94
92
  ]
95
- sig = func.Signature(InvalidType(), params)
93
+ sig = func.Signature(ts.InvalidType(), params)
96
94
  from pixeltable.exprs import FunctionCall
97
95
  FunctionCall.normalize_args(iterator_cls.__name__, sig, bound_args)
98
96
  except TypeError as e:
99
- raise Error(f'Cannot instantiate iterator with given arguments: {e}')
97
+ raise excs.Error(f'Cannot instantiate iterator with given arguments: {e}')
100
98
 
101
99
  # prepend pos and output_schema columns to cols:
102
100
  # a component view exposes the pos column of its rowid;
103
101
  # we create that column here, so it gets assigned a column id;
104
102
  # stored=False: it is not stored separately (it's already stored as part of the rowid)
105
- iterator_cols = [Column(_POS_COLUMN_NAME, IntType(), stored=False)]
103
+ iterator_cols = [Column(_POS_COLUMN_NAME, ts.IntType(), stored=False)]
106
104
  output_dict, unstored_cols = iterator_cls.output_schema(**bound_args)
107
105
  iterator_cols.extend([
108
106
  Column(col_name, col_type, stored=col_name not in unstored_cols)
@@ -112,12 +110,12 @@ class View(Table):
112
110
  iterator_col_names = {col.name for col in iterator_cols}
113
111
  for col in columns:
114
112
  if col.name in iterator_col_names:
115
- raise Error(f'Duplicate name: column {col.name} is already present in the iterator output schema')
113
+ raise excs.Error(f'Duplicate name: column {col.name} is already present in the iterator output schema')
116
114
  columns = iterator_cols + columns
117
115
 
118
116
  with orm.Session(Env.get().engine, future=True) as session:
119
117
  from pixeltable.exprs import InlineDict
120
- iterator_args_expr = InlineDict(iterator_args) if iterator_args is not None else None
118
+ iterator_args_expr: exprs.Expr = InlineDict(iterator_args) if iterator_args is not None else None
121
119
  iterator_class_fqn = f'{iterator_cls.__module__}.{iterator_cls.__name__}' if iterator_cls is not None \
122
120
  else None
123
121
  base_version_path = cls._get_snapshot_path(base) if is_snapshot else base
@@ -142,7 +140,8 @@ class View(Table):
142
140
  iterator_args=iterator_args_expr.as_dict() if iterator_args_expr is not None else None)
143
141
 
144
142
  id, tbl_version = TableVersion.create(
145
- session, dir_id, name, columns, num_retained_versions, comment, base_path=base_version_path, view_md=view_md)
143
+ session, dir_id, name, columns, num_retained_versions, comment, media_validation=media_validation,
144
+ base_path=base_version_path, view_md=view_md)
146
145
  if tbl_version is None:
147
146
  # this is purely a snapshot: we use the base's tbl version path
148
147
  view = cls(id, dir_id, name, base_version_path, base.tbl_id(), snapshot_only=True)
@@ -168,11 +167,11 @@ class View(Table):
168
167
 
169
168
  @classmethod
170
169
  def _verify_column(
171
- cls, col: Column, existing_column_names: Set[str], existing_query_names: Optional[Set[str]] = None
170
+ cls, col: Column, existing_column_names: set[str], existing_query_names: Optional[set[str]] = None
172
171
  ) -> None:
173
172
  # make sure that columns are nullable or have a default
174
173
  if not col.col_type.nullable and not col.is_computed:
175
- raise Error(f'Column {col.name}: non-computed columns in views must be nullable')
174
+ raise excs.Error(f'Column {col.name}: non-computed columns in views must be nullable')
176
175
  super()._verify_column(col, existing_column_names, existing_query_names)
177
176
 
178
177
  @classmethod
pixeltable/dataframe.py CHANGED
@@ -371,15 +371,10 @@ class DataFrame:
371
371
  group_by_clause=group_by_clause, grouping_tbl=self.grouping_tbl,
372
372
  order_by_clause=order_by_clause, limit=self.limit_val)
373
373
 
374
- def collect(self) -> DataFrameResultSet:
375
- return self._collect()
376
-
377
- def _collect(self, conn: Optional[sql.engine.Connection] = None) -> DataFrameResultSet:
374
+ def _output_row_iterator(self, conn: Optional[sql.engine.Connection] = None) -> Iterator[list]:
378
375
  try:
379
- result_rows = []
380
376
  for data_row in self._exec(conn):
381
- result_row = [data_row[e.slot_idx] for e in self._select_list_exprs]
382
- result_rows.append(result_row)
377
+ yield [data_row[e.slot_idx] for e in self._select_list_exprs]
383
378
  except excs.ExprEvalError as e:
384
379
  msg = f'In row {e.row_num} the {e.expr_msg} encountered exception ' f'{type(e.exc).__name__}:\n{str(e.exc)}'
385
380
  if len(e.input_vals) > 0:
@@ -399,7 +394,11 @@ class DataFrame:
399
394
  except sql.exc.DBAPIError as e:
400
395
  raise excs.Error(f'Error during SQL execution:\n{e}')
401
396
 
402
- return DataFrameResultSet(result_rows, self.schema)
397
+ def collect(self) -> DataFrameResultSet:
398
+ return self._collect()
399
+
400
+ def _collect(self, conn: Optional[sql.engine.Connection] = None) -> DataFrameResultSet:
401
+ return DataFrameResultSet(list(self._output_row_iterator(conn)), self.schema)
403
402
 
404
403
  def count(self) -> int:
405
404
  from pixeltable.plan import Planner
pixeltable/env.py CHANGED
@@ -494,13 +494,18 @@ class Env:
494
494
  self.__register_package('anthropic')
495
495
  self.__register_package('boto3')
496
496
  self.__register_package('datasets')
497
+ self.__register_package('fiftyone')
497
498
  self.__register_package('fireworks', library_name='fireworks-ai')
499
+ self.__register_package('huggingface_hub', library_name='huggingface-hub')
498
500
  self.__register_package('label_studio_sdk', library_name='label-studio-sdk')
501
+ self.__register_package('llama_cpp', library_name='llama-cpp-python')
499
502
  self.__register_package('mistralai')
500
503
  self.__register_package('mistune')
504
+ self.__register_package('ollama')
501
505
  self.__register_package('openai')
502
506
  self.__register_package('openpyxl')
503
507
  self.__register_package('pyarrow')
508
+ self.__register_package('replicate')
504
509
  self.__register_package('sentence_transformers', library_name='sentence-transformers')
505
510
  self.__register_package('spacy')
506
511
  self.__register_package('tiktoken')
pixeltable/exec/__init__.py CHANGED
@@ -6,6 +6,5 @@ from .exec_context import ExecContext
6
6
  from .exec_node import ExecNode
7
7
  from .expr_eval_node import ExprEvalNode
8
8
  from .in_memory_data_node import InMemoryDataNode
9
- from .media_validation_node import MediaValidationNode
10
9
  from .row_update_node import RowUpdateNode
11
10
  from .sql_node import SqlLookupNode, SqlScanNode, SqlAggregationNode, SqlNode
pixeltable/exec/aggregation_node.py CHANGED
@@ -2,11 +2,12 @@ from __future__ import annotations
2
2
 
3
3
  import logging
4
4
  import sys
5
- from typing import Iterable, Optional, Any, Iterator
5
+ from typing import Any, Iterable, Iterator, Optional, cast
6
6
 
7
7
  import pixeltable.catalog as catalog
8
8
  import pixeltable.exceptions as excs
9
9
  import pixeltable.exprs as exprs
10
+
10
11
  from .data_row_batch import DataRowBatch
11
12
  from .exec_node import ExecNode
12
13
 
@@ -28,13 +29,15 @@ class AggregationNode(ExecNode):
28
29
  self, tbl: catalog.TableVersion, row_builder: exprs.RowBuilder, group_by: Optional[list[exprs.Expr]],
29
30
  agg_fn_calls: list[exprs.FunctionCall], input_exprs: Iterable[exprs.Expr], input: ExecNode
30
31
  ):
31
- super().__init__(row_builder, group_by + agg_fn_calls, input_exprs, input)
32
+ output_exprs: list[exprs.Expr] = [] if group_by is None else list(group_by)
33
+ output_exprs.extend(agg_fn_calls)
34
+ super().__init__(row_builder, output_exprs, input_exprs, input)
32
35
  self.input = input
33
36
  self.group_by = group_by
34
37
  self.input_exprs = list(input_exprs)
35
38
  self.agg_fn_eval_ctx = row_builder.create_eval_ctx(agg_fn_calls, exclude=self.input_exprs)
36
39
  # we need to make sure to refer to the same exprs that RowBuilder.eval() will use
37
- self.agg_fn_calls = self.agg_fn_eval_ctx.target_exprs
40
+ self.agg_fn_calls = [cast(exprs.FunctionCall, e) for e in self.agg_fn_eval_ctx.target_exprs]
38
41
  # create output_batch here, rather than in __iter__(), so we don't need to remember tbl and row_builder
39
42
  self.output_batch = DataRowBatch(tbl, row_builder, 0)
40
43
 
pixeltable/exec/cache_prefetch_node.py CHANGED
@@ -79,7 +79,7 @@ class CachePrefetchNode(ExecNode):
79
79
 
80
80
  return input_batch
81
81
 
82
- def _fetch_url(self, row: exprs.DataRow, slot_idx: int) -> Optional[str]:
82
+ def _fetch_url(self, row: exprs.DataRow, slot_idx: int) -> Optional[Path]:
83
83
  """Fetches a remote URL into Env.tmp_dir and returns its path"""
84
84
  url = row.file_urls[slot_idx]
85
85
  parsed = urllib.parse.urlparse(url)
pixeltable/exec/data_row_batch.py CHANGED
@@ -49,7 +49,7 @@ class DataRowBatch:
49
49
  def __len__(self) -> int:
50
50
  return len(self.rows)
51
51
 
52
- def __getitem__(self, index: object) -> exprs.DataRow:
52
+ def __getitem__(self, index: int) -> exprs.DataRow:
53
53
  return self.rows[index]
54
54
 
55
55
  def flush_imgs(
@@ -74,21 +74,4 @@ class DataRowBatch:
74
74
  row.flush_img(slot_idx)
75
75
 
76
76
  def __iter__(self) -> Iterator[exprs.DataRow]:
77
- return DataRowBatchIterator(self)
78
-
79
-
80
- class DataRowBatchIterator:
81
- """
82
- Iterator over a DataRowBatch.
83
- """
84
- def __init__(self, batch: DataRowBatch):
85
- self.row_batch = batch
86
- self.index = 0
87
-
88
- def __next__(self) -> exprs.DataRow:
89
- if self.index >= len(self.row_batch.rows):
90
- raise StopIteration
91
- row = self.row_batch.rows[self.index]
92
- self.index += 1
93
- return row
94
-
77
+ return iter(self.rows)
pixeltable/exec/exec_node.py CHANGED
@@ -1,9 +1,10 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import abc
4
- from typing import Iterable, Optional, List, TYPE_CHECKING, Iterator
4
+ from typing import TYPE_CHECKING, Iterable, Iterator, List, Optional
5
5
 
6
6
  import pixeltable.exprs as exprs
7
+
7
8
  from .data_row_batch import DataRowBatch
8
9
  from .exec_context import ExecContext
9
10