pixeltable 0.2.21__py3-none-any.whl → 0.2.23__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. pixeltable/__init__.py +2 -2
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +1 -1
  4. pixeltable/catalog/column.py +41 -29
  5. pixeltable/catalog/globals.py +18 -0
  6. pixeltable/catalog/insertable_table.py +30 -10
  7. pixeltable/catalog/table.py +198 -86
  8. pixeltable/catalog/table_version.py +47 -53
  9. pixeltable/catalog/table_version_path.py +2 -2
  10. pixeltable/catalog/view.py +17 -18
  11. pixeltable/dataframe.py +27 -36
  12. pixeltable/env.py +7 -0
  13. pixeltable/exec/__init__.py +0 -1
  14. pixeltable/exec/aggregation_node.py +6 -3
  15. pixeltable/exec/cache_prefetch_node.py +189 -43
  16. pixeltable/exec/data_row_batch.py +5 -22
  17. pixeltable/exec/exec_context.py +2 -2
  18. pixeltable/exec/exec_node.py +3 -2
  19. pixeltable/exec/expr_eval_node.py +23 -16
  20. pixeltable/exec/in_memory_data_node.py +6 -3
  21. pixeltable/exec/sql_node.py +24 -25
  22. pixeltable/exprs/arithmetic_expr.py +12 -5
  23. pixeltable/exprs/array_slice.py +7 -7
  24. pixeltable/exprs/column_property_ref.py +37 -10
  25. pixeltable/exprs/column_ref.py +97 -14
  26. pixeltable/exprs/comparison.py +10 -5
  27. pixeltable/exprs/compound_predicate.py +8 -7
  28. pixeltable/exprs/data_row.py +27 -18
  29. pixeltable/exprs/expr.py +53 -52
  30. pixeltable/exprs/expr_set.py +5 -0
  31. pixeltable/exprs/function_call.py +32 -16
  32. pixeltable/exprs/globals.py +4 -1
  33. pixeltable/exprs/in_predicate.py +8 -7
  34. pixeltable/exprs/inline_expr.py +4 -4
  35. pixeltable/exprs/is_null.py +4 -4
  36. pixeltable/exprs/json_mapper.py +11 -12
  37. pixeltable/exprs/json_path.py +6 -11
  38. pixeltable/exprs/literal.py +5 -5
  39. pixeltable/exprs/method_ref.py +5 -4
  40. pixeltable/exprs/object_ref.py +2 -1
  41. pixeltable/exprs/row_builder.py +88 -36
  42. pixeltable/exprs/rowid_ref.py +12 -11
  43. pixeltable/exprs/similarity_expr.py +12 -7
  44. pixeltable/exprs/sql_element_cache.py +7 -5
  45. pixeltable/exprs/type_cast.py +8 -6
  46. pixeltable/exprs/variable.py +5 -4
  47. pixeltable/func/aggregate_function.py +9 -9
  48. pixeltable/func/expr_template_function.py +6 -5
  49. pixeltable/func/function.py +11 -10
  50. pixeltable/func/udf.py +6 -11
  51. pixeltable/functions/__init__.py +2 -2
  52. pixeltable/functions/globals.py +5 -7
  53. pixeltable/functions/huggingface.py +155 -45
  54. pixeltable/functions/llama_cpp.py +107 -0
  55. pixeltable/functions/mistralai.py +1 -1
  56. pixeltable/functions/ollama.py +147 -0
  57. pixeltable/functions/openai.py +1 -1
  58. pixeltable/functions/replicate.py +72 -0
  59. pixeltable/functions/string.py +9 -0
  60. pixeltable/functions/together.py +1 -1
  61. pixeltable/functions/util.py +5 -2
  62. pixeltable/globals.py +67 -26
  63. pixeltable/index/btree.py +16 -3
  64. pixeltable/index/embedding_index.py +4 -4
  65. pixeltable/io/__init__.py +1 -2
  66. pixeltable/io/fiftyone.py +178 -0
  67. pixeltable/io/globals.py +96 -2
  68. pixeltable/iterators/base.py +3 -2
  69. pixeltable/iterators/document.py +1 -1
  70. pixeltable/iterators/video.py +120 -63
  71. pixeltable/metadata/__init__.py +1 -1
  72. pixeltable/metadata/converters/convert_21.py +34 -0
  73. pixeltable/metadata/converters/util.py +45 -4
  74. pixeltable/metadata/notes.py +1 -0
  75. pixeltable/metadata/schema.py +8 -0
  76. pixeltable/plan.py +17 -15
  77. pixeltable/py.typed +0 -0
  78. pixeltable/store.py +7 -2
  79. pixeltable/tool/create_test_db_dump.py +1 -1
  80. pixeltable/tool/create_test_video.py +1 -1
  81. pixeltable/tool/embed_udf.py +1 -1
  82. pixeltable/tool/mypy_plugin.py +28 -5
  83. pixeltable/type_system.py +100 -36
  84. pixeltable/utils/coco.py +5 -5
  85. pixeltable/utils/documents.py +15 -1
  86. pixeltable/utils/formatter.py +12 -13
  87. pixeltable/utils/s3.py +6 -3
  88. {pixeltable-0.2.21.dist-info → pixeltable-0.2.23.dist-info}/METADATA +158 -49
  89. pixeltable-0.2.23.dist-info/RECORD +153 -0
  90. pixeltable/exec/media_validation_node.py +0 -43
  91. pixeltable-0.2.21.dist-info/RECORD +0 -148
  92. {pixeltable-0.2.21.dist-info → pixeltable-0.2.23.dist-info}/LICENSE +0 -0
  93. {pixeltable-0.2.21.dist-info → pixeltable-0.2.23.dist-info}/WHEEL +0 -0
  94. {pixeltable-0.2.21.dist-info → pixeltable-0.2.23.dist-info}/entry_points.txt +0 -0
@@ -26,7 +26,7 @@ from pixeltable.utils.media_store import MediaStore
26
26
 
27
27
  from ..func.globals import resolve_symbol
28
28
  from .column import Column
29
- from .globals import _POS_COLUMN_NAME, _ROWID_COLUMN_NAME, UpdateStatus, is_valid_identifier
29
+ from .globals import _POS_COLUMN_NAME, _ROWID_COLUMN_NAME, UpdateStatus, is_valid_identifier, MediaValidation
30
30
 
31
31
  if TYPE_CHECKING:
32
32
  from pixeltable import exec, store
@@ -53,6 +53,7 @@ class TableVersion:
53
53
  name: str
54
54
  version: int
55
55
  comment: str
56
+ media_validation: MediaValidation
56
57
  num_retained_versions: int
57
58
  schema_version: int
58
59
  view_md: Optional[schema.ViewMd]
@@ -109,6 +110,7 @@ class TableVersion:
109
110
  self.view_md = tbl_md.view_md # save this as-is, it's needed for _create_md()
110
111
  is_view = tbl_md.view_md is not None
111
112
  self.is_snapshot = (is_view and tbl_md.view_md.is_snapshot) or bool(is_snapshot)
113
+ self.media_validation = MediaValidation[schema_version_md.media_validation.upper()]
112
114
  # a mutable TableVersion doesn't have a static version
113
115
  self.effective_version = self.version if self.is_snapshot else None
114
116
 
@@ -182,7 +184,7 @@ class TableVersion:
182
184
  @classmethod
183
185
  def create(
184
186
  cls, session: orm.Session, dir_id: UUID, name: str, cols: list[Column], num_retained_versions: int,
185
- comment: str, base_path: Optional[pxt.catalog.TableVersionPath] = None,
187
+ comment: str, media_validation: MediaValidation, base_path: Optional[pxt.catalog.TableVersionPath] = None,
186
188
  view_md: Optional[schema.ViewMd] = None
187
189
  ) -> tuple[UUID, Optional[TableVersion]]:
188
190
  # assign ids
@@ -191,8 +193,6 @@ class TableVersion:
191
193
  col.id = pos
192
194
  col.schema_version_add = 0
193
195
  cols_by_name[col.name] = col
194
- if col.value_expr is None and col.compute_func is not None:
195
- cls._create_value_expr(col, base_path)
196
196
  if col.is_computed:
197
197
  col.check_value_expr()
198
198
 
@@ -214,11 +214,17 @@ class TableVersion:
214
214
  tbl_id=tbl_record.id, version=0, md=dataclasses.asdict(table_version_md))
215
215
 
216
216
  # create schema.TableSchemaVersion
217
- schema_col_md = {col.id: schema.SchemaColumn(pos=pos, name=col.name) for pos, col in enumerate(cols)}
217
+ schema_col_md: dict[int, schema.SchemaColumn] = {}
218
+ for pos, col in enumerate(cols):
219
+ md = schema.SchemaColumn(
220
+ pos=pos, name=col.name,
221
+ media_validation=col._media_validation.name.lower() if col._media_validation is not None else None)
222
+ schema_col_md[col.id] = md
218
223
 
219
224
  schema_version_md = schema.TableSchemaVersionMd(
220
225
  schema_version=0, preceding_schema_version=None, columns=schema_col_md,
221
- num_retained_versions=num_retained_versions, comment=comment)
226
+ num_retained_versions=num_retained_versions, comment=comment,
227
+ media_validation=media_validation.name.lower())
222
228
  schema_version_record = schema.TableSchemaVersion(
223
229
  tbl_id=tbl_record.id, schema_version=0, md=dataclasses.asdict(schema_version_md))
224
230
 
@@ -285,10 +291,15 @@ class TableVersion:
285
291
  self.cols_by_name = {}
286
292
  self.cols_by_id = {}
287
293
  for col_md in tbl_md.column_md.values():
288
- col_name = schema_version_md.columns[col_md.id].name if col_md.id in schema_version_md.columns else None
294
+ schema_col_md = schema_version_md.columns[col_md.id] if col_md.id in schema_version_md.columns else None
295
+ col_name = schema_col_md.name if schema_col_md is not None else None
296
+ media_val = (
297
+ MediaValidation[schema_col_md.media_validation.upper()]
298
+ if schema_col_md is not None and schema_col_md.media_validation is not None else None
299
+ )
289
300
  col = Column(
290
301
  col_id=col_md.id, name=col_name, col_type=ts.ColumnType.from_dict(col_md.col_type),
291
- is_pk=col_md.is_pk, stored=col_md.stored,
302
+ is_pk=col_md.is_pk, stored=col_md.stored, media_validation=media_val,
292
303
  schema_version_add=col_md.schema_version_add, schema_version_drop=col_md.schema_version_drop,
293
304
  value_expr_dict=col_md.value_expr)
294
305
  col.tbl = self
@@ -349,7 +360,8 @@ class TableVersion:
349
360
  self.store_tbl = StoreTable(self)
350
361
 
351
362
  def _update_md(
352
- self, timestamp: float, conn: sql.engine.Connection, update_tbl_version: bool = True, preceding_schema_version: Optional[int] = None
363
+ self, timestamp: float, conn: sql.engine.Connection, update_tbl_version: bool = True,
364
+ preceding_schema_version: Optional[int] = None
353
365
  ) -> None:
354
366
  """Writes table metadata to the database.
355
367
 
@@ -480,37 +492,35 @@ class TableVersion:
480
492
  self._update_md(time.time(), conn, preceding_schema_version=preceding_schema_version)
481
493
  _logger.info(f'Dropped index {idx_md.name} on table {self.name}')
482
494
 
483
- def add_column(self, col: Column, print_stats: bool, on_error: Literal['abort', 'ignore']) -> UpdateStatus:
495
+ def add_columns(self, cols: Iterable[Column], print_stats: bool, on_error: Literal['abort', 'ignore']) -> UpdateStatus:
484
496
  """Adds a column to the table.
485
497
  """
486
498
  assert not self.is_snapshot
487
- assert is_valid_identifier(col.name)
488
- assert col.stored is not None
489
- assert col.name not in self.cols_by_name
490
- col.tbl = self
491
- col.id = self.next_col_id
492
- self.next_col_id += 1
493
-
494
- if col.compute_func is not None:
495
- # create value_expr from compute_func
496
- self._create_value_expr(col, self.path)
499
+ assert all(is_valid_identifier(col.name) for col in cols)
500
+ assert all(col.stored is not None for col in cols)
501
+ assert all(col.name not in self.cols_by_name for col in cols)
502
+ for col in cols:
503
+ col.tbl = self
504
+ col.id = self.next_col_id
505
+ self.next_col_id += 1
497
506
 
498
507
  # we're creating a new schema version
499
508
  self.version += 1
500
509
  preceding_schema_version = self.schema_version
501
510
  self.schema_version = self.version
502
511
  with Env.get().engine.begin() as conn:
503
- status = self._add_columns([col], conn, print_stats=print_stats, on_error=on_error)
504
- _ = self._add_default_index(col, conn)
512
+ status = self._add_columns(cols, conn, print_stats=print_stats, on_error=on_error)
513
+ for col in cols:
514
+ _ = self._add_default_index(col, conn)
505
515
  self._update_md(time.time(), conn, preceding_schema_version=preceding_schema_version)
506
- _logger.info(f'Added column {col.name} to table {self.name}, new version: {self.version}')
516
+ _logger.info(f'Added columns {[col.name for col in cols]} to table {self.name}, new version: {self.version}')
507
517
 
508
518
  msg = (
509
519
  f'Added {status.num_rows} column value{"" if status.num_rows == 1 else "s"} '
510
520
  f'with {status.num_excs} error{"" if status.num_excs == 1 else "s"}.'
511
521
  )
512
522
  print(msg)
513
- _logger.info(f'Column {col.name}: {msg}')
523
+ _logger.info(f'Columns {[col.name for col in cols]}: {msg}')
514
524
  return status
515
525
 
516
526
  def _add_columns(
@@ -710,20 +720,22 @@ class TableVersion:
710
720
 
711
721
  if conn is None:
712
722
  with Env.get().engine.begin() as conn:
713
- return self._insert(plan, conn, time.time(), print_stats=print_stats, rowids=rowids())
723
+ return self._insert(
724
+ plan, conn, time.time(), print_stats=print_stats, rowids=rowids(), abort_on_exc=fail_on_exception)
714
725
  else:
715
- return self._insert(plan, conn, time.time(), print_stats=print_stats, rowids=rowids())
726
+ return self._insert(
727
+ plan, conn, time.time(), print_stats=print_stats, rowids=rowids(), abort_on_exc=fail_on_exception)
716
728
 
717
729
  def _insert(
718
730
  self, exec_plan: 'exec.ExecNode', conn: sql.engine.Connection, timestamp: float, *,
719
- rowids: Optional[Iterator[int]] = None, print_stats: bool = False,
731
+ rowids: Optional[Iterator[int]] = None, print_stats: bool = False, abort_on_exc: bool = False
720
732
  ) -> UpdateStatus:
721
733
  """Insert rows produced by exec_plan and propagate to views"""
722
734
  # we're creating a new version
723
735
  self.version += 1
724
736
  result = UpdateStatus()
725
737
  num_rows, num_excs, cols_with_excs = self.store_tbl.insert_rows(
726
- exec_plan, conn, v_min=self.version, rowids=rowids)
738
+ exec_plan, conn, v_min=self.version, rowids=rowids, abort_on_exc=abort_on_exc)
727
739
  result.num_rows = num_rows
728
740
  result.num_excs = num_excs
729
741
  result.num_computed_values += exec_plan.ctx.num_computed_exprs * num_rows
@@ -1124,28 +1136,6 @@ class TableVersion:
1124
1136
  names = [c.name for c in self.cols_by_name.values() if c.is_computed]
1125
1137
  return names
1126
1138
 
1127
- @classmethod
1128
- def _create_value_expr(cls, col: Column, path: pxt.catalog.TableVersionPath) -> None:
1129
- """
1130
- Create col.value_expr, given col.compute_func.
1131
- Interprets compute_func's parameters to be references to columns and construct ColumnRefs as args.
1132
- Does not update Column.dependent_cols.
1133
- """
1134
- assert col.value_expr is None
1135
- assert col.compute_func is not None
1136
- from pixeltable import exprs
1137
- params = inspect.signature(col.compute_func).parameters
1138
- args: list[exprs.ColumnRef] = []
1139
- for param_name in params:
1140
- param = path.get_column(param_name)
1141
- if param is None:
1142
- raise excs.Error(
1143
- f'Column {col.name}: Callable parameter refers to an unknown column: {param_name}')
1144
- args.append(exprs.ColumnRef(param))
1145
- fn = func.make_function(
1146
- col.compute_func, return_type=col.col_type, param_types=[arg.col_type for arg in args])
1147
- col.set_value_expr(fn(*args))
1148
-
1149
1139
  def _record_refd_columns(self, col: Column) -> None:
1150
1140
  """Update Column.dependent_cols for all cols referenced in col.value_expr.
1151
1141
  """
@@ -1203,7 +1193,8 @@ class TableVersion:
1203
1193
  name=self.name, current_version=self.version, current_schema_version=self.schema_version,
1204
1194
  next_col_id=self.next_col_id, next_idx_id=self.next_idx_id, next_row_id=self.next_rowid,
1205
1195
  column_md=self._create_column_md(self.cols), index_md=self.idx_md,
1206
- external_stores=self._create_stores_md(self.external_stores.values()), view_md=self.view_md)
1196
+ external_stores=self._create_stores_md(self.external_stores.values()), view_md=self.view_md,
1197
+ )
1207
1198
 
1208
1199
  def _create_version_md(self, timestamp: float) -> schema.TableVersionMd:
1209
1200
  return schema.TableVersionMd(created_at=timestamp, version=self.version, schema_version=self.schema_version)
@@ -1211,11 +1202,14 @@ class TableVersion:
1211
1202
  def _create_schema_version_md(self, preceding_schema_version: int) -> schema.TableSchemaVersionMd:
1212
1203
  column_md: dict[int, schema.SchemaColumn] = {}
1213
1204
  for pos, col in enumerate(self.cols_by_name.values()):
1214
- column_md[col.id] = schema.SchemaColumn(pos=pos, name=col.name)
1205
+ column_md[col.id] = schema.SchemaColumn(
1206
+ pos=pos, name=col.name,
1207
+ media_validation=col._media_validation.name.lower() if col._media_validation is not None else None)
1215
1208
  # preceding_schema_version to be set by the caller
1216
1209
  return schema.TableSchemaVersionMd(
1217
1210
  schema_version=self.schema_version, preceding_schema_version=preceding_schema_version,
1218
- columns=column_md, num_retained_versions=self.num_retained_versions, comment=self.comment)
1211
+ columns=column_md, num_retained_versions=self.num_retained_versions, comment=self.comment,
1212
+ media_validation=self.media_validation.name.lower())
1219
1213
 
1220
1214
  def as_dict(self) -> dict:
1221
1215
  return {'id': str(self.id), 'effective_version': self.effective_version}
@@ -81,13 +81,13 @@ class TableVersionPath:
81
81
  return None
82
82
  return self.base.find_tbl_version(id)
83
83
 
84
- def __getattr__(self, col_name: str) -> exprs.ColumnRef:
84
+ def get_column_ref(self, col_name: str) -> exprs.ColumnRef:
85
85
  """Return a ColumnRef for the given column name."""
86
86
  from pixeltable.exprs import ColumnRef
87
87
  if col_name not in self.tbl_version.cols_by_name:
88
88
  if self.base is None:
89
89
  raise AttributeError(f'Column {col_name} unknown')
90
- return getattr(self.base, col_name)
90
+ return self.base.get_column_ref(col_name)
91
91
  col = self.tbl_version.cols_by_name[col_name]
92
92
  return ColumnRef(col)
93
93
 
@@ -2,24 +2,21 @@ from __future__ import annotations
2
2
 
3
3
  import inspect
4
4
  import logging
5
- from typing import TYPE_CHECKING, Any, Dict, Iterable, Optional, Set, Type
5
+ from typing import TYPE_CHECKING, Any, Iterable, Literal, Optional
6
6
  from uuid import UUID
7
7
 
8
8
  import sqlalchemy.orm as orm
9
9
 
10
- import pixeltable.catalog as catalog
11
10
  import pixeltable.exceptions as excs
12
- import pixeltable.exprs as exprs
13
- import pixeltable.func as func
14
11
  import pixeltable.metadata.schema as md_schema
12
+ import pixeltable.type_system as ts
13
+ from pixeltable import catalog, exprs, func
15
14
  from pixeltable.env import Env
16
- from pixeltable.exceptions import Error
17
15
  from pixeltable.iterators import ComponentIterator
18
- from pixeltable.type_system import IntType, InvalidType
19
16
 
20
17
  from .catalog import Catalog
21
18
  from .column import Column
22
- from .globals import _POS_COLUMN_NAME, UpdateStatus
19
+ from .globals import _POS_COLUMN_NAME, UpdateStatus, MediaValidation
23
20
  from .table import Table
24
21
  from .table_version import TableVersion
25
22
  from .table_version_path import TableVersionPath
@@ -52,9 +49,10 @@ class View(Table):
52
49
 
53
50
  @classmethod
54
51
  def _create(
55
- cls, dir_id: UUID, name: str, base: TableVersionPath, additional_columns: Dict[str, Any],
52
+ cls, dir_id: UUID, name: str, base: TableVersionPath, additional_columns: dict[str, Any],
56
53
  predicate: Optional['pxt.exprs.Expr'], is_snapshot: bool, num_retained_versions: int, comment: str,
57
- iterator_cls: Optional[Type[ComponentIterator]], iterator_args: Optional[Dict]
54
+ media_validation: MediaValidation,
55
+ iterator_cls: Optional[type[ComponentIterator]], iterator_args: Optional[dict]
58
56
  ) -> View:
59
57
  columns = cls._create_columns(additional_columns)
60
58
  cls._verify_schema(columns)
@@ -92,17 +90,17 @@ class View(Table):
92
90
  func.Parameter(param_name, param_type, kind=inspect.Parameter.POSITIONAL_OR_KEYWORD)
93
91
  for param_name, param_type in iterator_cls.input_schema().items()
94
92
  ]
95
- sig = func.Signature(InvalidType(), params)
93
+ sig = func.Signature(ts.InvalidType(), params)
96
94
  from pixeltable.exprs import FunctionCall
97
95
  FunctionCall.normalize_args(iterator_cls.__name__, sig, bound_args)
98
96
  except TypeError as e:
99
- raise Error(f'Cannot instantiate iterator with given arguments: {e}')
97
+ raise excs.Error(f'Cannot instantiate iterator with given arguments: {e}')
100
98
 
101
99
  # prepend pos and output_schema columns to cols:
102
100
  # a component view exposes the pos column of its rowid;
103
101
  # we create that column here, so it gets assigned a column id;
104
102
  # stored=False: it is not stored separately (it's already stored as part of the rowid)
105
- iterator_cols = [Column(_POS_COLUMN_NAME, IntType(), stored=False)]
103
+ iterator_cols = [Column(_POS_COLUMN_NAME, ts.IntType(), stored=False)]
106
104
  output_dict, unstored_cols = iterator_cls.output_schema(**bound_args)
107
105
  iterator_cols.extend([
108
106
  Column(col_name, col_type, stored=col_name not in unstored_cols)
@@ -112,12 +110,12 @@ class View(Table):
112
110
  iterator_col_names = {col.name for col in iterator_cols}
113
111
  for col in columns:
114
112
  if col.name in iterator_col_names:
115
- raise Error(f'Duplicate name: column {col.name} is already present in the iterator output schema')
113
+ raise excs.Error(f'Duplicate name: column {col.name} is already present in the iterator output schema')
116
114
  columns = iterator_cols + columns
117
115
 
118
116
  with orm.Session(Env.get().engine, future=True) as session:
119
117
  from pixeltable.exprs import InlineDict
120
- iterator_args_expr = InlineDict(iterator_args) if iterator_args is not None else None
118
+ iterator_args_expr: exprs.Expr = InlineDict(iterator_args) if iterator_args is not None else None
121
119
  iterator_class_fqn = f'{iterator_cls.__module__}.{iterator_cls.__name__}' if iterator_cls is not None \
122
120
  else None
123
121
  base_version_path = cls._get_snapshot_path(base) if is_snapshot else base
@@ -142,7 +140,8 @@ class View(Table):
142
140
  iterator_args=iterator_args_expr.as_dict() if iterator_args_expr is not None else None)
143
141
 
144
142
  id, tbl_version = TableVersion.create(
145
- session, dir_id, name, columns, num_retained_versions, comment, base_path=base_version_path, view_md=view_md)
143
+ session, dir_id, name, columns, num_retained_versions, comment, media_validation=media_validation,
144
+ base_path=base_version_path, view_md=view_md)
146
145
  if tbl_version is None:
147
146
  # this is purely a snapshot: we use the base's tbl version path
148
147
  view = cls(id, dir_id, name, base_version_path, base.tbl_id(), snapshot_only=True)
@@ -168,11 +167,11 @@ class View(Table):
168
167
 
169
168
  @classmethod
170
169
  def _verify_column(
171
- cls, col: Column, existing_column_names: Set[str], existing_query_names: Optional[Set[str]] = None
170
+ cls, col: Column, existing_column_names: set[str], existing_query_names: Optional[set[str]] = None
172
171
  ) -> None:
173
172
  # make sure that columns are nullable or have a default
174
173
  if not col.col_type.nullable and not col.is_computed:
175
- raise Error(f'Column {col.name}: non-computed columns in views must be nullable')
174
+ raise excs.Error(f'Column {col.name}: non-computed columns in views must be nullable')
176
175
  super()._verify_column(col, existing_column_names, existing_query_names)
177
176
 
178
177
  @classmethod
@@ -217,7 +216,7 @@ class View(Table):
217
216
 
218
217
  def insert(
219
218
  self, rows: Optional[Iterable[dict[str, Any]]] = None, /, *, print_stats: bool = False,
220
- fail_on_exception: bool = True, **kwargs: Any
219
+ on_error: Literal['abort', 'ignore'] = 'abort', **kwargs: Any
221
220
  ) -> UpdateStatus:
222
221
  raise excs.Error(f'{self._display_name()} {self._name!r}: cannot insert into view')
223
222
 
pixeltable/dataframe.py CHANGED
@@ -8,7 +8,7 @@ import logging
8
8
  import mimetypes
9
9
  import traceback
10
10
  from pathlib import Path
11
- from typing import TYPE_CHECKING, Any, Callable, Dict, Hashable, Iterator, List, Optional, Sequence, Set, Tuple, Union
11
+ from typing import TYPE_CHECKING, Any, Callable, Hashable, Iterator, Optional, Sequence, Union
12
12
 
13
13
  import pandas as pd
14
14
  import pandas.io.formats.style
@@ -34,14 +34,6 @@ __all__ = ['DataFrame']
34
34
  _logger = logging.getLogger('pixeltable')
35
35
 
36
36
 
37
- def _create_source_tag(file_path: str) -> str:
38
- src_url = get_file_uri(Env.get().http_address, file_path)
39
- mime = mimetypes.guess_type(src_url)[0]
40
- # if mime is None, the attribute string would not be valid html.
41
- mime_attr = f'type="{mime}"' if mime is not None else ''
42
- return f'<source src="{src_url}" {mime_attr} />'
43
-
44
-
45
37
  class DataFrameResultSet:
46
38
  def __init__(self, rows: list[list[Any]], schema: dict[str, ColumnType]):
47
39
  self._rows = rows
@@ -77,7 +69,7 @@ class DataFrameResultSet:
77
69
  def to_pandas(self) -> pd.DataFrame:
78
70
  return pd.DataFrame.from_records(self._rows, columns=self._col_names)
79
71
 
80
- def _row_to_dict(self, row_idx: int) -> Dict[str, Any]:
72
+ def _row_to_dict(self, row_idx: int) -> dict[str, Any]:
81
73
  return {self._col_names[i]: self._rows[row_idx][i] for i in range(len(self._col_names))}
82
74
 
83
75
  def __getitem__(self, index: Any) -> Any:
@@ -111,22 +103,22 @@ class DataFrameResultSet:
111
103
  # def __init__(self, tbl: catalog.TableVersion):
112
104
  # self.tbl = tbl
113
105
  # # output of the SQL scan stage
114
- # self.sql_scan_output_exprs: List[exprs.Expr] = []
106
+ # self.sql_scan_output_exprs: list[exprs.Expr] = []
115
107
  # # output of the agg stage
116
- # self.agg_output_exprs: List[exprs.Expr] = []
108
+ # self.agg_output_exprs: list[exprs.Expr] = []
117
109
  # # Where clause of the Select stmt of the SQL scan stage
118
110
  # self.sql_where_clause: Optional[sql.ClauseElement] = None
119
111
  # # filter predicate applied to input rows of the SQL scan stage
120
112
  # self.filter: Optional[exprs.Predicate] = None
121
113
  # self.similarity_clause: Optional[exprs.ImageSimilarityPredicate] = None
122
- # self.agg_fn_calls: List[exprs.FunctionCall] = [] # derived from unique_exprs
114
+ # self.agg_fn_calls: list[exprs.FunctionCall] = [] # derived from unique_exprs
123
115
  # self.has_frame_col: bool = False # True if we're referencing the frame col
124
116
  #
125
117
  # self.evaluator: Optional[exprs.Evaluator] = None
126
- # self.sql_scan_eval_ctx: List[exprs.Expr] = [] # needed to materialize output of SQL scan stage
127
- # self.agg_eval_ctx: List[exprs.Expr] = [] # needed to materialize output of agg stage
128
- # self.filter_eval_ctx: List[exprs.Expr] = []
129
- # self.group_by_eval_ctx: List[exprs.Expr] = []
118
+ # self.sql_scan_eval_ctx: list[exprs.Expr] = [] # needed to materialize output of SQL scan stage
119
+ # self.agg_eval_ctx: list[exprs.Expr] = [] # needed to materialize output of agg stage
120
+ # self.filter_eval_ctx: list[exprs.Expr] = []
121
+ # self.group_by_eval_ctx: list[exprs.Expr] = []
130
122
  #
131
123
  # def finalize_exec(self) -> None:
132
124
  # """
@@ -142,11 +134,11 @@ class DataFrame:
142
134
  def __init__(
143
135
  self,
144
136
  tbl: catalog.TableVersionPath,
145
- select_list: Optional[List[Tuple[exprs.Expr, Optional[str]]]] = None,
137
+ select_list: Optional[list[tuple[exprs.Expr, Optional[str]]]] = None,
146
138
  where_clause: Optional[exprs.Expr] = None,
147
- group_by_clause: Optional[List[exprs.Expr]] = None,
139
+ group_by_clause: Optional[list[exprs.Expr]] = None,
148
140
  grouping_tbl: Optional[catalog.TableVersion] = None,
149
- order_by_clause: Optional[List[Tuple[exprs.Expr, bool]]] = None, # List[(expr, asc)]
141
+ order_by_clause: Optional[list[tuple[exprs.Expr, bool]]] = None, # list[(expr, asc)]
150
142
  limit: Optional[int] = None,
151
143
  ):
152
144
  self.tbl = tbl
@@ -174,7 +166,7 @@ class DataFrame:
174
166
  @classmethod
175
167
  def _select_list_check_rep(
176
168
  cls,
177
- select_list: Optional[List[Tuple[exprs.Expr, Optional[str]]]],
169
+ select_list: Optional[list[tuple[exprs.Expr, Optional[str]]]],
178
170
  ) -> None:
179
171
  """Validate basic select list types."""
180
172
  if select_list is None: # basic check for valid select list
@@ -371,15 +363,10 @@ class DataFrame:
371
363
  group_by_clause=group_by_clause, grouping_tbl=self.grouping_tbl,
372
364
  order_by_clause=order_by_clause, limit=self.limit_val)
373
365
 
374
- def collect(self) -> DataFrameResultSet:
375
- return self._collect()
376
-
377
- def _collect(self, conn: Optional[sql.engine.Connection] = None) -> DataFrameResultSet:
366
+ def _output_row_iterator(self, conn: Optional[sql.engine.Connection] = None) -> Iterator[list]:
378
367
  try:
379
- result_rows = []
380
368
  for data_row in self._exec(conn):
381
- result_row = [data_row[e.slot_idx] for e in self._select_list_exprs]
382
- result_rows.append(result_row)
369
+ yield [data_row[e.slot_idx] for e in self._select_list_exprs]
383
370
  except excs.ExprEvalError as e:
384
371
  msg = f'In row {e.row_num} the {e.expr_msg} encountered exception ' f'{type(e.exc).__name__}:\n{str(e.exc)}'
385
372
  if len(e.input_vals) > 0:
@@ -399,7 +386,11 @@ class DataFrame:
399
386
  except sql.exc.DBAPIError as e:
400
387
  raise excs.Error(f'Error during SQL execution:\n{e}')
401
388
 
402
- return DataFrameResultSet(result_rows, self.schema)
389
+ def collect(self) -> DataFrameResultSet:
390
+ return self._collect()
391
+
392
+ def _collect(self, conn: Optional[sql.engine.Connection] = None) -> DataFrameResultSet:
393
+ return DataFrameResultSet(list(self._output_row_iterator(conn)), self.schema)
403
394
 
404
395
  def count(self) -> int:
405
396
  from pixeltable.plan import Planner
@@ -412,8 +403,8 @@ class DataFrame:
412
403
 
413
404
  def _description(self) -> pd.DataFrame:
414
405
  """see DataFrame.describe()"""
415
- heading_vals: List[str] = []
416
- info_vals: List[str] = []
406
+ heading_vals: list[str] = []
407
+ info_vals: list[str] = []
417
408
  if self.select_list is not None:
418
409
  assert len(self.select_list) > 0
419
410
  heading_vals.append('Select')
@@ -498,7 +489,7 @@ class DataFrame:
498
489
 
499
490
  # check user provided names do not conflict among themselves
500
491
  # or with auto-generated ones
501
- seen: Set[str] = set()
492
+ seen: set[str] = set()
502
493
  _, names = DataFrame._normalize_select_list(self.tbl, select_list)
503
494
  for name in names:
504
495
  if name in seen:
@@ -541,7 +532,7 @@ class DataFrame:
541
532
  if self.group_by_clause is not None:
542
533
  raise excs.Error(f'Group-by already specified')
543
534
  grouping_tbl: Optional[catalog.TableVersion] = None
544
- group_by_clause: Optional[List[exprs.Expr]] = None
535
+ group_by_clause: Optional[list[exprs.Expr]] = None
545
536
  for item in grouping_items:
546
537
  if isinstance(item, catalog.Table):
547
538
  if len(grouping_items) > 1:
@@ -619,7 +610,7 @@ class DataFrame:
619
610
  def __getitem__(self, index: Union[exprs.Expr, Sequence[exprs.Expr]]) -> DataFrame:
620
611
  """
621
612
  Allowed:
622
- - [List[Expr]]/[Tuple[Expr]]: setting the select list
613
+ - [list[Expr]]/[tuple[Expr]]: setting the select list
623
614
  - [Expr]: setting a single-col select list
624
615
  """
625
616
  if isinstance(index, exprs.Expr):
@@ -628,7 +619,7 @@ class DataFrame:
628
619
  return self.select(*index)
629
620
  raise TypeError(f'Invalid index type: {type(index)}')
630
621
 
631
- def as_dict(self) -> Dict[str, Any]:
622
+ def as_dict(self) -> dict[str, Any]:
632
623
  """
633
624
  Returns:
634
625
  Dictionary representing this dataframe.
@@ -650,7 +641,7 @@ class DataFrame:
650
641
  return d
651
642
 
652
643
  @classmethod
653
- def from_dict(cls, d: Dict[str, Any]) -> 'DataFrame':
644
+ def from_dict(cls, d: dict[str, Any]) -> 'DataFrame':
654
645
  tbl = catalog.TableVersionPath.from_dict(d['tbl'])
655
646
  select_list = [(exprs.Expr.from_dict(e), name) for e, name in d['select_list']] \
656
647
  if d['select_list'] is not None else None
pixeltable/env.py CHANGED
@@ -494,18 +494,25 @@ class Env:
494
494
  self.__register_package('anthropic')
495
495
  self.__register_package('boto3')
496
496
  self.__register_package('datasets')
497
+ self.__register_package('fiftyone')
497
498
  self.__register_package('fireworks', library_name='fireworks-ai')
499
+ self.__register_package('huggingface_hub', library_name='huggingface-hub')
498
500
  self.__register_package('label_studio_sdk', library_name='label-studio-sdk')
501
+ self.__register_package('llama_cpp', library_name='llama-cpp-python')
499
502
  self.__register_package('mistralai')
500
503
  self.__register_package('mistune')
504
+ self.__register_package('ollama')
501
505
  self.__register_package('openai')
502
506
  self.__register_package('openpyxl')
503
507
  self.__register_package('pyarrow')
508
+ self.__register_package('replicate')
509
+ self.__register_package('sentencepiece')
504
510
  self.__register_package('sentence_transformers', library_name='sentence-transformers')
505
511
  self.__register_package('spacy')
506
512
  self.__register_package('tiktoken')
507
513
  self.__register_package('together')
508
514
  self.__register_package('torch')
515
+ self.__register_package('torchaudio')
509
516
  self.__register_package('torchvision')
510
517
  self.__register_package('transformers')
511
518
  self.__register_package('whisper', library_name='openai-whisper')
@@ -6,6 +6,5 @@ from .exec_context import ExecContext
6
6
  from .exec_node import ExecNode
7
7
  from .expr_eval_node import ExprEvalNode
8
8
  from .in_memory_data_node import InMemoryDataNode
9
- from .media_validation_node import MediaValidationNode
10
9
  from .row_update_node import RowUpdateNode
11
10
  from .sql_node import SqlLookupNode, SqlScanNode, SqlAggregationNode, SqlNode
@@ -2,11 +2,12 @@ from __future__ import annotations
2
2
 
3
3
  import logging
4
4
  import sys
5
- from typing import Iterable, Optional, Any, Iterator
5
+ from typing import Any, Iterable, Iterator, Optional, cast
6
6
 
7
7
  import pixeltable.catalog as catalog
8
8
  import pixeltable.exceptions as excs
9
9
  import pixeltable.exprs as exprs
10
+
10
11
  from .data_row_batch import DataRowBatch
11
12
  from .exec_node import ExecNode
12
13
 
@@ -28,13 +29,15 @@ class AggregationNode(ExecNode):
28
29
  self, tbl: catalog.TableVersion, row_builder: exprs.RowBuilder, group_by: Optional[list[exprs.Expr]],
29
30
  agg_fn_calls: list[exprs.FunctionCall], input_exprs: Iterable[exprs.Expr], input: ExecNode
30
31
  ):
31
- super().__init__(row_builder, group_by + agg_fn_calls, input_exprs, input)
32
+ output_exprs: list[exprs.Expr] = [] if group_by is None else list(group_by)
33
+ output_exprs.extend(agg_fn_calls)
34
+ super().__init__(row_builder, output_exprs, input_exprs, input)
32
35
  self.input = input
33
36
  self.group_by = group_by
34
37
  self.input_exprs = list(input_exprs)
35
38
  self.agg_fn_eval_ctx = row_builder.create_eval_ctx(agg_fn_calls, exclude=self.input_exprs)
36
39
  # we need to make sure to refer to the same exprs that RowBuilder.eval() will use
37
- self.agg_fn_calls = self.agg_fn_eval_ctx.target_exprs
40
+ self.agg_fn_calls = [cast(exprs.FunctionCall, e) for e in self.agg_fn_eval_ctx.target_exprs]
38
41
  # create output_batch here, rather than in __iter__(), so we don't need to remember tbl and row_builder
39
42
  self.output_batch = DataRowBatch(tbl, row_builder, 0)
40
43