pixeltable 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (139) hide show
  1. pixeltable/__init__.py +34 -6
  2. pixeltable/catalog/__init__.py +13 -0
  3. pixeltable/catalog/catalog.py +159 -0
  4. pixeltable/catalog/column.py +200 -0
  5. pixeltable/catalog/dir.py +32 -0
  6. pixeltable/catalog/globals.py +33 -0
  7. pixeltable/catalog/insertable_table.py +191 -0
  8. pixeltable/catalog/named_function.py +36 -0
  9. pixeltable/catalog/path.py +58 -0
  10. pixeltable/catalog/path_dict.py +139 -0
  11. pixeltable/catalog/schema_object.py +39 -0
  12. pixeltable/catalog/table.py +581 -0
  13. pixeltable/catalog/table_version.py +749 -0
  14. pixeltable/catalog/table_version_path.py +133 -0
  15. pixeltable/catalog/view.py +203 -0
  16. pixeltable/client.py +520 -30
  17. pixeltable/dataframe.py +540 -349
  18. pixeltable/env.py +373 -45
  19. pixeltable/exceptions.py +12 -21
  20. pixeltable/exec/__init__.py +9 -0
  21. pixeltable/exec/aggregation_node.py +78 -0
  22. pixeltable/exec/cache_prefetch_node.py +113 -0
  23. pixeltable/exec/component_iteration_node.py +79 -0
  24. pixeltable/exec/data_row_batch.py +95 -0
  25. pixeltable/exec/exec_context.py +22 -0
  26. pixeltable/exec/exec_node.py +61 -0
  27. pixeltable/exec/expr_eval_node.py +217 -0
  28. pixeltable/exec/in_memory_data_node.py +69 -0
  29. pixeltable/exec/media_validation_node.py +43 -0
  30. pixeltable/exec/sql_scan_node.py +225 -0
  31. pixeltable/exprs/__init__.py +24 -0
  32. pixeltable/exprs/arithmetic_expr.py +102 -0
  33. pixeltable/exprs/array_slice.py +71 -0
  34. pixeltable/exprs/column_property_ref.py +77 -0
  35. pixeltable/exprs/column_ref.py +105 -0
  36. pixeltable/exprs/comparison.py +77 -0
  37. pixeltable/exprs/compound_predicate.py +98 -0
  38. pixeltable/exprs/data_row.py +187 -0
  39. pixeltable/exprs/expr.py +586 -0
  40. pixeltable/exprs/expr_set.py +39 -0
  41. pixeltable/exprs/function_call.py +380 -0
  42. pixeltable/exprs/globals.py +69 -0
  43. pixeltable/exprs/image_member_access.py +115 -0
  44. pixeltable/exprs/image_similarity_predicate.py +58 -0
  45. pixeltable/exprs/inline_array.py +107 -0
  46. pixeltable/exprs/inline_dict.py +101 -0
  47. pixeltable/exprs/is_null.py +38 -0
  48. pixeltable/exprs/json_mapper.py +121 -0
  49. pixeltable/exprs/json_path.py +159 -0
  50. pixeltable/exprs/literal.py +54 -0
  51. pixeltable/exprs/object_ref.py +41 -0
  52. pixeltable/exprs/predicate.py +44 -0
  53. pixeltable/exprs/row_builder.py +355 -0
  54. pixeltable/exprs/rowid_ref.py +94 -0
  55. pixeltable/exprs/type_cast.py +53 -0
  56. pixeltable/exprs/variable.py +45 -0
  57. pixeltable/func/__init__.py +9 -0
  58. pixeltable/func/aggregate_function.py +194 -0
  59. pixeltable/func/batched_function.py +53 -0
  60. pixeltable/func/callable_function.py +69 -0
  61. pixeltable/func/expr_template_function.py +82 -0
  62. pixeltable/func/function.py +110 -0
  63. pixeltable/func/function_registry.py +227 -0
  64. pixeltable/func/globals.py +36 -0
  65. pixeltable/func/nos_function.py +202 -0
  66. pixeltable/func/signature.py +166 -0
  67. pixeltable/func/udf.py +163 -0
  68. pixeltable/functions/__init__.py +52 -103
  69. pixeltable/functions/eval.py +216 -0
  70. pixeltable/functions/fireworks.py +61 -0
  71. pixeltable/functions/huggingface.py +120 -0
  72. pixeltable/functions/image.py +16 -0
  73. pixeltable/functions/openai.py +88 -0
  74. pixeltable/functions/pil/image.py +148 -7
  75. pixeltable/functions/string.py +13 -0
  76. pixeltable/functions/together.py +27 -0
  77. pixeltable/functions/util.py +41 -0
  78. pixeltable/functions/video.py +62 -0
  79. pixeltable/iterators/__init__.py +3 -0
  80. pixeltable/iterators/base.py +48 -0
  81. pixeltable/iterators/document.py +311 -0
  82. pixeltable/iterators/video.py +89 -0
  83. pixeltable/metadata/__init__.py +54 -0
  84. pixeltable/metadata/converters/convert_10.py +18 -0
  85. pixeltable/metadata/schema.py +211 -0
  86. pixeltable/plan.py +656 -0
  87. pixeltable/store.py +413 -182
  88. pixeltable/tests/conftest.py +143 -87
  89. pixeltable/tests/test_audio.py +65 -0
  90. pixeltable/tests/test_catalog.py +27 -0
  91. pixeltable/tests/test_client.py +14 -14
  92. pixeltable/tests/test_component_view.py +372 -0
  93. pixeltable/tests/test_dataframe.py +433 -0
  94. pixeltable/tests/test_dirs.py +78 -62
  95. pixeltable/tests/test_document.py +117 -0
  96. pixeltable/tests/test_exprs.py +591 -135
  97. pixeltable/tests/test_function.py +297 -67
  98. pixeltable/tests/test_functions.py +283 -1
  99. pixeltable/tests/test_migration.py +43 -0
  100. pixeltable/tests/test_nos.py +54 -0
  101. pixeltable/tests/test_snapshot.py +208 -0
  102. pixeltable/tests/test_table.py +1085 -262
  103. pixeltable/tests/test_transactional_directory.py +42 -0
  104. pixeltable/tests/test_types.py +5 -11
  105. pixeltable/tests/test_video.py +149 -34
  106. pixeltable/tests/test_view.py +530 -0
  107. pixeltable/tests/utils.py +186 -45
  108. pixeltable/tool/create_test_db_dump.py +149 -0
  109. pixeltable/type_system.py +490 -126
  110. pixeltable/utils/__init__.py +17 -46
  111. pixeltable/utils/clip.py +12 -15
  112. pixeltable/utils/coco.py +136 -0
  113. pixeltable/utils/documents.py +39 -0
  114. pixeltable/utils/filecache.py +195 -0
  115. pixeltable/utils/help.py +11 -0
  116. pixeltable/utils/media_store.py +76 -0
  117. pixeltable/utils/parquet.py +126 -0
  118. pixeltable/utils/pytorch.py +172 -0
  119. pixeltable/utils/s3.py +13 -0
  120. pixeltable/utils/sql.py +17 -0
  121. pixeltable/utils/transactional_directory.py +35 -0
  122. pixeltable-0.2.0.dist-info/LICENSE +18 -0
  123. pixeltable-0.2.0.dist-info/METADATA +117 -0
  124. pixeltable-0.2.0.dist-info/RECORD +125 -0
  125. {pixeltable-0.1.1.dist-info → pixeltable-0.2.0.dist-info}/WHEEL +1 -1
  126. pixeltable/catalog.py +0 -1421
  127. pixeltable/exprs.py +0 -1745
  128. pixeltable/function.py +0 -269
  129. pixeltable/functions/clip.py +0 -10
  130. pixeltable/functions/pil/__init__.py +0 -23
  131. pixeltable/functions/tf.py +0 -21
  132. pixeltable/index.py +0 -57
  133. pixeltable/tests/test_dict.py +0 -24
  134. pixeltable/tests/test_tf.py +0 -69
  135. pixeltable/tf.py +0 -33
  136. pixeltable/utils/tf.py +0 -33
  137. pixeltable/utils/video.py +0 -32
  138. pixeltable-0.1.1.dist-info/METADATA +0 -31
  139. pixeltable-0.1.1.dist-info/RECORD +0 -36
@@ -0,0 +1,749 @@
1
+ from __future__ import annotations
2
+
3
+ import dataclasses
4
+ import importlib
5
+ import inspect
6
+ import logging
7
+ import time
8
+ from typing import Optional, List, Dict, Any, Tuple, Type, Set
9
+ from uuid import UUID
10
+
11
+ import sqlalchemy as sql
12
+ import sqlalchemy.orm as orm
13
+
14
+ import pixeltable
15
+ import pixeltable.func as func
16
+ from pixeltable import exceptions as excs
17
+ from pixeltable.env import Env
18
+ from pixeltable.iterators import ComponentIterator
19
+ from pixeltable.metadata import schema
20
+ from pixeltable.utils.filecache import FileCache
21
+ from pixeltable.utils.media_store import MediaStore
22
+ from .column import Column
23
+ from .globals import UpdateStatus, POS_COLUMN_NAME, is_valid_identifier
24
+
25
+ _logger = logging.getLogger('pixeltable')
26
+
27
+ class TableVersion:
28
+ """
29
+ TableVersion represents a particular version of a table/view along with its store table:
30
+ - the version can be mutable or a snapshot
31
+ - tables and their recursive views form a tree, and a mutable TableVersion also records its own
32
+ mutable views in order to propagate updates
33
+ - each view TableVersion records its base:
34
+ * the base is correct only for mutable views (snapshot versions form a DAG, not a tree)
35
+ * the base is useful for getting access to the StoreTable and the base id
36
+ * TODO: create a separate hierarchy of objects that records the version-independent tree of tables/views, and
37
+ have TableVersions reference those
38
+ - mutable TableVersions record their TableVersionPath, which is needed for expr evaluation in updates
39
+ """
40
+
41
def __init__(
        self, id: UUID, tbl_md: schema.TableMd, version: int, schema_version_md: schema.TableSchemaVersionMd,
        base: Optional[TableVersion] = None, base_path: Optional['pixeltable.catalog.TableVersionPath'] = None,
        is_snapshot: Optional[bool] = None
):
    """Instantiate a TableVersion from stored metadata and register it with the Catalog.

    Args:
        id: id of the table/view this version belongs to
        tbl_md: table-level metadata (name, view metadata, id counters, column history)
        version: version number represented by this instance
        schema_version_md: metadata of the schema version in effect (columns, comment, num_retained_versions)
        base: base TableVersion; mutually exclusive with base_path
        base_path: TableVersionPath of the base; its tbl_version becomes self.base; mutually exclusive with base
        is_snapshot: if truthy, forces this instance to be a snapshot regardless of tbl_md.view_md
    """
    # only one of base and base_path can be non-None
    assert base is None or base_path is None
    self.id = id
    self.name = tbl_md.name
    self.version = version
    self.comment = schema_version_md.comment
    self.num_retained_versions = schema_version_md.num_retained_versions
    self.schema_version = schema_version_md.schema_version
    self.view_md = tbl_md.view_md  # save this as-is, it's needed for _create_md()
    is_view = tbl_md.view_md is not None
    # snapshot if the view metadata says so, or if the caller explicitly requested it
    self.is_snapshot = (is_view and tbl_md.view_md.is_snapshot) or bool(is_snapshot)
    # a mutable TableVersion doesn't have a static version
    self.effective_version = self.version if self.is_snapshot else None

    # mutable tables need their TableVersionPath for expr eval during updates
    # (imported locally rather than at module level)
    from .table_version_path import TableVersionPath
    if self.is_snapshot:
        self.path = None
    else:
        self.path = TableVersionPath(self, base=base_path) if base_path is not None else TableVersionPath(self)

    self.base = base_path.tbl_version if base_path is not None else base
    if self.is_snapshot:
        # -1 marks the id counters as unused for snapshots
        self.next_col_id = -1
        self.next_rowid = -1
    else:
        # a mutable TableVersion must be instantiated at the table's current version
        assert tbl_md.current_version == self.version
        self.next_col_id = tbl_md.next_col_id
        self.next_rowid = tbl_md.next_row_id
    self.column_history = tbl_md.column_history

    # view-specific initialization
    from pixeltable import exprs
    predicate_dict = None if not is_view or tbl_md.view_md.predicate is None else tbl_md.view_md.predicate
    self.predicate = exprs.Expr.from_dict(predicate_dict) if predicate_dict is not None else None
    self.mutable_views: List[TableVersion] = []  # targets for update propagation
    # only a mutable view of a mutable base registers itself for update propagation
    if self.base is not None and not self.base.is_snapshot and not self.is_snapshot:
        self.base.mutable_views.append(self)

    # component view-specific initialization
    self.iterator_cls: Optional[Type[ComponentIterator]] = None
    self.iterator_args: Optional[exprs.InlineDict] = None
    self.num_iterator_cols = 0
    if is_view and tbl_md.view_md.iterator_class_fqn is not None:
        # resolve the iterator class from its fully-qualified name
        module_name, class_name = tbl_md.view_md.iterator_class_fqn.rsplit('.', 1)
        module = importlib.import_module(module_name)
        self.iterator_cls = getattr(module, class_name)
        self.iterator_args = exprs.Expr.from_dict(tbl_md.view_md.iterator_args)
        assert isinstance(self.iterator_args, exprs.InlineDict)
        output_schema, _ = self.iterator_cls.output_schema(**self.iterator_args.to_dict())
        self.num_iterator_cols = len(output_schema)
        # NOTE(review): this check runs after iterator_args has already been used above
        assert tbl_md.view_md.iterator_args is not None

    # register this table version now so that it's available when we're re-creating value exprs
    import pixeltable.catalog as catalog
    cat = catalog.Catalog.get()
    cat.tbl_versions[(self.id, self.effective_version)] = self

    # do this after we determined whether we're a component view, and before we create the store table
    self._init_schema(schema_version_md)
106
+
107
+ def __hash__(self) -> int:
108
+ return hash(self.id)
109
+
110
def create_snapshot_copy(self) -> TableVersion:
    """Build a snapshot TableVersion from the current state of this mutable TableVersion.

    The copy shares this instance's id, version and base, and is flagged as a snapshot.
    """
    assert not self.is_snapshot
    tbl_md = self._create_md()
    # preceding_schema_version: dummy value
    schema_version_md = self._create_schema_version_md(preceding_schema_version=0)
    snapshot = TableVersion(self.id, tbl_md, self.version, schema_version_md, is_snapshot=True, base=self.base)
    return snapshot
117
+
118
@classmethod
def create(
        cls, session: orm.Session, dir_id: UUID, name: str, cols: List[Column], num_retained_versions: int, comment: str,
        base_path: Optional['pixeltable.catalog.TableVersionPath'] = None, view_md: Optional[schema.ViewMd] = None
) -> Tuple[UUID, Optional[TableVersion]]:
    """Create the metadata records (and, if needed, the store table) for a new table or view.

    Adds a schema.Table, a schema.TableVersion and a schema.TableSchemaVersion record to the session, all at
    version 0 / schema version 0.

    Args:
        session: session the new records are added to; also supplies the connection for creating the store table
        dir_id: id of the directory that contains the new table
        name: name of the new table
        cols: columns of the new table; their ids are assigned here, in list order
        num_retained_versions: recorded in the schema version metadata
        comment: recorded in the schema version metadata
        base_path: TableVersionPath of the base, if this is a view
        view_md: view metadata, if this is a view

    Returns:
        (id of the new table record, new TableVersion); the TableVersion is None for a pure snapshot, for which
        no physical table is created
    """
    # assign ids
    cols_by_name: Dict[str, Column] = {}
    for pos, col in enumerate(cols):
        col.id = pos
        cols_by_name[col.name] = col
        if col.value_expr is None and col.compute_func is not None:
            cls._create_value_expr(col, base_path)
        if col.is_computed:
            col.check_value_expr()

    ts = time.time()
    # create schema.Table
    column_history = {
        col.id: schema.ColumnHistory(col_id=col.id, schema_version_add=0, schema_version_drop=None)
        for col in cols
    }
    table_md = schema.TableMd(
        name=name, current_version=0, current_schema_version=0,
        next_col_id=len(cols), next_row_id=0, column_history=column_history,
        view_md=view_md)
    tbl_record = schema.Table(dir_id=dir_id, md=dataclasses.asdict(table_md))
    session.add(tbl_record)
    session.flush()  # sets tbl_record.id
    assert tbl_record.id is not None

    # create schema.TableVersion
    table_version_md = schema.TableVersionMd(created_at=ts, version=0, schema_version=0)
    tbl_version_record = schema.TableVersion(
        tbl_id=tbl_record.id, version=0, md=dataclasses.asdict(table_version_md))
    session.add(tbl_version_record)

    # create schema.TableSchemaVersion
    column_md: Dict[int, schema.SchemaColumn] = {}
    for pos, col in enumerate(cols):
        # Column.dependent_cols for existing cols is wrong at this point, but init() will set it correctly
        value_expr_dict = col.value_expr.as_dict() if col.value_expr is not None else None
        column_md[col.id] = schema.SchemaColumn(
            pos=pos, name=col.name, col_type=col.col_type.as_dict(),
            is_pk=col.primary_key, value_expr=value_expr_dict, stored=col.stored, is_indexed=col.is_indexed)

    schema_version_md = schema.TableSchemaVersionMd(
        schema_version=0, preceding_schema_version=None, columns=column_md,
        num_retained_versions=num_retained_versions, comment=comment)
    schema_version_record = schema.TableSchemaVersion(
        tbl_id=tbl_record.id, schema_version=0, md=dataclasses.asdict(schema_version_md))
    session.add(schema_version_record)

    # if this is purely a snapshot (it doesn't require any additional storage for columns and it doesn't have a
    # predicate to apply at runtime), we don't create a physical table and simply use the base's table version path
    if view_md is not None and view_md.is_snapshot and view_md.predicate is None and len(cols) == 0:
        return tbl_record.id, None

    # a base path must be given exactly when view metadata is given
    assert (base_path is not None) == (view_md is not None)
    # snapshot views pass the base as a TableVersion, mutable views pass it as a TableVersionPath
    # (the constructor accepts only one of the two)
    base = base_path.tbl_version if base_path is not None and view_md.is_snapshot else None
    base_path = base_path if base_path is not None and not view_md.is_snapshot else None
    tbl_version = cls(tbl_record.id, table_md, 0, schema_version_md, base=base, base_path=base_path)
    tbl_version.store_tbl.create(session.connection())
    # TODO: create pgvector indices
    return tbl_record.id, tbl_version
182
+
183
@classmethod
def delete_md(cls, tbl_id: UUID, conn: sql.Connection) -> None:
    """Delete all metadata records of the table with the given id.

    Schema-version and version records are removed first, then the table record itself.
    """
    # (table to delete from, column that identifies the table's rows)
    targets = (
        (schema.TableSchemaVersion.__table__, schema.TableSchemaVersion.tbl_id),
        (schema.TableVersion.__table__, schema.TableVersion.tbl_id),
        (schema.Table.__table__, schema.Table.id),
    )
    for sa_tbl, id_col in targets:
        stmt = sql.delete(sa_tbl).where(id_col == tbl_id)
        conn.execute(stmt)
190
+
191
def drop(self) -> None:
    """Delete this table's media files, file cache entries, metadata and store table, then unregister it."""
    tbl_id = self.id
    with Env.get().engine.begin() as conn:
        # delete this table and all associated data
        MediaStore.delete(tbl_id)
        FileCache.get().clear(tbl_id=tbl_id)
        self.delete_md(tbl_id, conn)
        self.store_tbl.drop(conn)

    # de-register table version from catalog
    from .catalog import Catalog
    registry_key = (self.id, self.effective_version)
    del Catalog.get().tbl_versions[registry_key]
    # TODO: remove from tbl_dependents
204
+
205
def _init_schema(self, schema_version_md: schema.TableSchemaVersionMd) -> None:
    """Populate self.cols (plus the by-name/by-id lookups) and create self.store_tbl."""
    col_mds = schema_version_md.columns
    self.cols = [Column.from_md(col_id, col_md, self) for col_id, col_md in col_mds.items()]
    self.cols_by_name = {c.name: c for c in self.cols}
    self.cols_by_id = {c.id: c for c in self.cols}

    # make sure to traverse columns ordered by position = order in which cols were created;
    # this guarantees that references always point backwards
    from pixeltable import exprs
    for column, column_md in zip(self.cols, col_mds.values()):
        column.tbl = self
        if column_md.value_expr is None:
            continue
        column.value_expr = exprs.Expr.from_dict(column_md.value_expr)
        self._record_value_expr(column)

    # create the sqlalchemy schema; do this after instantiating columns, in order to determine whether they
    # need to record errors
    from pixeltable.store import StoreBase, StoreTable, StoreView, StoreComponentView
    if self.is_component_view():
        store_cls = StoreComponentView
    elif self.is_view():
        store_cls = StoreView
    else:
        store_cls = StoreTable
    self.store_tbl: StoreBase = store_cls(self)
229
+
230
def _update_md(
        self, ts: float, preceding_schema_version: Optional[int], conn: sql.engine.Connection) -> None:
    """Persist the current metadata after a data or schema change.

    Overwrites the table record's md, inserts a new version record and, for schema changes, a new schema
    version record.

    Args:
        ts: timestamp of the change
        preceding_schema_version: last schema version if schema change, else None
        conn: connection used to execute the statements
    """
    # overwrite the table-level metadata
    tbl_md_dict = dataclasses.asdict(self._create_md())
    update_stmt = (
        sql.update(schema.Table.__table__)
        .values({schema.Table.md: tbl_md_dict})
        .where(schema.Table.id == self.id)
    )
    conn.execute(update_stmt)

    # every change produces a new version record
    version_md = self._create_version_md(ts)
    insert_version_stmt = sql.insert(schema.TableVersion.__table__).values(
        tbl_id=self.id, version=self.version, md=dataclasses.asdict(version_md))
    conn.execute(insert_version_stmt)

    if preceding_schema_version is None:
        # data-only change: no new schema version record
        return

    schema_version_md = self._create_schema_version_md(preceding_schema_version)
    insert_schema_stmt = sql.insert(schema.TableSchemaVersion.__table__).values(
        tbl_id=self.id, schema_version=self.schema_version, md=dataclasses.asdict(schema_version_md))
    conn.execute(insert_schema_stmt)
252
+
253
def add_column(self, col: Column, print_stats: bool = False) -> UpdateStatus:
    """Adds a column to the table.

    Creates a new table version and schema version, records the metadata and, for stored columns, adds the
    column to the store table. If the table already has rows and the column is computed-and-stored or indexed,
    the column values are then loaded in a second transaction.

    Args:
        col: the column to add; its name must be a valid identifier that is not yet in use
        print_stats: if True, print the execution profile of the plan that computed the column values

    Returns:
        UpdateStatus describing the number of rows, computed values and exceptions

    Raises:
        excs.Error: if the column is non-nullable and not computed but the table already has rows, or if
            loading the column values fails with a DBAPIError (in which case the column is dropped again)
    """
    assert not self.is_snapshot
    assert is_valid_identifier(col.name)
    assert col.stored is not None
    assert col.name not in self.cols_by_name
    col.tbl = self
    col.id = self.next_col_id
    self.next_col_id += 1

    if col.compute_func is not None:
        # create value_expr from compute_func
        self._create_value_expr(col, self.path)
    if col.value_expr is not None:
        col.check_value_expr()
        self._record_value_expr(col)

    row_count = self.store_tbl.count()
    # NOTE(review): this check fails after next_col_id has already been incremented above
    if row_count > 0 and not col.col_type.nullable and not col.is_computed:
        raise excs.Error(f'Cannot add non-nullable column "{col.name}" to table {self.name} with existing rows')

    # we're creating a new schema version
    ts = time.time()
    self.version += 1
    preceding_schema_version = self.schema_version
    self.schema_version = self.version

    self.cols.append(col)
    self.cols_by_name[col.name] = col
    self.cols_by_id[col.id] = col
    self.column_history[col.id] = schema.ColumnHistory(col.id, self.schema_version, None)

    # first transaction: metadata and (for stored columns) the physical column
    with Env.get().engine.begin() as conn:
        self._update_md(ts, preceding_schema_version, conn)
        _logger.info(f'Added column {col.name} to table {self.name}, new version: {self.version}')
        if col.is_stored:
            self.store_tbl.add_column(col, conn)

    print(f'Added column `{col.name}` to table `{self.name}`.')
    if row_count == 0:
        return UpdateStatus()
    # nothing to compute unless the column is computed and stored, or indexed
    if (not col.is_computed or not col.is_stored) and not col.is_indexed:
        return UpdateStatus(num_rows=row_count)
    # compute values for the existing rows and compute embeddings, if this column is indexed;
    # for some reason, it's not possible to run the following updates in the same transaction as the one
    # that we just used to create the metadata (sqlalchemy hangs when exec() tries to run the query)
    from pixeltable.plan import Planner
    plan, value_expr_slot_idx, embedding_slot_idx = Planner.create_add_column_plan(self.path, col)
    plan.ctx.num_rows = row_count
    # TODO: create pgvector index, if col is indexed

    try:
        # TODO: do this in the same transaction as the metadata update
        with Env.get().engine.begin() as conn:
            plan.ctx.conn = conn
            plan.open()
            num_excs = self.store_tbl.load_column(col, plan, value_expr_slot_idx, embedding_slot_idx, conn)
    except sql.exc.DBAPIError as e:
        # the metadata change is already committed, so undo it by dropping the column again
        self.drop_column(col.name)
        raise excs.Error(f'Error during SQL execution:\n{e}')
    finally:
        plan.close()

    msg = f'Added {row_count} column value{"" if row_count == 1 else "s"} with {num_excs} error{"" if num_excs == 1 else "s"}.'
    print(msg)
    _logger.info(f'Column {col.name}: {msg}')
    if print_stats:
        plan.ctx.profile.print(num_rows=row_count)
    return UpdateStatus(
        num_rows=row_count, num_computed_values=row_count, num_excs=num_excs,
        cols_with_excs=[f'{self.name}.{col.name}'] if num_excs > 0 else [])
325
+
326
def drop_column(self, name: str) -> None:
    """Drop a column from the table.

    Creates a new table version and schema version, removes the column from the in-memory lookups, marks it
    as dropped in the column history and records the new metadata.

    Args:
        name: name of the column to drop

    Raises:
        excs.Error: if no column with that name exists, or if other columns depend on it
    """
    assert not self.is_snapshot
    if name not in self.cols_by_name:
        raise excs.Error(f'Unknown column: {name}')
    col = self.cols_by_name[name]
    if len(col.dependent_cols) > 0:
        # build ONE message string: a comma between the two f-strings would pass them as separate
        # exception arguments and the dependent column names would not be part of the message
        dependent_names = ', '.join(c.name for c in col.dependent_cols)
        raise excs.Error(
            f'Cannot drop column {name} because the following columns depend on it:\n'
            f'{dependent_names}')

    if col.value_expr is not None:
        # update Column.dependent_cols
        # (only columns positioned before col are visited: the loop stops when it reaches col)
        for c in self.cols:
            if c == col:
                break
            c.dependent_cols.discard(col)

    # we're creating a new schema version
    ts = time.time()
    self.version += 1
    preceding_schema_version = self.schema_version
    self.schema_version = self.version

    self.cols.remove(col)
    del self.cols_by_name[name]
    del self.cols_by_id[col.id]
    self.column_history[col.id].schema_version_drop = self.schema_version

    with Env.get().engine.begin() as conn:
        self._update_md(ts, preceding_schema_version, conn)
    # NOTE(review): called without column/connection, as in the original; confirm against
    # StoreBase.drop_column what this does for a stored column
    if col.is_stored:
        self.store_tbl.drop_column()
    _logger.info(f'Dropped column {name} from table {self.name}, new version: {self.version}')
361
+
362
def rename_column(self, old_name: str, new_name: str) -> None:
    """Rename a column.

    Creates a new table version and schema version and records the new metadata.

    Args:
        old_name: current name of the column
        new_name: new name; must be a valid identifier that is not yet in use

    Raises:
        excs.Error: if old_name is unknown, new_name is invalid, or new_name already exists
    """
    assert not self.is_snapshot
    by_name = self.cols_by_name
    if old_name not in by_name:
        raise excs.Error(f'Unknown column: {old_name}')
    if not is_valid_identifier(new_name):
        raise excs.Error(f"Invalid column name: '{new_name}'")
    if new_name in by_name:
        raise excs.Error(f'Column {new_name} already exists')

    # re-key the column under its new name
    renamed = by_name.pop(old_name)
    renamed.name = new_name
    by_name[new_name] = renamed

    # we're creating a new schema version
    ts = time.time()
    preceding_schema_version = self.schema_version
    self.version += 1
    self.schema_version = self.version

    with Env.get().engine.begin() as conn:
        self._update_md(ts, preceding_schema_version, conn)
    _logger.info(f'Renamed column {old_name} to {new_name} in table {self.name}, new version: {self.version}')
386
+
387
def set_comment(self, new_comment: Optional[str]):
    """Replace the table comment and commit the change as a new schema version."""
    log_msg = f'[{self.name}] Updating comment: {new_comment}'
    _logger.info(log_msg)
    self.comment = new_comment
    self._commit_new_schema_version()
391
+
392
def set_num_retained_versions(self, new_num_retained_versions: int):
    """Replace num_retained_versions and commit the change as a new schema version."""
    log_msg = f'[{self.name}] Updating num_retained_versions: {new_num_retained_versions} (was {self.num_retained_versions})'
    _logger.info(log_msg)
    self.num_retained_versions = new_num_retained_versions
    self._commit_new_schema_version()
396
+
397
def _commit_new_schema_version(self):
    """Bump the version, make it the current schema version and record the new metadata."""
    # we're creating a new schema version
    ts = time.time()
    preceding_schema_version = self.schema_version
    self.version += 1
    self.schema_version = self.version
    with Env.get().engine.begin() as conn:
        self._update_md(ts, preceding_schema_version, conn)
    _logger.info(f'[{self.name}] Updating table schema to version: {self.version}')
406
+
407
def insert(
        self, rows: List[Dict[str, Any]], print_stats: bool = False, fail_on_exception : bool = True
) -> UpdateStatus:
    """Insert rows into this table.

    Args:
        rows: the rows to insert, each a dict keyed by string
        print_stats: forwarded to _insert()
        fail_on_exception: if False, the insert plan is created with ignore_errors=True

    Returns:
        the UpdateStatus produced by _insert()
    """
    assert self.is_insertable()
    from pixeltable.plan import Planner
    ignore_errors = not fail_on_exception
    insert_plan = Planner.create_insert_plan(self, rows, ignore_errors=ignore_errors)
    ts = time.time()
    with Env.get().engine.begin() as conn:
        status = self._insert(insert_plan, conn, ts, print_stats)
    return status
418
+
419
def _insert(
        self, exec_plan: exec.ExecNode, conn: sql.engine.Connection, ts: float, print_stats: bool = False,
) -> UpdateStatus:
    """Insert rows produced by exec_plan and propagate to views

    Args:
        exec_plan: plan that produces the rows to insert into this table's store table
        conn: connection of the enclosing transaction
        ts: timestamp recorded for the new version
        print_stats: if True, print the execution profile of exec_plan

    Returns:
        UpdateStatus accumulated over this table and all of its mutable views
    """
    # we're creating a new version
    self.version += 1
    result = UpdateStatus()
    num_rows, num_excs, cols_with_excs = self.store_tbl.insert_rows(exec_plan, conn, v_min=self.version)
    self.next_rowid = num_rows
    result.num_rows = num_rows
    result.num_excs = num_excs
    result.num_computed_values += exec_plan.ctx.num_computed_exprs * num_rows
    result.cols_with_excs = [f'{self.name}.{self.cols_by_id[cid].name}' for cid in cols_with_excs]
    self._update_md(ts, None, conn)

    # update views
    for view in self.mutable_views:
        from pixeltable.plan import Planner
        view_plan, _ = Planner.create_view_load_plan(view.path, propagates_insert=True)
        status = view._insert(view_plan, conn, ts, print_stats)
        result.num_rows += status.num_rows
        result.num_excs += status.num_excs
        result.num_computed_values += status.num_computed_values
        result.cols_with_excs += status.cols_with_excs

    result.cols_with_excs = list(dict.fromkeys(result.cols_with_excs).keys())  # remove duplicates
    if print_stats:
        # report the profile of this table's own plan; the view plans are local to the loop above and
        # there is none at all when the table has no mutable views
        exec_plan.ctx.profile.print(num_rows=num_rows)
    _logger.info(f'TableVersion {self.name}: new version {self.version}')
    return result
449
+
450
def update(
        self, update_targets: Optional[List[Tuple[Column, 'pixeltable.exprs.Expr']]] = None,
        where_clause: Optional['pixeltable.exprs.Predicate'] = None, cascade: bool = True
) -> UpdateStatus:
    """Update rows in this table.

    Args:
        update_targets: a list of (column, value) pairs specifying the columns to update and their new values.
        where_clause: a Predicate to filter rows to update.
        cascade: if True, also update all computed columns that transitively depend on the updated columns,
            including within views.

    Returns:
        the UpdateStatus of the update, with updated_cols set to the columns reported by the planner
    """
    targets = [] if update_targets is None else update_targets
    assert not self.is_snapshot
    from pixeltable.plan import Planner
    plan, updated_cols, recomputed_cols = Planner.create_update_plan(
        self.path, targets, [], where_clause, cascade)
    with Env.get().engine.begin() as conn:
        ts = time.time()
        # the store table is filtered with the SQL form of the predicate
        sql_where = None if where_clause is None else where_clause.sql_expr()
        status = self._update(
            plan, sql_where, recomputed_cols, base_versions=[], conn=conn, ts=ts, cascade=cascade)
        status.updated_cols = updated_cols
        return status
474
+
475
def _update(
        self, plan: Optional[exec.ExecNode], where_clause: Optional[sql.ClauseElement],
        recomputed_view_cols: List[Column], base_versions: List[Optional[int]], conn: sql.engine.Connection,
        ts: float, cascade: bool
) -> UpdateStatus:
    """Apply an update plan to this table and, if requested, propagate it to the mutable views.

    Args:
        plan: plan producing the new row versions; if None, this table itself is left unchanged
        where_clause: SQL filter passed to the store table when retiring the old row versions
        recomputed_view_cols: view columns that need to be recomputed
        base_versions: versions of the bases that were updated further up the chain (None = base unchanged)
        conn: connection of the enclosing transaction
        ts: timestamp recorded for the new version
        cascade: if True, also update the mutable views

    Returns:
        UpdateStatus accumulated over this table and, when cascading, its mutable views
    """
    result = UpdateStatus()
    table_changed = plan is not None
    if table_changed:
        # we're creating a new version
        self.version += 1
        result.num_rows, result.num_excs, exc_col_ids = \
            self.store_tbl.insert_rows(plan, conn, v_min=self.version)
        result.cols_with_excs = [f'{self.name}.{self.cols_by_id[cid].name}' for cid in exc_col_ids]
        self.store_tbl.delete_rows(
            self.version, base_versions=base_versions, match_on_vmin=True, where_clause=where_clause, conn=conn)
        self._update_md(ts, None, conn)

    if cascade:
        own_version = self.version if table_changed else None
        view_base_versions = [own_version] + base_versions  # don't update in place
        # propagate to views
        for view in self.mutable_views:
            view_cols = [c for c in recomputed_view_cols if c.tbl is view]
            view_plan: Optional[exec.ExecNode] = None
            if len(view_cols) > 0:
                from pixeltable.plan import Planner
                view_plan = Planner.create_view_update_plan(view.path, recompute_targets=view_cols)
            view_status = view._update(
                view_plan, None, recomputed_view_cols, base_versions=view_base_versions, conn=conn, ts=ts,
                cascade=True)
            result.num_rows += view_status.num_rows
            result.num_excs += view_status.num_excs
            result.cols_with_excs += view_status.cols_with_excs

    # remove duplicates while keeping first-seen order
    result.cols_with_excs = list(dict.fromkeys(result.cols_with_excs))
    return result
508
+
509
def delete(self, where: Optional['pixeltable.exprs.Predicate'] = None) -> UpdateStatus:
    """Delete rows in this table.

    Args:
        where: a Predicate to filter rows to delete.

    Returns:
        UpdateStatus whose num_rows is the count returned by _delete()
    """
    assert self.is_insertable()
    from pixeltable.plan import Planner
    analysis_info = Planner.analyze(self, where)
    ts = time.time()
    with Env.get().engine.begin() as conn:
        deleted = self._delete(analysis_info.sql_where_clause, base_versions=[], conn=conn, ts=ts)

    return UpdateStatus(num_rows=deleted)
523
+
524
def _delete(
        self, where: Optional['pixeltable.exprs.Predicate'], base_versions: List[Optional[int]],
        conn: sql.engine.Connection, ts: float) -> int:
    """Delete rows in this table and propagate to views.

    Args:
        where: a Predicate to filter rows to delete.
        base_versions: versions of the bases, passed through to the store table
        conn: connection of the enclosing transaction
        ts: timestamp recorded for the new version
    Returns:
        number of deleted rows
    """
    sql_where_clause = None if where is None else where.sql_expr()
    # rows are retired as of the version this delete would create
    num_rows = self.store_tbl.delete_rows(
        self.version + 1, base_versions=base_versions, match_on_vmin=False, where_clause=sql_where_clause,
        conn=conn)
    if num_rows > 0:
        # we're creating a new version
        self.version += 1
        self._update_md(ts, None, conn)
    for view in self.mutable_views:
        deleted_in_view = view._delete(
            where=None, base_versions=[self.version] + base_versions, conn=conn, ts=ts)
        num_rows += deleted_in_view
    return num_rows
546
+
547
def revert(self) -> None:
    """Reverts the table to the previous version.

    Raises:
        excs.Error: if the table is at version 0
    """
    assert not self.is_snapshot
    if self.version == 0:
        raise excs.Error('Cannot revert version 0')
    engine = Env.get().engine
    with orm.Session(engine, future=True) as session:
        self._revert(session)
        session.commit()
556
+
557
def _revert(self, session: orm.Session) -> None:
    """Undo the current version of this table and then of each mutable view.

    Steps, in order:
      1. refuse if any table's view_md.base_versions references this table at its current version;
      2. delete media and store rows added in this version and re-open rows deleted in it;
      3. if this version changed the schema, reload the preceding schema version,
         drop a stored column that was added, and delete the schema-version record;
      4. delete the version record, decrement the version and rewrite the table metadata;
      5. recurse into the mutable views.

    Args:
        session: ORM session whose connection carries all statements; the caller commits.

    Raises:
        excs.Error: if a snapshot still needs the current version.
    """
    conn = session.connection()
    # make sure we don't have a snapshot referencing this version
    # (unclear how to express this with sqlalchemy)
    query = (
        f"select ts.dir_id, ts.md->'name' "
        f"from {schema.Table.__tablename__} ts "
        f"cross join lateral jsonb_path_query(md, '$.view_md.base_versions[*]') as tbl_version "
        f"where tbl_version->>0 = '{self.id.hex}' and (tbl_version->>1)::int = {self.version}"
    )
    referencing_rows = list(conn.execute(sql.text(query)))
    num_referencing = len(referencing_rows)
    if num_referencing > 0:
        # second selected column is the name stored in the table metadata
        names = [row[1] for row in referencing_rows]
        plural = 's' if num_referencing > 1 else ''
        raise excs.Error(
            f'Current version is needed for {num_referencing} snapshot{plural} '
            f'({", ".join(names)})'
        )

    # fetch the session's connection again before issuing the modifications
    conn = session.connection()

    # delete newly-added data
    MediaStore.delete(self.id, version=self.version)
    store_sa_tbl = self.store_tbl.sa_tbl
    conn.execute(sql.delete(store_sa_tbl).where(store_sa_tbl.c.v_min == self.version))
    # revert new deletions
    conn.execute(
        sql.update(store_sa_tbl)
        .values({store_sa_tbl.c.v_max: schema.Table.MAX_VERSION})
        .where(store_sa_tbl.c.v_max == self.version))

    if self.version == self.schema_version:
        # the current version involved a schema change:
        # if the schema change was to add a column, we now need to drop it
        added_col_ids = [
            history.col_id for history in self.column_history.values()
            if history.schema_version_add == self.schema_version
        ]
        assert len(added_col_ids) <= 1
        added_col: Optional[Column] = None
        if len(added_col_ids) == 1:
            # drop this newly-added column and its ColumnHistory record
            new_col = self.cols_by_id[added_col_ids[0]]
            if new_col.is_stored:
                # only stored columns need a physical drop later on
                added_col = new_col
            del self.column_history[new_col.id]

        def load_schema_version_md(schema_version):
            # metadata dict recorded for this table at the given schema version
            return session.query(schema.TableSchemaVersion.md) \
                .where(schema.TableSchemaVersion.tbl_id == self.id) \
                .where(schema.TableSchemaVersion.schema_version == schema_version) \
                .scalar()

        # we need to determine the preceding schema version and reload the schema
        current_md_dict = load_schema_version_md(self.schema_version)
        preceding_schema_version = current_md_dict['preceding_schema_version']
        preceding_md_dict = load_schema_version_md(preceding_schema_version)
        preceding_md = schema.md_from_dict(schema.TableSchemaVersionMd, preceding_md_dict)
        self._init_schema(preceding_md)

        # physically drop the column, but only after we have re-created the schema
        if added_col is not None:
            self.store_tbl.drop_column(added_col, conn)

        conn.execute(
            sql.delete(schema.TableSchemaVersion.__table__)
            .where(schema.TableSchemaVersion.tbl_id == self.id)
            .where(schema.TableSchemaVersion.schema_version == self.schema_version))
        self.schema_version = preceding_schema_version
        self.comment = preceding_md.comment
        self.num_retained_versions = preceding_md.num_retained_versions

    # remove the record of the version being undone, then step back one version
    conn.execute(
        sql.delete(schema.TableVersion.__table__)
        .where(schema.TableVersion.tbl_id == self.id)
        .where(schema.TableVersion.version == self.version)
    )
    self.version -= 1
    # the stored table metadata must reflect the decremented version
    conn.execute(
        sql.update(schema.Table.__table__)
        .values({schema.Table.md: dataclasses.asdict(self._create_md())})
        .where(schema.Table.id == self.id))

    # propagate to views
    for view in self.mutable_views:
        view._revert(session)
    _logger.info(f'TableVersion {self.name}: reverted to version {self.version}')
644
+
645
def is_view(self) -> bool:
    """Returns True if this table has a base table"""
    has_base = self.base is not None
    return has_base
647
+
648
def is_component_view(self) -> bool:
    """Returns True if an iterator class is set for this table"""
    has_iterator = self.iterator_cls is not None
    return has_iterator
650
+
651
def is_insertable(self) -> bool:
    """Returns True if this corresponds to an InsertableTable, i.e. it is neither a snapshot nor a view"""
    return not (self.is_snapshot or self.is_view())
654
+
655
def is_iterator_column(self, col: Column) -> bool:
    """Returns True if col is produced by an iterator"""
    if not self.is_component_view():
        return False
    # the iterator columns directly follow the pos column, i.e. ids 1..num_iterator_cols
    return 0 < col.id < self.num_iterator_cols + 1
659
+
660
def is_system_column(self, col: Column) -> bool:
    """Return True if column was created by Pixeltable

    Only the column named POS_COLUMN_NAME of a component view qualifies.
    """
    return bool(col.name == POS_COLUMN_NAME and self.is_component_view())
665
+
666
def user_columns(self) -> List[Column]:
    """Return all non-system columns, in the order of self.cols"""
    non_system = []
    for col in self.cols:
        if self.is_system_column(col):
            continue
        non_system.append(col)
    return non_system
669
+
670
def get_required_col_names(self) -> List[str]:
    """Return the names of all columns for which values must be specified in insert()

    These are the columns that are neither computed nor of a nullable type.
    """
    assert not self.is_view()
    required = []
    for col in self.cols:
        if col.is_computed or col.col_type.nullable:
            continue
        required.append(col.name)
    return required
675
+
676
def get_computed_col_names(self) -> List[str]:
    """Return the names of all computed columns, in the order of self.cols"""
    computed = []
    for col in self.cols:
        if col.is_computed:
            computed.append(col.name)
    return computed
680
+
681
@classmethod
def _create_value_expr(cls, col: Column, path: 'TableVersionPath') -> None:
    """
    Create col.value_expr, given col.compute_func.

    Each parameter name of compute_func is looked up as a column via path.get_column();
    the resulting ColumnRefs become the arguments of the function call stored in
    col.value_expr.
    Does not update Column.dependent_cols.

    Raises:
        excs.Error: if a parameter name does not resolve to a column.
    """
    assert col.value_expr is None
    assert col.compute_func is not None
    from pixeltable import exprs

    col_refs: List[exprs.ColumnRef] = []
    for param_name in inspect.signature(col.compute_func).parameters:
        referenced_col = path.get_column(param_name)
        if referenced_col is None:
            raise excs.Error(
                f'Column {col.name}: Callable parameter refers to an unknown column: {param_name}')
        col_refs.append(exprs.ColumnRef(referenced_col))

    # the parameter types of the wrapped function are those of the referenced columns
    param_types = [ref.col_type for ref in col_refs]
    fn = func.make_function(col.compute_func, return_type=col.col_type, param_types=param_types)
    col.value_expr = fn(*col_refs)
702
+
703
def _record_value_expr(self, col: Column) -> None:
    """Register col as a dependent of every column referenced in col.value_expr.

    Adds col to the dependent_cols of each column found via a ColumnRef subexpression.
    """
    assert col.value_expr is not None
    from pixeltable.exprs import ColumnRef

    # collect all referenced columns first, then register the dependency on each
    referenced = [ref.col for ref in col.value_expr.subexprs(expr_class=ColumnRef)]
    for referenced_col in referenced:
        referenced_col.dependent_cols.add(col)
711
+
712
def get_dependent_columns(self, cols: List[Column]) -> Set[Column]:
    """
    Return the set of columns that transitively depend on any of the given ones.

    Args:
        cols: the columns whose dependents are wanted; each must expose dependent_cols.

    Returns:
        A set of columns; always a set, including when cols is empty or nothing
        depends on the given columns.
    """
    result: Set[Column] = set()
    # worklist of columns whose direct dependents still have to be visited
    pending = list(cols)
    while pending:
        for dependent in pending.pop().dependent_cols:
            # visiting each dependent only once also guarantees termination
            # should the dependency graph ever contain a cycle
            if dependent not in result:
                result.add(dependent)
                pending.append(dependent)
    return result
723
+
724
def num_rowid_columns(self) -> int:
    """Return the number of columns of the rowids, without accessing store_tbl

    A component view has one more rowid column than its base; any other table has one.
    """
    if not self.is_component_view():
        return 1
    return 1 + self.base.num_rowid_columns()
729
+
730
def _create_md(self) -> schema.TableMd:
    """Build a schema.TableMd from this object's current in-memory state."""
    return schema.TableMd(
        name=self.name,
        current_version=self.version,
        current_schema_version=self.schema_version,
        next_col_id=self.next_col_id,
        next_row_id=self.next_rowid,
        column_history=self.column_history,
        view_md=self.view_md,
    )
735
+
736
def _create_version_md(self, ts: float) -> schema.TableVersionMd:
    """Build a schema.TableVersionMd for the current version, with ts as its creation time."""
    return schema.TableVersionMd(
        created_at=ts,
        version=self.version,
        schema_version=self.schema_version,
    )
738
+
739
def _create_schema_version_md(self, preceding_schema_version: int) -> schema.TableSchemaVersionMd:
    """Build a schema.TableSchemaVersionMd describing the current columns.

    Args:
        preceding_schema_version: recorded as-is in the returned metadata.

    Returns:
        Metadata with one schema.SchemaColumn per entry of self.cols, keyed by column id;
        pos is the column's index in self.cols.
    """
    def column_entry(pos: int, col: Column) -> schema.SchemaColumn:
        # columns without a value expression are recorded with value_expr=None
        value_expr_dict = None if col.value_expr is None else col.value_expr.as_dict()
        return schema.SchemaColumn(
            pos=pos, name=col.name, col_type=col.col_type.as_dict(),
            is_pk=col.primary_key, value_expr=value_expr_dict, stored=col.stored, is_indexed=col.is_indexed)

    column_md: Dict[int, schema.SchemaColumn] = {
        col.id: column_entry(pos, col) for pos, col in enumerate(self.cols)
    }
    # preceding_schema_version is supplied by the caller
    return schema.TableSchemaVersionMd(
        schema_version=self.schema_version,
        preceding_schema_version=preceding_schema_version,
        columns=column_md,
        num_retained_versions=self.num_retained_versions,
        comment=self.comment,
    )