pixeltable 0.4.1__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of pixeltable might be problematic.

@@ -0,0 +1,39 @@
+from typing import Any, Optional
+
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_md
+
+
+@register_converter(version=38)
+def _(engine: sql.engine.Engine) -> None:
+    convert_table_md(engine, substitution_fn=__substitute_md)
+
+
+def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
+    if k == 'col_mapping':
+        assert isinstance(v, list)
+        return k, [__col_mapping_entry(e) for e in v]
+    if k == 'stored_proxies':
+        assert isinstance(v, list)
+        return k, [__stored_proxies_entry(e) for e in v]
+    return None
+
+
+def __col_mapping_entry(e: list) -> list:
+    assert isinstance(e, list)
+    assert isinstance(e[0], dict)
+    assert isinstance(e[1], str)
+    return [__col_handle(e[0]), e[1]]
+
+
+def __stored_proxies_entry(e: list) -> list:
+    assert isinstance(e, list)
+    assert isinstance(e[0], dict)
+    assert isinstance(e[1], dict)
+    return [__col_handle(e[0]), __col_handle(e[1])]
+
+
+def __col_handle(e: dict) -> dict:
+    return {'tbl_version': {'id': e['tbl_id'], 'effective_version': None}, 'col_id': e['col_id']}
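
For orientation, a minimal sketch of what this converter does to a single 'col_mapping' entry via __col_mapping_entry()/__col_handle(); the ids and column name below are made-up placeholders, not values from an actual metadata store:

# Hypothetical entry as stored before the conversion: [column reference, external column name]
old_entry = [{'tbl_id': 'a1b2', 'col_id': 7}, 'external_col']

# After conversion, the column reference takes the ColumnHandle-style shape produced by __col_handle():
new_entry = [
    {'tbl_version': {'id': 'a1b2', 'effective_version': None}, 'col_id': 7},
    'external_col',
]

'stored_proxies' entries are handled the same way, except that both elements of the pair are column references and both are rewritten.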
@@ -2,6 +2,7 @@
 # rather than as a comment, so that the existence of a description can be enforced by
 # the unit tests when new versions are added.
 VERSION_NOTES = {
+    39: 'ColumnHandles in external stores',
     38: 'Added TableMd.view_sn',
     37: 'Add support for the sample() method on DataFrames',
     36: 'Added Table.lock_dummy',
@@ -0,0 +1,78 @@
+from __future__ import annotations
+
+from typing import Optional
+
+from pixeltable.metadata import schema
+
+
+class MetadataUtils:
+    @classmethod
+    def _diff_md(
+        cls, old_md: Optional[dict[int, schema.SchemaColumn]], new_md: Optional[dict[int, schema.SchemaColumn]]
+    ) -> str:
+        """Return a string reporting the differences in a specific entry in two dictionaries
+
+        Results are formatted as follows:
+        - If `old_md` is `None`, returns 'Initial Version'.
+        - If `old_md` and `new_md` are the same, returns an empty string.
+        - If there are additions, changes, or deletions, returns a string summarizing the changes.
+        """
+        assert new_md is not None
+        if old_md is None:
+            return 'Initial Version'
+        if old_md == new_md:
+            return ''
+        added = {k: v.name for k, v in new_md.items() if k not in old_md}
+        changed = {
+            k: f'{old_md[k].name!r} to {v.name!r}'
+            for k, v in new_md.items()
+            if k in old_md and old_md[k].name != v.name
+        }
+        deleted = {k: v.name for k, v in old_md.items() if k not in new_md}
+        if len(added) == 0 and len(changed) == 0 and len(deleted) == 0:
+            return ''
+        # Format the result
+        t = []
+        if len(added) > 0:
+            t.append('Added: ' + ', '.join(added.values()))
+        if len(changed) > 0:
+            t.append('Renamed: ' + ', '.join(changed.values()))
+        if len(deleted) > 0:
+            t.append('Deleted: ' + ', '.join(deleted.values()))
+        r = ', '.join(t)
+        return r
+
+    @classmethod
+    def _create_md_change_dict(
+        cls, md_list: Optional[list[tuple[int, dict[int, schema.SchemaColumn]]]]
+    ) -> dict[int, str]:
+        """Return a dictionary of schema changes by version
+        Args:
+            md_list: a list of tuples, each containing a version number and a metadata dictionary.
+        """
+        r: dict[int, str] = {}
+        if md_list is None or len(md_list) == 0:
+            return r
+
+        # Sort the list in place by version number
+        md_list.sort()
+
+        first_retrieved_version = md_list[0][0]
+        if first_retrieved_version == 0:
+            prev_md = None
+            prev_ver = -1
+            start = 0
+        else:
+            prev_md = md_list[0][1]
+            prev_ver = first_retrieved_version
+            start = 1
+
+        for ver, curr_md in md_list[start:]:
+            if ver == prev_ver:
+                continue
+            assert ver > prev_ver
+            tf = cls._diff_md(prev_md, curr_md)
+            if tf != '':
+                r[ver] = tf
+            prev_md = curr_md
+        return r
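
As a rough illustration of the string _diff_md produces, here is a small sketch using SimpleNamespace stand-ins for schema.SchemaColumn (only the name attribute matters to this method); the column names are invented:

from types import SimpleNamespace as Col

old = {1: Col(name='price'), 2: Col(name='qty')}
new = {1: Col(name='unit_price'), 3: Col(name='discount')}
# MetadataUtils._diff_md(old, new) would return:
# "Added: discount, Renamed: 'price' to 'unit_price', Deleted: qty"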
pixeltable/plan.py CHANGED
@@ -378,7 +378,7 @@ class Planner:
 
         cls.__check_valid_columns(tbl, stored_cols, 'inserted into')
 
-        row_builder = exprs.RowBuilder([], stored_cols, [])
+        row_builder = exprs.RowBuilder([], stored_cols, [], tbl)
 
         # create InMemoryDataNode for 'rows'
         plan: exec.ExecNode = exec.InMemoryDataNode(
@@ -473,15 +473,19 @@
         assert isinstance(tbl, catalog.TableVersionPath)
         target = tbl.tbl_version.get()  # the one we need to update
         updated_cols = list(update_targets.keys())
+        recomputed_cols: set[Column]
         if len(recompute_targets) > 0:
-            recomputed_cols = set(recompute_targets)
+            assert len(update_targets) == 0
+            recomputed_cols = {*recompute_targets}
+            if cascade:
+                recomputed_cols |= target.get_dependent_columns(recomputed_cols)
         else:
             recomputed_cols = target.get_dependent_columns(updated_cols) if cascade else set()
-            # regardless of cascade, we need to update all indices on any updated column
-            idx_val_cols = target.get_idx_val_columns(updated_cols)
-            recomputed_cols.update(idx_val_cols)
-            # we only need to recompute stored columns (unstored ones are substituted away)
-            recomputed_cols = {c for c in recomputed_cols if c.is_stored}
+        # regardless of cascade, we need to update all indices on any updated/recomputed column
+        idx_val_cols = target.get_idx_val_columns(set(updated_cols) | recomputed_cols)
+        recomputed_cols.update(idx_val_cols)
+        # we only need to recompute stored columns (unstored ones are substituted away)
+        recomputed_cols = {c for c in recomputed_cols if c.is_stored}
 
         cls.__check_valid_columns(tbl.tbl_version.get(), recomputed_cols, 'updated in')
 
@@ -588,7 +592,7 @@
         sql_exprs = list(
             exprs.Expr.list_subexprs(analyzer.all_exprs, filter=analyzer.sql_elements.contains, traverse_matches=False)
         )
-        row_builder = exprs.RowBuilder(analyzer.all_exprs, [], sql_exprs)
+        row_builder = exprs.RowBuilder(analyzer.all_exprs, [], sql_exprs, target)
         analyzer.finalize(row_builder)
         sql_lookup_node = exec.SqlLookupNode(tbl, row_builder, sql_exprs, sa_key_cols, key_vals)
         col_vals = [{col: row[col].val for col in updated_cols} for row in batch]
@@ -602,8 +606,7 @@
         row_builder.set_slot_idxs(select_list, remove_duplicates=False)
         for i, col in enumerate(all_base_cols):
             plan.row_builder.add_table_column(col, select_list[i].slot_idx)
-
-        ctx = exec.ExecContext(row_builder)
+        ctx = exec.ExecContext(row_builder, num_computed_exprs=len(recomputed_exprs))
         # we're returning everything to the user, so we might as well do it in a single batch
         ctx.batch_size = 0
         plan.set_ctx(ctx)
@@ -695,7 +698,7 @@
         base_analyzer = Analyzer(
             from_clause, iterator_args, where_clause=target.predicate, sample_clause=target.sample_clause
         )
-        row_builder = exprs.RowBuilder(base_analyzer.all_exprs, stored_cols, [])
+        row_builder = exprs.RowBuilder(base_analyzer.all_exprs, stored_cols, [], target)
 
         # if we're propagating an insert, we only want to see those base rows that were created for the current version
         # execution plan:
@@ -832,7 +835,11 @@
             order_by_clause=order_by_clause,
             sample_clause=sample_clause,
         )
-        row_builder = exprs.RowBuilder(analyzer.all_exprs, [], [])
+        # If the from_clause has a single table, we can use it as the context table for the RowBuilder.
+        # Otherwise there is no context table, but that's ok, because the context table is only needed for
+        # table mutations, which can't happen during a join.
+        context_tbl = from_clause.tbls[0].tbl_version.get() if len(from_clause.tbls) == 1 else None
+        row_builder = exprs.RowBuilder(analyzer.all_exprs, [], [], context_tbl)
 
         analyzer.finalize(row_builder)
         # select_list: we need to materialize everything that's been collected
@@ -1035,16 +1042,14 @@
         return Analyzer(FromClause(tbls=[tbl]), [], where_clause=where_clause)
 
     @classmethod
-    def create_add_column_plan(
-        cls, tbl: catalog.TableVersionPath, col: catalog.Column
-    ) -> tuple[exec.ExecNode, Optional[int]]:
+    def create_add_column_plan(cls, tbl: catalog.TableVersionPath, col: catalog.Column) -> exec.ExecNode:
         """Creates a plan for InsertableTable.add_column()
         Returns:
             plan: the plan to execute
             value_expr slot idx for the plan output (for computed cols)
         """
         assert isinstance(tbl, catalog.TableVersionPath)
-        row_builder = exprs.RowBuilder(output_exprs=[], columns=[col], input_exprs=[])
+        row_builder = exprs.RowBuilder(output_exprs=[], columns=[col], input_exprs=[], tbl=tbl.tbl_version.get())
         analyzer = Analyzer(FromClause(tbls=[tbl]), row_builder.default_eval_ctx.target_exprs)
         plan = cls._create_query_plan(
             row_builder=row_builder, analyzer=analyzer, eval_ctx=row_builder.default_eval_ctx, with_pk=True
@@ -1056,5 +1061,4 @@
         # we want to flush images
         if col.is_computed and col.is_stored and col.col_type.is_image_type():
             plan.set_stored_img_cols(row_builder.output_slot_idxs())
-        value_expr_slot_idx = row_builder.output_slot_idxs()[0].slot_idx if col.is_computed else None
-        return plan, value_expr_slot_idx
+        return plan
pixeltable/store.py CHANGED
@@ -7,13 +7,14 @@ import sys
 import urllib.parse
 import urllib.request
 import warnings
-from typing import Any, Iterable, Iterator, Literal, Optional, Union
+from typing import Any, Iterable, Iterator, Optional, Union
 
 import more_itertools
 import sqlalchemy as sql
 from tqdm import TqdmWarning, tqdm
 
-from pixeltable import catalog, exceptions as excs, exprs
+from pixeltable import catalog, exceptions as excs
+from pixeltable.catalog import RowCountStats, UpdateStatus
 from pixeltable.env import Env
 from pixeltable.exec import ExecNode
 from pixeltable.metadata import schema
@@ -41,7 +42,10 @@ class StoreBase:
     v_max_col: sql.Column
     base: Optional[StoreBase]
 
-    __INSERT_BATCH_SIZE = 1000
+    # In my cursory experiments this was the optimal batch size: it was an improvement over 5_000 and there was no real
+    # benefit to going higher.
+    # TODO: Perform more rigorous experiments with different table structures and OS environments to refine this.
+    __INSERT_BATCH_SIZE = 10_000
 
     def __init__(self, tbl_version: catalog.TableVersion):
         self.tbl_version = catalog.TableVersionHandle(
@@ -124,13 +128,14 @@
 
     def _move_tmp_media_file(self, file_url: Optional[str], col: catalog.Column, v_min: int) -> str:
         """Move tmp media file with given url to Env.media_dir and return new url, or given url if not a tmp_dir file"""
-        pxt_tmp_dir = str(Env.get().tmp_dir)
         if file_url is None:
             return None
+        assert isinstance(file_url, str), type(file_url)
+        pxt_tmp_dir = str(Env.get().tmp_dir)
         parsed = urllib.parse.urlparse(file_url)
         # We should never be passed a local file path here. The "len > 1" ensures that Windows
         # file paths aren't mistaken for URLs with a single-character scheme.
-        assert len(parsed.scheme) > 1
+        assert len(parsed.scheme) > 1, file_url
         if parsed.scheme != 'file':
             # remote url
             return file_url
@@ -145,27 +150,11 @@
         return new_file_url
 
     def _move_tmp_media_files(
-        self, table_rows: list[dict[str, Any]], media_cols: list[catalog.Column], v_min: int
+        self, table_row: list[Any], media_cols_by_sql_idx: dict[int, catalog.Column], v_min: int
     ) -> None:
         """Move tmp media files that we generated to a permanent location"""
-        for c in media_cols:
-            for table_row in table_rows:
-                file_url = table_row[c.store_name()]
-                table_row[c.store_name()] = self._move_tmp_media_file(file_url, c, v_min)
-
-    def _create_table_row(
-        self, input_row: exprs.DataRow, row_builder: exprs.RowBuilder, exc_col_ids: set[int], pk: tuple[int, ...]
-    ) -> tuple[dict[str, Any], int]:
-        """Return Tuple[complete table row, # of exceptions] for insert()
-        Creates a row that includes the PK columns, with the values from input_row.pk.
-        Returns:
-            Tuple[complete table row, # of exceptions]
-        """
-        table_row, num_excs = row_builder.create_table_row(input_row, exc_col_ids)
-        assert len(pk) == len(self._pk_cols)
-        for pk_col, pk_val in zip(self._pk_cols, pk):
-            table_row[pk_col.name] = pk_val
-        return table_row, num_excs
+        for n, col in media_cols_by_sql_idx.items():
+            table_row[n] = self._move_tmp_media_file(table_row[n], col, v_min)
 
     def count(self) -> int:
         """Return the number of rows visible in self.tbl_version"""
@@ -231,9 +220,7 @@
         if col.store_name() not in existing_cols:
             self.add_column(col)
 
-    def load_column(
-        self, col: catalog.Column, exec_plan: ExecNode, value_expr_slot_idx: int, on_error: Literal['abort', 'ignore']
-    ) -> int:
+    def load_column(self, col: catalog.Column, exec_plan: ExecNode, abort_on_exc: bool) -> int:
         """Update store column of a computed column with values produced by an execution plan
 
         Returns:
@@ -247,60 +234,51 @@
         num_rows = 0
         # create temp table to store output of exec_plan, with the same primary key as the store table
         tmp_name = f'temp_{self._storage_name()}'
-        tmp_pk_cols = [sql.Column(col.name, col.type, primary_key=True) for col in self.pk_columns()]
-        tmp_cols = tmp_pk_cols.copy()
+        tmp_pk_cols = tuple(sql.Column(col.name, col.type, primary_key=True) for col in self.pk_columns())
+        tmp_val_col_sql_idx = len(tmp_pk_cols)
         tmp_val_col = sql.Column(col.sa_col.name, col.sa_col.type)
-        tmp_cols.append(tmp_val_col)
+        tmp_cols = [*tmp_pk_cols, tmp_val_col]
         # add error columns if the store column records errors
         if col.records_errors:
             tmp_errortype_col = sql.Column(col.sa_errortype_col.name, col.sa_errortype_col.type)
-            tmp_cols.append(tmp_errortype_col)
             tmp_errormsg_col = sql.Column(col.sa_errormsg_col.name, col.sa_errormsg_col.type)
-            tmp_cols.append(tmp_errormsg_col)
+            tmp_cols.extend((tmp_errortype_col, tmp_errormsg_col))
+        tmp_col_names = [col.name for col in tmp_cols]
+
         tmp_tbl = sql.Table(tmp_name, self.sa_md, *tmp_cols, prefixes=['TEMPORARY'])
         conn = Env.get().conn
         tmp_tbl.create(bind=conn)
 
+        row_builder = exec_plan.row_builder
+
         try:
+            table_rows: list[tuple[Any]] = []
+
             # insert rows from exec_plan into temp table
-            # TODO: unify the table row construction logic with RowBuilder.create_table_row()
             for row_batch in exec_plan:
                 num_rows += len(row_batch)
-                tbl_rows: list[dict[str, Any]] = []
-                for result_row in row_batch:
-                    tbl_row: dict[str, Any] = {}
-                    for pk_col, pk_val in zip(self.pk_columns(), result_row.pk):
-                        tbl_row[pk_col.name] = pk_val
-
-                    if col.is_computed:
-                        if result_row.has_exc(value_expr_slot_idx):
-                            num_excs += 1
-                            value_exc = result_row.get_exc(value_expr_slot_idx)
-                            if on_error == 'abort':
-                                raise excs.Error(
-                                    f'Error while evaluating computed column `{col.name}`:\n{value_exc}'
-                                ) from value_exc
-                            # we store a NULL value and record the exception/exc type
-                            error_type = type(value_exc).__name__
-                            error_msg = str(value_exc)
-                            tbl_row[col.sa_col.name] = None
-                            tbl_row[col.sa_errortype_col.name] = error_type
-                            tbl_row[col.sa_errormsg_col.name] = error_msg
-                        else:
-                            if col.col_type.is_image_type() and result_row.file_urls[value_expr_slot_idx] is None:
-                                # we have yet to store this image
-                                filepath = str(MediaStore.prepare_media_path(col.tbl.id, col.id, col.tbl.version))
-                                result_row.flush_img(value_expr_slot_idx, filepath)
-                            val = result_row.get_stored_val(value_expr_slot_idx, col.sa_col.type)
-                            if col.col_type.is_media_type():
-                                val = self._move_tmp_media_file(val, col, result_row.pk[-1])
-                            tbl_row[col.sa_col.name] = val
-                            if col.records_errors:
-                                tbl_row[col.sa_errortype_col.name] = None
-                                tbl_row[col.sa_errormsg_col.name] = None
-
-                    tbl_rows.append(tbl_row)
-                conn.execute(sql.insert(tmp_tbl), tbl_rows)
+                batch_table_rows: list[tuple[Any]] = []
+
+                for row in row_batch:
+                    if abort_on_exc and row.has_exc():
+                        exc = row.get_first_exc()
+                        raise excs.Error(f'Error while evaluating computed column {col.name!r}:\n{exc}') from exc
+                    table_row, num_row_exc = row_builder.create_table_row(row, None, row.pk)
+                    if col.col_type.is_media_type():
+                        table_row[tmp_val_col_sql_idx] = self._move_tmp_media_file(
+                            table_row[tmp_val_col_sql_idx], col, row.pk[-1]
+                        )
+                    num_excs += num_row_exc
+                    batch_table_rows.append(tuple(table_row))
+
+                table_rows.extend(batch_table_rows)
+
+                if len(table_rows) >= self.__INSERT_BATCH_SIZE:
+                    self.sql_insert(tmp_tbl, tmp_col_names, table_rows)
+                    table_rows.clear()
+
+            if len(table_rows) > 0:
+                self.sql_insert(tmp_tbl, tmp_col_names, table_rows)
 
             # update store table with values from temp table
             update_stmt = sql.update(self.sa_tbl)
@@ -313,6 +291,7 @@
             )
             log_explain(_logger, update_stmt, conn)
             conn.execute(update_stmt)
+
         finally:
 
             def remove_tmp_tbl() -> None:
@@ -320,6 +299,7 @@
                 tmp_tbl.drop(bind=conn)
 
             run_cleanup(remove_tmp_tbl, raise_error=True)
+
         return num_excs
 
     def insert_rows(
@@ -329,7 +309,7 @@
         show_progress: bool = True,
         rowids: Optional[Iterator[int]] = None,
        abort_on_exc: bool = False,
-    ) -> tuple[int, int, set[int]]:
+    ) -> tuple[set[int], UpdateStatus]:
        """Insert rows into the store table and update the catalog table's md
        Returns:
            number of inserted rows, number of exceptions, set of column ids that have exceptions
@@ -341,50 +321,81 @@
         cols_with_excs: set[int] = set()
         progress_bar: Optional[tqdm] = None  # create this only after we started executing
         row_builder = exec_plan.row_builder
-        media_cols = [info.col for info in row_builder.table_columns if info.col.col_type.is_media_type()]
-        conn = Env.get().conn
+
+        store_col_names, media_cols_by_idx = row_builder.store_column_names()
 
         try:
+            table_rows: list[tuple[Any]] = []
             exec_plan.open()
+
             for row_batch in exec_plan:
                 num_rows += len(row_batch)
-                for batch_start_idx in range(0, len(row_batch), self.__INSERT_BATCH_SIZE):
-                    # compute batch of rows and convert them into table rows
-                    table_rows: list[dict[str, Any]] = []
-                    batch_stop_idx = min(batch_start_idx + self.__INSERT_BATCH_SIZE, len(row_batch))
-                    for row_idx in range(batch_start_idx, batch_stop_idx):
-                        row = row_batch[row_idx]
-                        # if abort_on_exc == True, we need to check for media validation exceptions
-                        if abort_on_exc and row.has_exc():
-                            exc = row.get_first_exc()
-                            raise exc
-
-                        rowid = (next(rowids),) if rowids is not None else row.pk[:-1]
-                        pk = (*rowid, v_min)
-                        table_row, num_row_exc = self._create_table_row(row, row_builder, cols_with_excs, pk=pk)
-                        num_excs += num_row_exc
-                        table_rows.append(table_row)
-
-                        if show_progress:
-                            if progress_bar is None:
-                                warnings.simplefilter('ignore', category=TqdmWarning)
-                                progress_bar = tqdm(
-                                    desc=f'Inserting rows into `{self.tbl_version.get().name}`',
-                                    unit=' rows',
-                                    ncols=100,
-                                    file=sys.stdout,
-                                )
-                            progress_bar.update(1)
-
-                    # insert batch of rows
-                    self._move_tmp_media_files(table_rows, media_cols, v_min)
-                    conn.execute(sql.insert(self.sa_tbl), table_rows)
+                batch_table_rows: list[tuple[Any]] = []
+
+                # compute batch of rows and convert them into table rows
+                for row in row_batch:
+                    # if abort_on_exc == True, we need to check for media validation exceptions
+                    if abort_on_exc and row.has_exc():
+                        exc = row.get_first_exc()
+                        raise exc
+
+                    rowid = (next(rowids),) if rowids is not None else row.pk[:-1]
+                    pk = (*rowid, v_min)
+                    assert len(pk) == len(self._pk_cols)
+                    table_row, num_row_exc = row_builder.create_table_row(row, cols_with_excs, pk)
+                    num_excs += num_row_exc
+
+                    if show_progress:
+                        if progress_bar is None:
+                            warnings.simplefilter('ignore', category=TqdmWarning)
+                            progress_bar = tqdm(
+                                desc=f'Inserting rows into `{self.tbl_version.get().name}`',
+                                unit=' rows',
+                                ncols=100,
+                                file=sys.stdout,
+                            )
+                        progress_bar.update(1)
+
+                    self._move_tmp_media_files(table_row, media_cols_by_idx, v_min)
+                    batch_table_rows.append(tuple(table_row))
+
+                table_rows.extend(batch_table_rows)
+
+                # if a batch is ready for insertion into the database, insert it
+                if len(table_rows) >= self.__INSERT_BATCH_SIZE:
+                    self.sql_insert(self.sa_tbl, store_col_names, table_rows)
+                    table_rows.clear()
+
+            # insert any remaining rows
+            if len(table_rows) > 0:
+                self.sql_insert(self.sa_tbl, store_col_names, table_rows)
+
             if progress_bar is not None:
                 progress_bar.close()
-            return num_rows, num_excs, cols_with_excs
+            computed_values = exec_plan.ctx.num_computed_exprs * num_rows
+            row_counts = RowCountStats(
+                ins_rows=num_rows, num_excs=num_excs, computed_values=computed_values
+            )  # insert (StoreBase)
+
+            return cols_with_excs, UpdateStatus(row_count_stats=row_counts)
+
         finally:
             exec_plan.close()
 
+    @classmethod
+    def sql_insert(cls, sa_tbl: sql.Table, store_col_names: list[str], table_rows: list[tuple[Any]]) -> None:
+        assert len(table_rows) > 0
+        conn = Env.get().conn
+        conn.execute(sql.insert(sa_tbl), [dict(zip(store_col_names, table_row)) for table_row in table_rows])
+
+        # TODO: Inserting directly via psycopg delivers a small performance benefit, but is somewhat fraught due to
+        # differences in the data representation that SQLAlchemy/psycopg expect. The below code will do the
+        # insertion in psycopg and can be used if/when we decide to pursue that optimization.
+        # col_names_str = ", ".join(store_col_names)
+        # placeholders_str = ", ".join('%s' for _ in store_col_names)
+        # stmt_text = f'INSERT INTO {self.sa_tbl.name} ({col_names_str}) VALUES ({placeholders_str})'
+        # conn.exec_driver_sql(stmt_text, table_rows)
+
     def _versions_clause(self, versions: list[Optional[int]], match_on_vmin: bool) -> sql.ColumnElement[bool]:
         """Return filter for base versions"""
         v = versions[0]
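
For orientation, a minimal sketch of the tuple-to-dict conversion that the new sql_insert helper performs before handing a batch to SQLAlchemy; the column names and values below are made-up placeholders, not the real store schema (the actual names come from RowBuilder.store_column_names()):

store_col_names = ['rowid', 'v_min', 'col_0']   # hypothetical store column names
table_rows = [(1, 0, 'a'), (2, 0, 'b')]         # positional tuples accumulated per batch

# sql.insert(sa_tbl) is executed with a list of parameter dicts, so each tuple is zipped with the names:
params = [dict(zip(store_col_names, row)) for row in table_rows]
# -> [{'rowid': 1, 'v_min': 0, 'col_0': 'a'}, {'rowid': 2, 'v_min': 0, 'col_0': 'b'}]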
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: pixeltable
-Version: 0.4.1
+Version: 0.4.2
 Summary: AI Data Infrastructure: Declarative, Multimodal, and Incremental
 License: Apache-2.0
 Keywords: data-science,machine-learning,database,ai,computer-vision,chatbot,ml,artificial-intelligence,feature-engineering,multimodal,mlops,feature-store,vector-database,llm,genai