pixeltable 0.4.1__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of pixeltable might be problematic.

@@ -0,0 +1,39 @@
+from typing import Any, Optional
+
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_md
+
+
+@register_converter(version=38)
+def _(engine: sql.engine.Engine) -> None:
+    convert_table_md(engine, substitution_fn=__substitute_md)
+
+
+def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
+    if k == 'col_mapping':
+        assert isinstance(v, list)
+        return k, [__col_mapping_entry(e) for e in v]
+    if k == 'stored_proxies':
+        assert isinstance(v, list)
+        return k, [__stored_proxies_entry(e) for e in v]
+    return None
+
+
+def __col_mapping_entry(e: list) -> list:
+    assert isinstance(e, list)
+    assert isinstance(e[0], dict)
+    assert isinstance(e[1], str)
+    return [__col_handle(e[0]), e[1]]
+
+
+def __stored_proxies_entry(e: list) -> list:
+    assert isinstance(e, list)
+    assert isinstance(e[0], dict)
+    assert isinstance(e[1], dict)
+    return [__col_handle(e[0]), __col_handle(e[1])]
+
+
+def __col_handle(e: dict) -> dict:
+    return {'tbl_version': {'id': e['tbl_id'], 'effective_version': None}, 'col_id': e['col_id']}
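
For orientation, a minimal sketch of what this converter does to a single 'col_mapping' entry via __col_mapping_entry()/__col_handle(); the ids and column name below are made-up placeholders, not values from an actual metadata store:

# Hypothetical entry as stored before the conversion: [column reference, external column name]
old_entry = [{'tbl_id': 'a1b2', 'col_id': 7}, 'external_col']

# After conversion, the column reference takes the ColumnHandle-style shape produced by __col_handle():
new_entry = [
    {'tbl_version': {'id': 'a1b2', 'effective_version': None}, 'col_id': 7},
    'external_col',
]

'stored_proxies' entries are handled the same way, except that both elements of the pair are column references and both are rewritten.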
@@ -2,6 +2,7 @@
 # rather than as a comment, so that the existence of a description can be enforced by
 # the unit tests when new versions are added.
 VERSION_NOTES = {
+    39: 'ColumnHandles in external stores',
     38: 'Added TableMd.view_sn',
     37: 'Add support for the sample() method on DataFrames',
     36: 'Added Table.lock_dummy',
@@ -0,0 +1,78 @@
+from __future__ import annotations
+
+from typing import Optional
+
+from pixeltable.metadata import schema
+
+
+class MetadataUtils:
+    @classmethod
+    def _diff_md(
+        cls, old_md: Optional[dict[int, schema.SchemaColumn]], new_md: Optional[dict[int, schema.SchemaColumn]]
+    ) -> str:
+        """Return a string reporting the differences in a specific entry in two dictionaries
+
+        Results are formatted as follows:
+        - If `old_md` is `None`, returns 'Initial Version'.
+        - If `old_md` and `new_md` are the same, returns an empty string.
+        - If there are additions, changes, or deletions, returns a string summarizing the changes.
+        """
+        assert new_md is not None
+        if old_md is None:
+            return 'Initial Version'
+        if old_md == new_md:
+            return ''
+        added = {k: v.name for k, v in new_md.items() if k not in old_md}
+        changed = {
+            k: f'{old_md[k].name!r} to {v.name!r}'
+            for k, v in new_md.items()
+            if k in old_md and old_md[k].name != v.name
+        }
+        deleted = {k: v.name for k, v in old_md.items() if k not in new_md}
+        if len(added) == 0 and len(changed) == 0 and len(deleted) == 0:
+            return ''
+        # Format the result
+        t = []
+        if len(added) > 0:
+            t.append('Added: ' + ', '.join(added.values()))
+        if len(changed) > 0:
+            t.append('Renamed: ' + ', '.join(changed.values()))
+        if len(deleted) > 0:
+            t.append('Deleted: ' + ', '.join(deleted.values()))
+        r = ', '.join(t)
+        return r
+
+    @classmethod
+    def _create_md_change_dict(
+        cls, md_list: Optional[list[tuple[int, dict[int, schema.SchemaColumn]]]]
+    ) -> dict[int, str]:
+        """Return a dictionary of schema changes by version
+        Args:
+            md_list: a list of tuples, each containing a version number and a metadata dictionary.
+        """
+        r: dict[int, str] = {}
+        if md_list is None or len(md_list) == 0:
+            return r
+
+        # Sort the list in place by version number
+        md_list.sort()
+
+        first_retrieved_version = md_list[0][0]
+        if first_retrieved_version == 0:
+            prev_md = None
+            prev_ver = -1
+            start = 0
+        else:
+            prev_md = md_list[0][1]
+            prev_ver = first_retrieved_version
+            start = 1
+
+        for ver, curr_md in md_list[start:]:
+            if ver == prev_ver:
+                continue
+            assert ver > prev_ver
+            tf = cls._diff_md(prev_md, curr_md)
+            if tf != '':
+                r[ver] = tf
+            prev_md = curr_md
+        return r
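
As a rough illustration of the string _diff_md produces, here is a small sketch using SimpleNamespace stand-ins for schema.SchemaColumn (only the name attribute matters to this method); the column names are invented:

from types import SimpleNamespace as Col

old = {1: Col(name='price'), 2: Col(name='qty')}
new = {1: Col(name='unit_price'), 3: Col(name='discount')}
# MetadataUtils._diff_md(old, new) would return:
# "Added: discount, Renamed: 'price' to 'unit_price', Deleted: qty"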
pixeltable/plan.py CHANGED
@@ -378,7 +378,7 @@ class Planner:
 
         cls.__check_valid_columns(tbl, stored_cols, 'inserted into')
 
-        row_builder = exprs.RowBuilder([], stored_cols, [])
+        row_builder = exprs.RowBuilder([], stored_cols, [], tbl)
 
         # create InMemoryDataNode for 'rows'
         plan: exec.ExecNode = exec.InMemoryDataNode(
@@ -473,15 +473,19 @@
         assert isinstance(tbl, catalog.TableVersionPath)
         target = tbl.tbl_version.get()  # the one we need to update
         updated_cols = list(update_targets.keys())
+        recomputed_cols: set[Column]
         if len(recompute_targets) > 0:
-            recomputed_cols = set(recompute_targets)
+            assert len(update_targets) == 0
+            recomputed_cols = {*recompute_targets}
+            if cascade:
+                recomputed_cols |= target.get_dependent_columns(recomputed_cols)
         else:
             recomputed_cols = target.get_dependent_columns(updated_cols) if cascade else set()
-            # regardless of cascade, we need to update all indices on any updated column
-            idx_val_cols = target.get_idx_val_columns(updated_cols)
-            recomputed_cols.update(idx_val_cols)
-            # we only need to recompute stored columns (unstored ones are substituted away)
-            recomputed_cols = {c for c in recomputed_cols if c.is_stored}
+        # regardless of cascade, we need to update all indices on any updated/recomputed column
+        idx_val_cols = target.get_idx_val_columns(set(updated_cols) | recomputed_cols)
+        recomputed_cols.update(idx_val_cols)
+        # we only need to recompute stored columns (unstored ones are substituted away)
+        recomputed_cols = {c for c in recomputed_cols if c.is_stored}
 
         cls.__check_valid_columns(tbl.tbl_version.get(), recomputed_cols, 'updated in')
 
@@ -588,7 +592,7 @@
         sql_exprs = list(
             exprs.Expr.list_subexprs(analyzer.all_exprs, filter=analyzer.sql_elements.contains, traverse_matches=False)
         )
-        row_builder = exprs.RowBuilder(analyzer.all_exprs, [], sql_exprs)
+        row_builder = exprs.RowBuilder(analyzer.all_exprs, [], sql_exprs, target)
         analyzer.finalize(row_builder)
         sql_lookup_node = exec.SqlLookupNode(tbl, row_builder, sql_exprs, sa_key_cols, key_vals)
         col_vals = [{col: row[col].val for col in updated_cols} for row in batch]
@@ -602,8 +606,7 @@
         row_builder.set_slot_idxs(select_list, remove_duplicates=False)
         for i, col in enumerate(all_base_cols):
             plan.row_builder.add_table_column(col, select_list[i].slot_idx)
-
-        ctx = exec.ExecContext(row_builder)
+        ctx = exec.ExecContext(row_builder, num_computed_exprs=len(recomputed_exprs))
         # we're returning everything to the user, so we might as well do it in a single batch
         ctx.batch_size = 0
         plan.set_ctx(ctx)
@@ -695,7 +698,7 @@
         base_analyzer = Analyzer(
             from_clause, iterator_args, where_clause=target.predicate, sample_clause=target.sample_clause
         )
-        row_builder = exprs.RowBuilder(base_analyzer.all_exprs, stored_cols, [])
+        row_builder = exprs.RowBuilder(base_analyzer.all_exprs, stored_cols, [], target)
 
         # if we're propagating an insert, we only want to see those base rows that were created for the current version
         # execution plan:
@@ -832,7 +835,11 @@
             order_by_clause=order_by_clause,
             sample_clause=sample_clause,
         )
-        row_builder = exprs.RowBuilder(analyzer.all_exprs, [], [])
+        # If the from_clause has a single table, we can use it as the context table for the RowBuilder.
+        # Otherwise there is no context table, but that's ok, because the context table is only needed for
+        # table mutations, which can't happen during a join.
+        context_tbl = from_clause.tbls[0].tbl_version.get() if len(from_clause.tbls) == 1 else None
+        row_builder = exprs.RowBuilder(analyzer.all_exprs, [], [], context_tbl)
 
         analyzer.finalize(row_builder)
         # select_list: we need to materialize everything that's been collected
@@ -1035,16 +1042,14 @@
         return Analyzer(FromClause(tbls=[tbl]), [], where_clause=where_clause)
 
     @classmethod
-    def create_add_column_plan(
-        cls, tbl: catalog.TableVersionPath, col: catalog.Column
-    ) -> tuple[exec.ExecNode, Optional[int]]:
+    def create_add_column_plan(cls, tbl: catalog.TableVersionPath, col: catalog.Column) -> exec.ExecNode:
         """Creates a plan for InsertableTable.add_column()
         Returns:
             plan: the plan to execute
             value_expr slot idx for the plan output (for computed cols)
         """
         assert isinstance(tbl, catalog.TableVersionPath)
-        row_builder = exprs.RowBuilder(output_exprs=[], columns=[col], input_exprs=[])
+        row_builder = exprs.RowBuilder(output_exprs=[], columns=[col], input_exprs=[], tbl=tbl.tbl_version.get())
         analyzer = Analyzer(FromClause(tbls=[tbl]), row_builder.default_eval_ctx.target_exprs)
         plan = cls._create_query_plan(
             row_builder=row_builder, analyzer=analyzer, eval_ctx=row_builder.default_eval_ctx, with_pk=True
@@ -1056,5 +1061,4 @@
         # we want to flush images
         if col.is_computed and col.is_stored and col.col_type.is_image_type():
             plan.set_stored_img_cols(row_builder.output_slot_idxs())
-        value_expr_slot_idx = row_builder.output_slot_idxs()[0].slot_idx if col.is_computed else None
-        return plan, value_expr_slot_idx
+        return plan
pixeltable/store.py CHANGED
@@ -7,13 +7,14 @@ import sys
 import urllib.parse
 import urllib.request
 import warnings
-from typing import Any, Iterable, Iterator, Literal, Optional, Union
+from typing import Any, Iterable, Iterator, Optional, Union
 
 import more_itertools
 import sqlalchemy as sql
 from tqdm import TqdmWarning, tqdm
 
-from pixeltable import catalog, exceptions as excs, exprs
+from pixeltable import catalog, exceptions as excs
+from pixeltable.catalog import RowCountStats, UpdateStatus
 from pixeltable.env import Env
 from pixeltable.exec import ExecNode
 from pixeltable.metadata import schema
@@ -41,7 +42,10 @@ class StoreBase:
     v_max_col: sql.Column
     base: Optional[StoreBase]
 
-    __INSERT_BATCH_SIZE = 1000
+    # In my cursory experiments this was the optimal batch size: it was an improvement over 5_000 and there was no real
+    # benefit to going higher.
+    # TODO: Perform more rigorous experiments with different table structures and OS environments to refine this.
+    __INSERT_BATCH_SIZE = 10_000
 
     def __init__(self, tbl_version: catalog.TableVersion):
         self.tbl_version = catalog.TableVersionHandle(
@@ -124,13 +128,14 @@
 
     def _move_tmp_media_file(self, file_url: Optional[str], col: catalog.Column, v_min: int) -> str:
         """Move tmp media file with given url to Env.media_dir and return new url, or given url if not a tmp_dir file"""
-        pxt_tmp_dir = str(Env.get().tmp_dir)
         if file_url is None:
             return None
+        assert isinstance(file_url, str), type(file_url)
+        pxt_tmp_dir = str(Env.get().tmp_dir)
         parsed = urllib.parse.urlparse(file_url)
         # We should never be passed a local file path here. The "len > 1" ensures that Windows
         # file paths aren't mistaken for URLs with a single-character scheme.
-        assert len(parsed.scheme) > 1
+        assert len(parsed.scheme) > 1, file_url
         if parsed.scheme != 'file':
             # remote url
             return file_url
@@ -145,27 +150,11 @@
         return new_file_url
 
     def _move_tmp_media_files(
-        self, table_rows: list[dict[str, Any]], media_cols: list[catalog.Column], v_min: int
+        self, table_row: list[Any], media_cols_by_sql_idx: dict[int, catalog.Column], v_min: int
     ) -> None:
         """Move tmp media files that we generated to a permanent location"""
-        for c in media_cols:
-            for table_row in table_rows:
-                file_url = table_row[c.store_name()]
-                table_row[c.store_name()] = self._move_tmp_media_file(file_url, c, v_min)
-
-    def _create_table_row(
-        self, input_row: exprs.DataRow, row_builder: exprs.RowBuilder, exc_col_ids: set[int], pk: tuple[int, ...]
-    ) -> tuple[dict[str, Any], int]:
-        """Return Tuple[complete table row, # of exceptions] for insert()
-        Creates a row that includes the PK columns, with the values from input_row.pk.
-        Returns:
-            Tuple[complete table row, # of exceptions]
-        """
-        table_row, num_excs = row_builder.create_table_row(input_row, exc_col_ids)
-        assert len(pk) == len(self._pk_cols)
-        for pk_col, pk_val in zip(self._pk_cols, pk):
-            table_row[pk_col.name] = pk_val
-        return table_row, num_excs
+        for n, col in media_cols_by_sql_idx.items():
+            table_row[n] = self._move_tmp_media_file(table_row[n], col, v_min)
 
     def count(self) -> int:
         """Return the number of rows visible in self.tbl_version"""
@@ -231,9 +220,7 @@
         if col.store_name() not in existing_cols:
             self.add_column(col)
 
-    def load_column(
-        self, col: catalog.Column, exec_plan: ExecNode, value_expr_slot_idx: int, on_error: Literal['abort', 'ignore']
-    ) -> int:
+    def load_column(self, col: catalog.Column, exec_plan: ExecNode, abort_on_exc: bool) -> int:
         """Update store column of a computed column with values produced by an execution plan
 
         Returns:
@@ -247,60 +234,51 @@
         num_rows = 0
         # create temp table to store output of exec_plan, with the same primary key as the store table
         tmp_name = f'temp_{self._storage_name()}'
-        tmp_pk_cols = [sql.Column(col.name, col.type, primary_key=True) for col in self.pk_columns()]
-        tmp_cols = tmp_pk_cols.copy()
+        tmp_pk_cols = tuple(sql.Column(col.name, col.type, primary_key=True) for col in self.pk_columns())
+        tmp_val_col_sql_idx = len(tmp_pk_cols)
         tmp_val_col = sql.Column(col.sa_col.name, col.sa_col.type)
-        tmp_cols.append(tmp_val_col)
+        tmp_cols = [*tmp_pk_cols, tmp_val_col]
         # add error columns if the store column records errors
         if col.records_errors:
             tmp_errortype_col = sql.Column(col.sa_errortype_col.name, col.sa_errortype_col.type)
-            tmp_cols.append(tmp_errortype_col)
             tmp_errormsg_col = sql.Column(col.sa_errormsg_col.name, col.sa_errormsg_col.type)
-            tmp_cols.append(tmp_errormsg_col)
+            tmp_cols.extend((tmp_errortype_col, tmp_errormsg_col))
+        tmp_col_names = [col.name for col in tmp_cols]
+
         tmp_tbl = sql.Table(tmp_name, self.sa_md, *tmp_cols, prefixes=['TEMPORARY'])
         conn = Env.get().conn
         tmp_tbl.create(bind=conn)
 
+        row_builder = exec_plan.row_builder
+
         try:
+            table_rows: list[tuple[Any]] = []
+
             # insert rows from exec_plan into temp table
-            # TODO: unify the table row construction logic with RowBuilder.create_table_row()
             for row_batch in exec_plan:
                 num_rows += len(row_batch)
-                tbl_rows: list[dict[str, Any]] = []
-                for result_row in row_batch:
-                    tbl_row: dict[str, Any] = {}
-                    for pk_col, pk_val in zip(self.pk_columns(), result_row.pk):
-                        tbl_row[pk_col.name] = pk_val
-
-                    if col.is_computed:
-                        if result_row.has_exc(value_expr_slot_idx):
-                            num_excs += 1
-                            value_exc = result_row.get_exc(value_expr_slot_idx)
-                            if on_error == 'abort':
-                                raise excs.Error(
-                                    f'Error while evaluating computed column `{col.name}`:\n{value_exc}'
-                                ) from value_exc
-                            # we store a NULL value and record the exception/exc type
-                            error_type = type(value_exc).__name__
-                            error_msg = str(value_exc)
-                            tbl_row[col.sa_col.name] = None
-                            tbl_row[col.sa_errortype_col.name] = error_type
-                            tbl_row[col.sa_errormsg_col.name] = error_msg
-                        else:
-                            if col.col_type.is_image_type() and result_row.file_urls[value_expr_slot_idx] is None:
-                                # we have yet to store this image
-                                filepath = str(MediaStore.prepare_media_path(col.tbl.id, col.id, col.tbl.version))
-                                result_row.flush_img(value_expr_slot_idx, filepath)
-                            val = result_row.get_stored_val(value_expr_slot_idx, col.sa_col.type)
-                            if col.col_type.is_media_type():
-                                val = self._move_tmp_media_file(val, col, result_row.pk[-1])
-                            tbl_row[col.sa_col.name] = val
-                            if col.records_errors:
-                                tbl_row[col.sa_errortype_col.name] = None
-                                tbl_row[col.sa_errormsg_col.name] = None
-
-                    tbl_rows.append(tbl_row)
-                conn.execute(sql.insert(tmp_tbl), tbl_rows)
+                batch_table_rows: list[tuple[Any]] = []
+
+                for row in row_batch:
+                    if abort_on_exc and row.has_exc():
+                        exc = row.get_first_exc()
+                        raise excs.Error(f'Error while evaluating computed column {col.name!r}:\n{exc}') from exc
+                    table_row, num_row_exc = row_builder.create_table_row(row, None, row.pk)
+                    if col.col_type.is_media_type():
+                        table_row[tmp_val_col_sql_idx] = self._move_tmp_media_file(
+                            table_row[tmp_val_col_sql_idx], col, row.pk[-1]
+                        )
+                    num_excs += num_row_exc
+                    batch_table_rows.append(tuple(table_row))
+
+                table_rows.extend(batch_table_rows)
+
+                if len(table_rows) >= self.__INSERT_BATCH_SIZE:
+                    self.sql_insert(tmp_tbl, tmp_col_names, table_rows)
+                    table_rows.clear()
+
+            if len(table_rows) > 0:
+                self.sql_insert(tmp_tbl, tmp_col_names, table_rows)
 
             # update store table with values from temp table
             update_stmt = sql.update(self.sa_tbl)
@@ -313,6 +291,7 @@
             )
             log_explain(_logger, update_stmt, conn)
             conn.execute(update_stmt)
+
         finally:
 
             def remove_tmp_tbl() -> None:
@@ -320,6 +299,7 @@
                 tmp_tbl.drop(bind=conn)
 
             run_cleanup(remove_tmp_tbl, raise_error=True)
+
         return num_excs
 
     def insert_rows(
@@ -329,7 +309,7 @@
         show_progress: bool = True,
         rowids: Optional[Iterator[int]] = None,
        abort_on_exc: bool = False,
-    ) -> tuple[int, int, set[int]]:
+    ) -> tuple[set[int], UpdateStatus]:
        """Insert rows into the store table and update the catalog table's md
        Returns:
            number of inserted rows, number of exceptions, set of column ids that have exceptions
@@ -341,50 +321,81 @@
         cols_with_excs: set[int] = set()
         progress_bar: Optional[tqdm] = None  # create this only after we started executing
         row_builder = exec_plan.row_builder
-        media_cols = [info.col for info in row_builder.table_columns if info.col.col_type.is_media_type()]
-        conn = Env.get().conn
+
+        store_col_names, media_cols_by_idx = row_builder.store_column_names()
 
         try:
+            table_rows: list[tuple[Any]] = []
             exec_plan.open()
+
             for row_batch in exec_plan:
                 num_rows += len(row_batch)
-                for batch_start_idx in range(0, len(row_batch), self.__INSERT_BATCH_SIZE):
-                    # compute batch of rows and convert them into table rows
-                    table_rows: list[dict[str, Any]] = []
-                    batch_stop_idx = min(batch_start_idx + self.__INSERT_BATCH_SIZE, len(row_batch))
-                    for row_idx in range(batch_start_idx, batch_stop_idx):
-                        row = row_batch[row_idx]
-                        # if abort_on_exc == True, we need to check for media validation exceptions
-                        if abort_on_exc and row.has_exc():
-                            exc = row.get_first_exc()
-                            raise exc
-
-                        rowid = (next(rowids),) if rowids is not None else row.pk[:-1]
-                        pk = (*rowid, v_min)
-                        table_row, num_row_exc = self._create_table_row(row, row_builder, cols_with_excs, pk=pk)
-                        num_excs += num_row_exc
-                        table_rows.append(table_row)
-
-                        if show_progress:
-                            if progress_bar is None:
-                                warnings.simplefilter('ignore', category=TqdmWarning)
-                                progress_bar = tqdm(
-                                    desc=f'Inserting rows into `{self.tbl_version.get().name}`',
-                                    unit=' rows',
-                                    ncols=100,
-                                    file=sys.stdout,
-                                )
-                            progress_bar.update(1)
-
-                    # insert batch of rows
-                    self._move_tmp_media_files(table_rows, media_cols, v_min)
-                    conn.execute(sql.insert(self.sa_tbl), table_rows)
+                batch_table_rows: list[tuple[Any]] = []
+
+                # compute batch of rows and convert them into table rows
+                for row in row_batch:
+                    # if abort_on_exc == True, we need to check for media validation exceptions
+                    if abort_on_exc and row.has_exc():
+                        exc = row.get_first_exc()
+                        raise exc
+
+                    rowid = (next(rowids),) if rowids is not None else row.pk[:-1]
+                    pk = (*rowid, v_min)
+                    assert len(pk) == len(self._pk_cols)
+                    table_row, num_row_exc = row_builder.create_table_row(row, cols_with_excs, pk)
+                    num_excs += num_row_exc
+
+                    if show_progress:
+                        if progress_bar is None:
+                            warnings.simplefilter('ignore', category=TqdmWarning)
+                            progress_bar = tqdm(
+                                desc=f'Inserting rows into `{self.tbl_version.get().name}`',
+                                unit=' rows',
+                                ncols=100,
+                                file=sys.stdout,
+                            )
+                        progress_bar.update(1)
+
+                    self._move_tmp_media_files(table_row, media_cols_by_idx, v_min)
+                    batch_table_rows.append(tuple(table_row))
+
+                table_rows.extend(batch_table_rows)
+
+                # if a batch is ready for insertion into the database, insert it
+                if len(table_rows) >= self.__INSERT_BATCH_SIZE:
+                    self.sql_insert(self.sa_tbl, store_col_names, table_rows)
+                    table_rows.clear()
+
+            # insert any remaining rows
+            if len(table_rows) > 0:
+                self.sql_insert(self.sa_tbl, store_col_names, table_rows)
+
             if progress_bar is not None:
                 progress_bar.close()
-            return num_rows, num_excs, cols_with_excs
+            computed_values = exec_plan.ctx.num_computed_exprs * num_rows
+            row_counts = RowCountStats(
+                ins_rows=num_rows, num_excs=num_excs, computed_values=computed_values
+            )  # insert (StoreBase)
+
+            return cols_with_excs, UpdateStatus(row_count_stats=row_counts)
+
         finally:
             exec_plan.close()
 
+    @classmethod
+    def sql_insert(cls, sa_tbl: sql.Table, store_col_names: list[str], table_rows: list[tuple[Any]]) -> None:
+        assert len(table_rows) > 0
+        conn = Env.get().conn
+        conn.execute(sql.insert(sa_tbl), [dict(zip(store_col_names, table_row)) for table_row in table_rows])
+
+        # TODO: Inserting directly via psycopg delivers a small performance benefit, but is somewhat fraught due to
+        # differences in the data representation that SQLAlchemy/psycopg expect. The below code will do the
+        # insertion in psycopg and can be used if/when we decide to pursue that optimization.
+        # col_names_str = ", ".join(store_col_names)
+        # placeholders_str = ", ".join('%s' for _ in store_col_names)
+        # stmt_text = f'INSERT INTO {self.sa_tbl.name} ({col_names_str}) VALUES ({placeholders_str})'
+        # conn.exec_driver_sql(stmt_text, table_rows)
+
     def _versions_clause(self, versions: list[Optional[int]], match_on_vmin: bool) -> sql.ColumnElement[bool]:
         """Return filter for base versions"""
         v = versions[0]
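
For orientation, a minimal sketch of the tuple-to-dict conversion that the new sql_insert helper performs before handing a batch to SQLAlchemy; the column names and values below are made-up placeholders, not the real store schema (the actual names come from RowBuilder.store_column_names()):

store_col_names = ['rowid', 'v_min', 'col_0']   # hypothetical store column names
table_rows = [(1, 0, 'a'), (2, 0, 'b')]         # positional tuples accumulated per batch

# sql.insert(sa_tbl) is executed with a list of parameter dicts, so each tuple is zipped with the names:
params = [dict(zip(store_col_names, row)) for row in table_rows]
# -> [{'rowid': 1, 'v_min': 0, 'col_0': 'a'}, {'rowid': 2, 'v_min': 0, 'col_0': 'b'}]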
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: pixeltable
-Version: 0.4.1
+Version: 0.4.2
 Summary: AI Data Infrastructure: Declarative, Multimodal, and Incremental
 License: Apache-2.0
 Keywords: data-science,machine-learning,database,ai,computer-vision,chatbot,ml,artificial-intelligence,feature-engineering,multimodal,mlops,feature-store,vector-database,llm,genai