pixeltable 0.2.7__py3-none-any.whl → 0.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of pixeltable might be problematic.

Files changed (76)
  1. pixeltable/__init__.py +15 -33
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/catalog.py +1 -1
  4. pixeltable/catalog/column.py +28 -16
  5. pixeltable/catalog/dir.py +2 -2
  6. pixeltable/catalog/insertable_table.py +5 -55
  7. pixeltable/catalog/named_function.py +2 -2
  8. pixeltable/catalog/schema_object.py +2 -7
  9. pixeltable/catalog/table.py +298 -204
  10. pixeltable/catalog/table_version.py +104 -139
  11. pixeltable/catalog/table_version_path.py +22 -4
  12. pixeltable/catalog/view.py +20 -10
  13. pixeltable/dataframe.py +128 -25
  14. pixeltable/env.py +21 -14
  15. pixeltable/exec/exec_context.py +5 -0
  16. pixeltable/exec/exec_node.py +1 -0
  17. pixeltable/exec/in_memory_data_node.py +29 -24
  18. pixeltable/exec/sql_scan_node.py +1 -1
  19. pixeltable/exprs/column_ref.py +13 -8
  20. pixeltable/exprs/data_row.py +4 -0
  21. pixeltable/exprs/expr.py +16 -1
  22. pixeltable/exprs/function_call.py +4 -4
  23. pixeltable/exprs/row_builder.py +29 -20
  24. pixeltable/exprs/similarity_expr.py +4 -3
  25. pixeltable/ext/functions/yolox.py +2 -1
  26. pixeltable/func/__init__.py +1 -0
  27. pixeltable/func/aggregate_function.py +14 -12
  28. pixeltable/func/callable_function.py +8 -6
  29. pixeltable/func/expr_template_function.py +13 -19
  30. pixeltable/func/function.py +3 -6
  31. pixeltable/func/query_template_function.py +84 -0
  32. pixeltable/func/signature.py +68 -23
  33. pixeltable/func/udf.py +13 -10
  34. pixeltable/functions/__init__.py +6 -91
  35. pixeltable/functions/eval.py +26 -14
  36. pixeltable/functions/fireworks.py +25 -23
  37. pixeltable/functions/globals.py +62 -0
  38. pixeltable/functions/huggingface.py +20 -16
  39. pixeltable/functions/image.py +170 -1
  40. pixeltable/functions/openai.py +95 -128
  41. pixeltable/functions/string.py +10 -2
  42. pixeltable/functions/together.py +95 -84
  43. pixeltable/functions/util.py +16 -0
  44. pixeltable/functions/video.py +94 -16
  45. pixeltable/functions/whisper.py +78 -0
  46. pixeltable/globals.py +1 -1
  47. pixeltable/io/__init__.py +10 -0
  48. pixeltable/io/external_store.py +370 -0
  49. pixeltable/io/globals.py +50 -22
  50. pixeltable/{datatransfer → io}/label_studio.py +279 -166
  51. pixeltable/io/parquet.py +1 -1
  52. pixeltable/iterators/__init__.py +9 -0
  53. pixeltable/iterators/string.py +40 -0
  54. pixeltable/metadata/__init__.py +6 -8
  55. pixeltable/metadata/converters/convert_10.py +2 -4
  56. pixeltable/metadata/converters/convert_12.py +7 -2
  57. pixeltable/metadata/converters/convert_13.py +6 -8
  58. pixeltable/metadata/converters/convert_14.py +2 -4
  59. pixeltable/metadata/converters/convert_15.py +40 -25
  60. pixeltable/metadata/converters/convert_16.py +18 -0
  61. pixeltable/metadata/converters/util.py +11 -8
  62. pixeltable/metadata/schema.py +3 -6
  63. pixeltable/plan.py +8 -7
  64. pixeltable/store.py +1 -1
  65. pixeltable/tool/create_test_db_dump.py +145 -54
  66. pixeltable/tool/embed_udf.py +9 -0
  67. pixeltable/type_system.py +1 -2
  68. pixeltable/utils/code.py +34 -0
  69. {pixeltable-0.2.7.dist-info → pixeltable-0.2.9.dist-info}/METADATA +2 -2
  70. pixeltable-0.2.9.dist-info/RECORD +131 -0
  71. pixeltable/datatransfer/__init__.py +0 -1
  72. pixeltable/datatransfer/remote.py +0 -113
  73. pixeltable/functions/pil/image.py +0 -147
  74. pixeltable-0.2.7.dist-info/RECORD +0 -126
  75. {pixeltable-0.2.7.dist-info → pixeltable-0.2.9.dist-info}/LICENSE +0 -0
  76. {pixeltable-0.2.7.dist-info → pixeltable-0.2.9.dist-info}/WHEEL +0 -0
pixeltable/dataframe.py CHANGED
@@ -9,7 +9,7 @@ import logging
  import mimetypes
  import traceback
  from pathlib import Path
- from typing import List, Optional, Any, Dict, Generator, Tuple, Set
+ from typing import List, Optional, Any, Dict, Iterator, Tuple, Set

  import PIL.Image
  import cv2
@@ -22,6 +22,7 @@ import pixeltable.catalog as catalog
  import pixeltable.exceptions as excs
  import pixeltable.exprs as exprs
  import pixeltable.type_system as ts
+ import pixeltable.func as func
  from pixeltable.catalog import is_valid_identifier
  from pixeltable.env import Env
  from pixeltable.plan import Planner
@@ -344,7 +345,37 @@ class DataFrame:
  assert set(out_names) == seen_out_names
  return out_exprs, out_names

- def _exec(self) -> Generator[exprs.DataRow, None, None]:
+ def _vars(self) -> dict[str, exprs.Variable]:
+ """
+ Return a dict mapping variable name to Variable for all Variables contained in any component of the DataFrame
+ """
+ all_exprs: list[exprs.Expr] = []
+ all_exprs.extend(self._select_list_exprs)
+ if self.where_clause is not None:
+ all_exprs.append(self.where_clause)
+ if self.group_by_clause is not None:
+ all_exprs.extend(self.group_by_clause)
+ if self.order_by_clause is not None:
+ all_exprs.extend([expr for expr, _ in self.order_by_clause])
+ vars = exprs.Expr.list_subexprs(all_exprs, expr_class=exprs.Variable)
+ unique_vars: dict[str, exprs.Variable] = {}
+ for var in vars:
+ if var.name not in unique_vars:
+ unique_vars[var.name] = var
+ else:
+ if unique_vars[var.name].col_type != var.col_type:
+ raise excs.Error(f'Multiple definitions of parameter {var.name}')
+ return unique_vars
+
+ def parameters(self) -> dict[str, ColumnType]:
+ """Return a dict mapping parameter name to parameter type.
+
+ Parameters are Variables contained in any component of the DataFrame.
+ """
+ vars = self._vars()
+ return {name: var.col_type for name, var in vars.items()}
+
+ def _exec(self, conn: Optional[sql.engine.Connection] = None) -> Iterator[exprs.DataRow]:
  """Run the query and return rows as a generator.
  This function must not modify the state of the DataFrame, otherwise it breaks dataset caching.
  """
@@ -361,6 +392,7 @@

  for item in self._select_list_exprs:
  item.bind_rel_paths(None)
+
  plan = Planner.create_query_plan(
  self.tbl,
  self._select_list_exprs,
@@ -370,8 +402,8 @@
  limit=self.limit_val if self.limit_val is not None else 0,
  ) # limit_val == 0: no limit_val

- with Env.get().engine.begin() as conn:
- plan.ctx.conn = conn
+ def exec_plan(conn: sql.engine.Connection) -> Iterator[exprs.DataRow]:
+ plan.ctx.set_conn(conn)
  plan.open()
  try:
  for row_batch in plan:
@@ -379,7 +411,12 @@
  yield data_row
  finally:
  plan.close()
- return
+
+ if conn is None:
+ with Env.get().engine.begin() as conn:
+ yield from exec_plan(conn)
+ else:
+ yield from exec_plan(conn)

  def show(self, n: int = 20) -> DataFrameResultSet:
  assert n is not None
@@ -407,10 +444,54 @@
  def get_column_types(self) -> List[ColumnType]:
  return [expr.col_type for expr in self._select_list_exprs]

+ def bind(self, args: dict[str, Any]) -> DataFrame:
+ """Bind arguments to parameters and return a new DataFrame."""
+ # substitute Variables with the corresponding values according to 'args', converted to Literals
+ select_list_exprs = copy.deepcopy(self._select_list_exprs)
+ where_clause = copy.deepcopy(self.where_clause)
+ group_by_clause = copy.deepcopy(self.group_by_clause)
+ order_by_exprs = [copy.deepcopy(order_by_expr) for order_by_expr, _ in self.order_by_clause] \
+ if self.order_by_clause is not None else None
+
+ var_exprs: dict[exprs.Expr, exprs.Expr] = {}
+ vars = self._vars()
+ for arg_name, arg_val in args.items():
+ if arg_name not in vars:
+ # ignore unused variables
+ continue
+ var_expr = vars[arg_name]
+ arg_expr = exprs.Expr.from_object(arg_val)
+ if arg_expr is None:
+ raise excs.Error(f'Cannot convert argument {arg_val} to a Pixeltable expression')
+ var_exprs[var_expr] = arg_expr
+
+ exprs.Expr.list_substitute(select_list_exprs, var_exprs)
+ if where_clause is not None:
+ where_clause.substitute(var_exprs)
+ if group_by_clause is not None:
+ exprs.Expr.list_substitute(group_by_clause, var_exprs)
+ if order_by_exprs is not None:
+ exprs.Expr.list_substitute(order_by_exprs, var_exprs)
+
+ select_list = list(zip(select_list_exprs, self._column_names))
+ order_by_clause: Optional[list[tuple[exprs.Expr, bool]]] = None
+ if order_by_exprs is not None:
+ order_by_clause = [
+ (expr, asc) for expr, asc in zip(order_by_exprs, [asc for _, asc in self.order_by_clause])
+ ]
+
+ return DataFrame(
+ self.tbl, select_list=select_list, where_clause=where_clause,
+ group_by_clause=group_by_clause, grouping_tbl=self.grouping_tbl,
+ order_by_clause=order_by_clause, limit=self.limit_val)
+
  def collect(self) -> DataFrameResultSet:
+ return self._collect()
+
+ def _collect(self, conn: Optional[sql.engine.Connection] = None) -> DataFrameResultSet:
  try:
  result_rows = []
- for data_row in self._exec():
+ for data_row in self._exec(conn):
  result_row = [data_row[e.slot_idx] for e in self._select_list_exprs]
  result_rows.append(result_row)
  except excs.ExprEvalError as e:
@@ -579,10 +660,10 @@
  if len(grouping_items) > 1:
  raise excs.Error(f'group_by(): only one table can be specified')
  # we need to make sure that the grouping table is a base of self.tbl
- base = self.tbl.find_tbl_version(item.tbl_version_path.tbl_id())
+ base = self.tbl.find_tbl_version(item._tbl_version_path.tbl_id())
  if base is None or base.id == self.tbl.tbl_id():
  raise excs.Error(f'group_by(): {item.name} is not a base table of {self.tbl.tbl_name()}')
- grouping_tbl = item.tbl_version_path.tbl_version
+ grouping_tbl = item._tbl_version_path.tbl_version
  break
  if not isinstance(item, exprs.Expr):
  raise excs.Error(f'Invalid expression in group_by(): {item}')
@@ -615,6 +696,7 @@
  )

  def limit(self, n: int) -> DataFrame:
+ # TODO: allow n to be a Variable that can be substituted in bind()
  assert n is not None and isinstance(n, int)
  return DataFrame(
  self.tbl,
@@ -643,7 +725,7 @@
  return self.select(*index)
  raise TypeError(f'Invalid index type: {type(index)}')

- def _as_dict(self) -> Dict[str, Any]:
+ def as_dict(self) -> Dict[str, Any]:
  """
  Returns:
  Dictionary representing this dataframe.
@@ -651,22 +733,46 @@
  tbl_versions = self.tbl.get_tbl_versions()
  d = {
  '_classname': 'DataFrame',
- 'tbl_ids': [str(t.id) for t in tbl_versions],
- 'tbl_versions': [t.version for t in tbl_versions],
- 'select_list': [(e.as_dict(), name) for (e, name) in self.select_list]
- if self.select_list is not None
- else None,
+ 'tbl': self.tbl.as_dict(),
+ 'select_list':
+ [(e.as_dict(), name) for (e, name) in self.select_list] if self.select_list is not None else None,
  'where_clause': self.where_clause.as_dict() if self.where_clause is not None else None,
- 'group_by_clause': [e.as_dict() for e in self.group_by_clause]
- if self.group_by_clause is not None
- else None,
- 'order_by_clause': [(e.as_dict(), asc) for (e, asc) in self.order_by_clause]
- if self.order_by_clause is not None
- else None,
+ 'group_by_clause':
+ [e.as_dict() for e in self.group_by_clause] if self.group_by_clause is not None else None,
+ 'grouping_tbl': self.grouping_tbl.as_dict() if self.grouping_tbl is not None else None,
+ 'order_by_clause':
+ [(e.as_dict(), asc) for (e,asc) in self.order_by_clause] if self.order_by_clause is not None else None,
  'limit_val': self.limit_val,
  }
  return d

+ @classmethod
+ def from_dict(cls, d: Dict[str, Any]) -> 'DataFrame':
+ tbl = catalog.TableVersionPath.from_dict(d['tbl'])
+ select_list = [(exprs.Expr.from_dict(e), name) for e, name in d['select_list']] \
+ if d['select_list'] is not None else None
+ where_clause = exprs.Predicate.from_dict(d['where_clause']) \
+ if d['where_clause'] is not None else None
+ group_by_clause = [exprs.Expr.from_dict(e) for e in d['group_by_clause']] \
+ if d['group_by_clause'] is not None else None
+ grouping_tbl = catalog.TableVersion.from_dict(d['grouping_tbl']) \
+ if d['grouping_tbl'] is not None else None
+ order_by_clause = [(exprs.Expr.from_dict(e), asc) for e, asc in d['order_by_clause']] \
+ if d['order_by_clause'] is not None else None
+ limit_val = d['limit_val']
+ return DataFrame(
+ tbl, select_list=select_list, where_clause=where_clause, group_by_clause=group_by_clause,
+ grouping_tbl=grouping_tbl, order_by_clause=order_by_clause, limit=limit_val)
+
+ def _hash_result_set(self) -> str:
+ """Return a hash that changes when the result set changes."""
+ d = self.as_dict()
+ # add list of referenced table versions (the actual versions, not the effective ones) in order to force cache
+ # invalidation when any of the referenced tables changes
+ d['tbl_versions'] = [tbl_version.version for tbl_version in self.tbl.get_tbl_versions()]
+ summary_string = json.dumps(d)
+ return hashlib.sha256(summary_string.encode()).hexdigest()
+
  def to_coco_dataset(self) -> Path:
  """Convert the dataframe to a COCO dataset.
  This dataframe must return a single json-typed output column in the following format:
@@ -686,9 +792,7 @@
  """
  from pixeltable.utils.coco import write_coco_dataset

- summary_string = json.dumps(self._as_dict())
- cache_key = hashlib.sha256(summary_string.encode()).hexdigest()
-
+ cache_key = self._hash_result_set()
  dest_path = Env.get().dataset_cache_dir / f'coco_{cache_key}'
  if dest_path.exists():
  assert dest_path.is_dir()
@@ -737,8 +841,7 @@
  from pixeltable.io.parquet import save_parquet # pylint: disable=import-outside-toplevel
  from pixeltable.utils.pytorch import PixeltablePytorchDataset # pylint: disable=import-outside-toplevel

- summary_string = json.dumps(self._as_dict())
- cache_key = hashlib.sha256(summary_string.encode()).hexdigest()
+ cache_key = self._hash_result_set()

  dest_path = (Env.get().dataset_cache_dir / f'df_{cache_key}').with_suffix('.parquet') # pylint: disable = protected-access
  if dest_path.exists(): # fast path: use cache
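
The new parameters()/bind() pair turns a DataFrame with free Variables into a reusable, parameterized query, presumably in support of the new func/query_template_function.py module listed above. A minimal usage sketch (not part of this diff; only parameters() and bind() are confirmed above, while the exprs.Variable constructor and the 0.2.x create_table schema syntax are assumptions):

    import pixeltable as pxt
    import pixeltable.exprs as exprs
    import pixeltable.type_system as ts

    t = pxt.create_table('films', {'title': ts.StringType(), 'year': ts.IntType()})

    # a query with a free parameter 'year' (hypothetical Variable construction)
    year_param = exprs.Variable('year', ts.IntType())
    df = t.where(t.year >= year_param).select(t.title)

    print(df.parameters())           # e.g. {'year': IntType}
    bound = df.bind({'year': 2000})  # Variables are replaced by Literal exprs
    rows = bound.collect()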
pixeltable/env.py CHANGED
@@ -14,7 +14,7 @@ import uuid
  import warnings
  from dataclasses import dataclass
  from pathlib import Path
- from typing import Callable, Optional, Dict, Any, List
+ from typing import Callable, Optional, Dict, Any, List, TYPE_CHECKING

  import pgserver
  import sqlalchemy as sql
@@ -25,6 +25,9 @@ import pixeltable.exceptions as excs
  from pixeltable import metadata
  from pixeltable.utils.http_server import make_server

+ if TYPE_CHECKING:
+ import spacy
+

  class Env:
  """
@@ -63,12 +66,10 @@
  # info about installed packages that are utilized by some parts of the code;
  # package name -> version; version == []: package is installed, but we haven't determined the version yet
  self._installed_packages: Dict[str, Optional[List[int]]] = {}
- self._spacy_nlp: Optional[Any] = None # spacy.Language
+ self._spacy_nlp: Optional[spacy.Language] = None
  self._httpd: Optional[http.server.HTTPServer] = None
  self._http_address: Optional[str] = None

- self._registered_clients: dict[str, ApiClient] = {}
-
  # logging-related state
  self._logger = logging.getLogger('pixeltable')
  self._logger.setLevel(logging.DEBUG) # allow everything to pass, we filter in _log_filter()
@@ -177,8 +178,6 @@
  if self._initialized:
  return

- # Disable spurious warnings
- warnings.simplefilter('ignore', category=TqdmWarning)
  os.environ['TOKENIZERS_PARALLELISM'] = 'false'

  self._initialized = True
@@ -203,6 +202,12 @@
  else:
  self._config = {}

+ # Disable spurious warnings
+ warnings.simplefilter('ignore', category=TqdmWarning)
+ if 'hide_warnings' in self._config and self._config['hide_warnings']:
+ # Disable more warnings
+ warnings.simplefilter('ignore', category=UserWarning)
+

  if self._home.exists() and not self._home.is_dir():
  raise RuntimeError(f'{self._home} is not a directory')
@@ -354,11 +359,6 @@
  def _upgrade_metadata(self) -> None:
  metadata.upgrade_md(self._sa_engine)

- def _register_client(self, name: str, init_fn: Callable) -> None:
- sig = inspect.signature(init_fn)
- param_names = list(sig.parameters.keys())
- self._registered_clients[name] = ApiClient(init_fn=init_fn, param_names=param_names)
-
  def get_client(self, name: str) -> Any:
  """
  Gets the client with the specified name, initializing it if necessary.
@@ -366,7 +366,7 @@
  Args:
  - name: The name of the client
  """
- cl = self._registered_clients[name]
+ cl = _registered_clients[name]
  if cl.client_obj is not None:
  return cl.client_obj # Already initialized

@@ -430,6 +430,7 @@
  check('torchvision')
  check('transformers')
  check('sentence_transformers')
+ check('whisper')
  check('yolox')
  check('whisperx')
  check('boto3')
@@ -507,7 +508,7 @@
  return self._sa_engine

  @property
- def spacy_nlp(self) -> Any:
+ def spacy_nlp(self) -> spacy.Language:
  assert self._spacy_nlp is not None
  return self._spacy_nlp

@@ -537,11 +538,17 @@ def register_client(name: str) -> Callable:
  - name (str): The name of the API client (e.g., 'openai' or 'label-studio').
  """
  def decorator(fn: Callable) -> None:
- Env.get()._register_client(name, fn)
+ global _registered_clients
+ sig = inspect.signature(fn)
+ param_names = list(sig.parameters.keys())
+ _registered_clients[name] = ApiClient(init_fn=fn, param_names=param_names)

  return decorator


+ _registered_clients: dict[str, ApiClient] = {}
+
+
  @dataclass
  class ApiClient:
  init_fn: Callable
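
The client registry now lives at module level (_registered_clients) instead of on the Env instance, so @register_client no longer has to touch Env at import time. A minimal sketch of the registration pattern (not part of this diff; MyServiceClient and the 'my_service' name are placeholders):

    from pixeltable.env import Env, register_client

    @register_client('my_service')
    def _(api_key: str):
        # the decorator only records the init function and returns None,
        # hence the throwaway name '_'
        return MyServiceClient(api_key=api_key)  # placeholder client class

    client = Env.get().get_client('my_service')  # lazily initialized on first use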
pixeltable/exec/exec_context.py CHANGED
@@ -13,6 +13,7 @@ class ExecContext:
  ):
  self.show_pbar = show_pbar
  self.batch_size = batch_size
+ self.row_builder = row_builder
  self.profile = exprs.ExecProfile(row_builder)
  # num_rows is used to compute the total number of computed cells used for the progress bar
  self.num_rows: Optional[int] = None
@@ -20,3 +21,7 @@
  self.pk_clause = pk_clause
  self.num_computed_exprs = num_computed_exprs
  self.ignore_errors = ignore_errors
+
+ def set_conn(self, conn: sql.engine.Connection) -> None:
+ self.conn = conn
+ self.row_builder.set_conn(conn)
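
Together with the DataFrame._exec()/_collect() changes above, set_conn() is what lets an enclosing transaction hand its connection to the query plan instead of opening a new one. An illustrative sketch of that flow (internal APIs, not a public interface; df is a DataFrame as in the earlier sketch):

    from pixeltable.env import Env

    with Env.get().engine.begin() as conn:
        # conn is forwarded by _exec() to plan.ctx.set_conn(conn)
        result = df._collect(conn)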
pixeltable/exec/exec_node.py CHANGED
@@ -11,6 +11,7 @@ class ExecNode(abc.ABC):
  def __init__(
  self, row_builder: exprs.RowBuilder, output_exprs: Iterable[exprs.Expr],
  input_exprs: Iterable[exprs.Expr], input: Optional[ExecNode] = None):
+ self.output_exprs = output_exprs
  self.row_builder = row_builder
  self.input = input
  # we flush all image slots that aren't part of our output but are needed to create our output
pixeltable/exec/in_memory_data_node.py CHANGED
@@ -1,25 +1,29 @@
- from typing import List, Dict, Any, Optional
- import urllib
  import logging
- import os
+ from typing import List, Dict, Any, Optional

- from .data_row_batch import DataRowBatch
- from .exec_node import ExecNode
  import pixeltable.catalog as catalog
  import pixeltable.exprs as exprs
- import pixeltable.env as env
  from pixeltable.utils.media_store import MediaStore
-
+ from .data_row_batch import DataRowBatch
+ from .exec_node import ExecNode

  _logger = logging.getLogger('pixeltable')

  class InMemoryDataNode(ExecNode):
- """Outputs in-memory data as a row batch of a particular table"""
+ """
+ Outputs in-memory data as a DataRowBatch of a particular table.
+
+ Populates slots of all non-computed columns (ie, output ColumnRefs)
+ - with the values provided in the input rows
+ - if an input row doesn't provide a value, sets the slot to the column default
+ """
  def __init__(
  self, tbl: catalog.TableVersionPath, rows: List[Dict[str, Any]],
  row_builder: exprs.RowBuilder, start_row_id: int,
  ):
- super().__init__(row_builder, [], [], None)
+ # we materialize all output slots
+ output_exprs = [e for e in row_builder.get_output_exprs() if isinstance(e, exprs.ColumnRef)]
+ super().__init__(row_builder, output_exprs, [], None)
  assert tbl.is_insertable()
  self.tbl = tbl
  self.input_rows = rows
@@ -29,21 +33,22 @@ class InMemoryDataNode(ExecNode):

  def _open(self) -> None:
  """Create row batch and populate with self.input_rows"""
- column_info = {info.col.id: info for info in self.row_builder.output_slot_idxs()}
- # exclude system columns
- user_column_info = {info.col.name: info for _, info in column_info.items() if info.col.name is not None}
- # stored columns that are not computed
- inserted_col_ids = set([
- info.col.id for info in self.row_builder.output_slot_idxs()
- if info.col.is_stored and not info.col.is_computed
- ])
+ user_cols_by_name = {
+ col_ref.col.name: exprs.ColumnSlotIdx(col_ref.col, col_ref.slot_idx)
+ for col_ref in self.output_exprs if col_ref.col.name is not None
+ }
+ output_cols_by_idx = {
+ col_ref.slot_idx: exprs.ColumnSlotIdx(col_ref.col, col_ref.slot_idx)
+ for col_ref in self.output_exprs
+ }
+ output_slot_idxs = {e.slot_idx for e in self.output_exprs}

  self.output_rows = DataRowBatch(self.tbl, self.row_builder, len(self.input_rows))
  for row_idx, input_row in enumerate(self.input_rows):
  # populate the output row with the values provided in the input row
- input_col_ids: List[int] = []
+ input_slot_idxs: set[int] = set()
  for col_name, val in input_row.items():
- col_info = user_column_info.get(col_name)
+ col_info = user_cols_by_name.get(col_name)
  assert col_info is not None

  if col_info.col.col_type.is_image_type() and isinstance(val, bytes):
@@ -52,12 +57,12 @@
  open(path, 'wb').write(val)
  val = path
  self.output_rows[row_idx][col_info.slot_idx] = val
- input_col_ids.append(col_info.col.id)
+ input_slot_idxs.add(col_info.slot_idx)

- # set the remaining stored non-computed columns to null
- null_col_ids = inserted_col_ids - set(input_col_ids)
- for col_id in null_col_ids:
- col_info = column_info.get(col_id)
+ # set the remaining output slots to their default values (presently None)
+ missing_slot_idxs = output_slot_idxs - input_slot_idxs
+ for slot_idx in missing_slot_idxs:
+ col_info = output_cols_by_idx.get(slot_idx)
  assert col_info is not None
  self.output_rows[row_idx][col_info.slot_idx] = None

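From the caller's perspective, the slot-based bookkeeping above is what fills omitted columns with their default (currently None) on insert. A minimal sketch (not part of this diff; the 0.2.x schema and insert syntax are assumptions):

    import pixeltable as pxt
    import pixeltable.type_system as ts

    t = pxt.create_table('people', {'name': ts.StringType(), 'age': ts.IntType(nullable=True)})

    # the second row omits 'age'; InMemoryDataNode sets the missing output slot to None
    t.insert([{'name': 'alice', 'age': 34}, {'name': 'bob'}])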
pixeltable/exec/sql_scan_node.py CHANGED
@@ -37,7 +37,6 @@ class SqlScanNode(ExecNode):
  order_by_items = []
  if exact_version_only is None:
  exact_version_only = []
- super().__init__(row_builder, [], [], None)
  self.tbl = tbl
  target = tbl.tbl_version # the stored table we're scanning
  self.sql_exprs = exprs.ExprSet(select_list)
@@ -45,6 +44,7 @@
  for iter_arg in row_builder.unstored_iter_args.values():
  sql_subexprs = iter_arg.subexprs(filter=lambda e: e.sql_expr() is not None, traverse_matches=False)
  [self.sql_exprs.append(e) for e in sql_subexprs]
+ super().__init__(row_builder, self.sql_exprs, [], None) # we materialize self.sql_exprs
  self.filter = filter
  self.filter_eval_ctx = \
  row_builder.create_eval_ctx([filter], exclude=select_list) if filter is not None else None
pixeltable/exprs/column_ref.py CHANGED
@@ -1,5 +1,5 @@
  from __future__ import annotations
- from typing import Optional, List, Any, Dict, Tuple
+ from typing import Optional, Any, Tuple
  from uuid import UUID

  import sqlalchemy as sql
@@ -38,7 +38,7 @@ class ColumnRef(Expr):
  self.iter_arg_ctx = iter_arg_ctx
  assert len(self.iter_arg_ctx.target_slot_idxs) == 1 # a single inline dict

- def _id_attrs(self) -> List[Tuple[str, Any]]:
+ def _id_attrs(self) -> list[Tuple[str, Any]]:
  return super()._id_attrs() + [('tbl_id', self.col.tbl.id), ('col_id', self.col.id)]

  def __getattr__(self, name: str) -> Expr:
@@ -64,8 +64,8 @@
  return super().__getattr__(name)

  def similarity(self, other: Any) -> Expr:
- if isinstance(other, Expr):
- raise excs.Error(f'similarity(): requires a string or a PIL.Image.Image object, not an expression')
+ # if isinstance(other, Expr):
+ # raise excs.Error(f'similarity(): requires a string or a PIL.Image.Image object, not an expression')
  item = Expr.from_object(other)
  if item is None or not(item.col_type.is_string_type() or item.col_type.is_image_type()):
  raise excs.Error(f'similarity(): requires a string or a PIL.Image.Image object, not a {type(other)}')
@@ -86,7 +86,8 @@

  def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
  if not self.is_unstored_iter_col:
- assert data_row.has_val[self.slot_idx]
+ # supply default
+ data_row[self.slot_idx] = None
  return

  # if this is a new base row, we need to instantiate a new iterator
@@ -99,16 +100,20 @@
  res = next(self.iterator)
  data_row[self.slot_idx] = res[self.col.name]

- def _as_dict(self) -> Dict:
+ def _as_dict(self) -> dict:
  tbl = self.col.tbl
  version = tbl.version if tbl.is_snapshot else None
  return {'tbl_id': str(tbl.id), 'tbl_version': version, 'col_id': self.col.id}

  @classmethod
- def _from_dict(cls, d: Dict, components: List[Expr]) -> Expr:
+ def get_column(cls, d: dict) -> catalog.Column:
  tbl_id, version, col_id = UUID(d['tbl_id']), d['tbl_version'], d['col_id']
  tbl_version = catalog.Catalog.get().tbl_versions[(tbl_id, version)]
  # don't use tbl_version.cols_by_id here, this might be a snapshot reference to a column that was then dropped
  col = next(col for col in tbl_version.cols if col.id == col_id)
- return cls(col)
+ return col

+ @classmethod
+ def _from_dict(cls, d: dict, _: list[Expr]) -> Expr:
+ col = cls.get_column(d)
+ return cls(col)
pixeltable/exprs/data_row.py CHANGED
@@ -133,6 +133,10 @@ class DataRow:
  np.save(buffer, np_array)
  return buffer.getvalue()

+ # for JSON columns, we need to store None as an explicit NULL, otherwise it stores a json 'null'
+ if self.vals[index] is None and sa_col_type is not None and isinstance(sa_col_type, sql.JSON):
+ return sql.sql.null()
+
  return self.vals[index]

  def __setitem__(self, idx: object, val: Any) -> None:
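
For context on the hunk above: with SQLAlchemy's JSON column type (none_as_null=False, the default), a Python None is persisted as the JSON value 'null', whereas sqlalchemy.null() produces a SQL NULL. A standalone illustration, independent of pixeltable:

    import sqlalchemy as sql

    metadata = sql.MetaData()
    t = sql.Table('t', metadata, sql.Column('payload', sql.JSON))

    stmt_json_null = sql.insert(t).values(payload=None)        # stored as the JSON value 'null'
    stmt_sql_null = sql.insert(t).values(payload=sql.null())   # stored as SQL NULL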
pixeltable/exprs/expr.py CHANGED
@@ -158,7 +158,9 @@ class Expr(abc.ABC):
  return result

  @classmethod
- def copy_list(cls, expr_list: List[Expr]) -> List[Expr]:
+ def copy_list(cls, expr_list: Optional[List[Expr]]) -> Optional[List[Expr]]:
+ if expr_list is None:
+ return None
  return [e.copy() for e in expr_list]

  def __deepcopy__(self, memo=None) -> Expr:
@@ -297,6 +299,19 @@
  ids.update(e.tbl_ids())
  return ids

+ @classmethod
+ def get_refd_columns(cls, expr_dict: dict[str, Any]) -> list[catalog.Column]:
+ """Return Columns referenced by expr_dict."""
+ result: list[catalog.Column] = []
+ assert '_classname' in expr_dict
+ from .column_ref import ColumnRef
+ if expr_dict['_classname'] == 'ColumnRef':
+ result.append(ColumnRef.get_column(expr_dict))
+ if 'components' in expr_dict:
+ for component_dict in expr_dict['components']:
+ result.extend(cls.get_refd_columns(component_dict))
+ return result
+
  @classmethod
  def from_object(cls, o: object) -> Optional[Expr]:
  """
pixeltable/exprs/function_call.py CHANGED
@@ -54,7 +54,7 @@ class FunctionCall(Expr):
  self.arg_types: List[ts.ColumnType] = []
  self.kwarg_types: Dict[str, ts.ColumnType] = {}
  # the prefix of parameters that are bound can be passed by position
- for param in fn.py_signature.parameters.values():
+ for param in fn.signature.py_signature.parameters.values():
  if param.name not in bound_args or param.kind == inspect.Parameter.KEYWORD_ONLY:
  break
  arg = bound_args[param.name]
@@ -67,7 +67,7 @@
  self.arg_types.append(signature.parameters[param.name].col_type)

  # the remaining args are passed as keywords
- kw_param_names = set(bound_args.keys()) - set(list(fn.py_signature.parameters.keys())[:len(self.args)])
+ kw_param_names = set(bound_args.keys()) - set(list(fn.signature.py_signature.parameters.keys())[:len(self.args)])
  for param_name in kw_param_names:
  arg = bound_args[param_name]
  if isinstance(arg, Expr):
@@ -75,7 +75,7 @@
  self.components.append(arg.copy())
  else:
  self.kwargs[param_name] = (None, arg)
- if fn.py_signature.parameters[param_name].kind != inspect.Parameter.VAR_KEYWORD:
+ if fn.signature.py_signature.parameters[param_name].kind != inspect.Parameter.VAR_KEYWORD:
  self.kwarg_types[param_name] = signature.parameters[param_name].col_type

  # window function state:
@@ -117,7 +117,7 @@
  self.id = self._create_id()

  def _create_rowid_refs(self, tbl: catalog.Table) -> List[Expr]:
- target = tbl.tbl_version_path.tbl_version
+ target = tbl._tbl_version_path.tbl_version
  return [RowidRef(target, i) for i in range(target.num_rowid_columns())]

  @classmethod