PyPI - pixeltable - Versions diffs - 0.2.4__py3-none-any.whl → 0.2.6__py3-none-any.whl - Mend

pixeltable 0.2.4__py3-none-any.whl → 0.2.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pixeltable might be problematic. Click here for more details.

Files changed (99)

pixeltable/__init__.py +18 -9
pixeltable/__version__.py +3 -0
pixeltable/catalog/column.py +31 -50
pixeltable/catalog/insertable_table.py +7 -6
pixeltable/catalog/table.py +171 -57
pixeltable/catalog/table_version.py +417 -140
pixeltable/catalog/table_version_path.py +2 -2
pixeltable/dataframe.py +239 -121
pixeltable/env.py +82 -16
pixeltable/exec/__init__.py +2 -1
pixeltable/exec/cache_prefetch_node.py +1 -1
pixeltable/exec/data_row_batch.py +6 -7
pixeltable/exec/expr_eval_node.py +28 -28
pixeltable/exec/in_memory_data_node.py +11 -7
pixeltable/exec/sql_scan_node.py +7 -6
pixeltable/exprs/__init__.py +4 -3
pixeltable/exprs/column_ref.py +9 -0
pixeltable/exprs/comparison.py +3 -3
pixeltable/exprs/data_row.py +5 -1
pixeltable/exprs/expr.py +15 -7
pixeltable/exprs/function_call.py +17 -15
pixeltable/exprs/image_member_access.py +9 -28
pixeltable/exprs/in_predicate.py +96 -0
pixeltable/exprs/inline_array.py +13 -11
pixeltable/exprs/inline_dict.py +15 -13
pixeltable/exprs/literal.py +16 -4
pixeltable/exprs/row_builder.py +15 -41
pixeltable/exprs/similarity_expr.py +65 -0
pixeltable/ext/__init__.py +5 -0
pixeltable/ext/functions/yolox.py +92 -0
pixeltable/func/__init__.py +0 -2
pixeltable/func/aggregate_function.py +18 -15
pixeltable/func/callable_function.py +57 -13
pixeltable/func/expr_template_function.py +20 -3
pixeltable/func/function.py +35 -4
pixeltable/func/globals.py +24 -14
pixeltable/func/signature.py +23 -27
pixeltable/func/udf.py +13 -12
pixeltable/functions/__init__.py +8 -8
pixeltable/functions/eval.py +7 -8
pixeltable/functions/huggingface.py +64 -17
pixeltable/functions/openai.py +36 -3
pixeltable/functions/pil/image.py +61 -64
pixeltable/functions/together.py +21 -0
pixeltable/functions/util.py +11 -0
pixeltable/globals.py +425 -0
pixeltable/index/__init__.py +2 -0
pixeltable/index/base.py +51 -0
pixeltable/index/embedding_index.py +168 -0
pixeltable/io/__init__.py +3 -0
pixeltable/{utils → io}/hf_datasets.py +48 -17
pixeltable/io/pandas.py +148 -0
pixeltable/{utils → io}/parquet.py +58 -33
pixeltable/iterators/__init__.py +1 -1
pixeltable/iterators/base.py +4 -0
pixeltable/iterators/document.py +218 -97
pixeltable/iterators/video.py +8 -9
pixeltable/metadata/__init__.py +7 -3
pixeltable/metadata/converters/convert_12.py +3 -0
pixeltable/metadata/converters/convert_13.py +41 -0
pixeltable/metadata/schema.py +45 -22
pixeltable/plan.py +15 -51
pixeltable/store.py +38 -41
pixeltable/tool/create_test_db_dump.py +39 -4
pixeltable/type_system.py +47 -96
pixeltable/utils/documents.py +42 -12
pixeltable/utils/http_server.py +70 -0
{pixeltable-0.2.4.dist-info → pixeltable-0.2.6.dist-info}/METADATA +14 -10
pixeltable-0.2.6.dist-info/RECORD +119 -0
{pixeltable-0.2.4.dist-info → pixeltable-0.2.6.dist-info}/WHEEL +1 -1
pixeltable/client.py +0 -604
pixeltable/exprs/image_similarity_predicate.py +0 -58
pixeltable/func/batched_function.py +0 -53
pixeltable/tests/conftest.py +0 -177
pixeltable/tests/functions/test_fireworks.py +0 -42
pixeltable/tests/functions/test_functions.py +0 -60
pixeltable/tests/functions/test_huggingface.py +0 -158
pixeltable/tests/functions/test_openai.py +0 -152
pixeltable/tests/functions/test_together.py +0 -111
pixeltable/tests/test_audio.py +0 -65
pixeltable/tests/test_catalog.py +0 -27
pixeltable/tests/test_client.py +0 -21
pixeltable/tests/test_component_view.py +0 -370
pixeltable/tests/test_dataframe.py +0 -439
pixeltable/tests/test_dirs.py +0 -107
pixeltable/tests/test_document.py +0 -120
pixeltable/tests/test_exprs.py +0 -805
pixeltable/tests/test_function.py +0 -324
pixeltable/tests/test_migration.py +0 -43
pixeltable/tests/test_nos.py +0 -54
pixeltable/tests/test_snapshot.py +0 -208
pixeltable/tests/test_table.py +0 -1267
pixeltable/tests/test_transactional_directory.py +0 -42
pixeltable/tests/test_types.py +0 -22
pixeltable/tests/test_video.py +0 -159
pixeltable/tests/test_view.py +0 -530
pixeltable/tests/utils.py +0 -408
pixeltable-0.2.4.dist-info/RECORD +0 -132
{pixeltable-0.2.4.dist-info → pixeltable-0.2.6.dist-info}/LICENSE +0 -0

pixeltable/env.py CHANGED Viewed

@@ -10,8 +10,8 @@ import os
 import socketserver
 import sys
 import threading
-import typing
 import uuid
+import warnings
 from pathlib import Path
 from typing import Callable, Optional, Dict, Any, List
@@ -19,22 +19,28 @@ import pgserver
 import sqlalchemy as sql
 import yaml
 from sqlalchemy_utils.functions import database_exists, create_database, drop_database
+from tqdm import TqdmWarning
 import pixeltable.exceptions as excs
 from pixeltable import metadata
+from pixeltable.utils.http_server import make_server
 class Env:
     """
     Store for runtime globals.
     """
     _instance: Optional[Env] = None
     _log_fmt_str = '%(asctime)s %(levelname)s %(name)s %(filename)s:%(lineno)d: %(message)s'
     @classmethod
     def get(cls) -> Env:
         if cls._instance is None:
-            cls._instance = Env()
+            env = Env()
+            env._set_up()
+            env._upgrade_metadata()
+            cls._instance = env
         return cls._instance
     def __init__(self):
@@ -45,7 +51,7 @@ class Env:
         self._log_dir: Optional[Path] = None  # log files
         self._tmp_dir: Optional[Path] = None  # any tmp files
         self._sa_engine: Optional[sql.engine.base.Engine] = None
-        self._pgdata_dir : Optional[Path] = None
+        self._pgdata_dir: Optional[Path] = None
         self._db_name: Optional[str] = None
         self._db_server: Optional[pgserver.PostgresServer] = None
         self._db_url: Optional[str] = None
@@ -55,7 +61,7 @@ class Env:
         self._installed_packages: Dict[str, Optional[List[int]]] = {}
         self._nos_client: Optional[Any] = None
         self._spacy_nlp: Optional[Any] = None  # spacy.Language
-        self._httpd: Optional[socketserver.TCPServer] = None
+        self._httpd: Optional[http.server.ThreadingHTTPServer] = None
         self._http_address: Optional[str] = None
         self._registered_clients: dict[str, Any] = {}
@@ -93,13 +99,43 @@ class Env:
         assert self._http_address is not None
         return self._http_address
+    def configure_logging(
+        self,
+        *,
+        to_stdout: Optional[bool] = None,
+        level: Optional[int] = None,
+        add: Optional[str] = None,
+        remove: Optional[str] = None,
+    ) -> None:
+        """Configure logging.
+        Args:
+            to_stdout: if True, also log to stdout
+            level: default log level
+            add: comma-separated list of 'module name:log level' pairs; ex.: add='video:10'
+            remove: comma-separated list of module names
+        """
+        if to_stdout is not None:
+            self.log_to_stdout(to_stdout)
+        if level is not None:
+            self.set_log_level(level)
+        if add is not None:
+            for module, level in [t.split(':') for t in add.split(',')]:
+                self.set_module_log_level(module, int(level))
+        if remove is not None:
+            for module in remove.split(','):
+                self.set_module_log_level(module, None)
+        if to_stdout is None and level is None and add is None and remove is None:
+            self.print_log_config()
     def print_log_config(self) -> None:
         print(f'logging to {self._logfilename}')
         print(f'{"" if self._log_to_stdout else "not "}logging to stdout')
         print(f'default log level: {logging.getLevelName(self._default_log_level)}')
         print(
             f'module log levels: '
-            f'{",".join([name + ":" + logging.getLevelName(val) for name, val in self._module_log_level.items()])}')
+            f'{",".join([name + ":" + logging.getLevelName(val) for name, val in self._module_log_level.items()])}'
+        )
     def log_to_stdout(self, enable: bool = True) -> None:
         self._log_to_stdout = enable
@@ -134,10 +170,14 @@ class Env:
         else:
             return False
-    def set_up(self, echo: bool = False, reinit_db: bool = False) -> None:
+    def _set_up(self, echo: bool = False, reinit_db: bool = False) -> None:
         if self._initialized:
             return
+        # Disable spurious warnings
+        warnings.simplefilter('ignore', category=TqdmWarning)
+        os.environ['TOKENIZERS_PARALLELISM'] = 'false'
         self._initialized = True
         home = Path(os.environ.get('PIXELTABLE_HOME', str(Path.home() / '.pixeltable')))
         assert self._home is None or self._home == home
@@ -188,11 +228,29 @@ class Env:
         fh = logging.FileHandler(self._log_dir / self._logfilename, mode='w')
         fh.setFormatter(logging.Formatter(self._log_fmt_str))
         self._logger.addHandler(fh)
+        # configure sqlalchemy logging
         sql_logger = logging.getLogger('sqlalchemy.engine')
         sql_logger.setLevel(logging.INFO)
         sql_logger.addHandler(fh)
         sql_logger.propagate = False
+        # configure pyav logging
+        av_logfilename = self._logfilename.replace('.log', '_av.log')
+        av_fh = logging.FileHandler(self._log_dir / av_logfilename, mode='w')
+        av_fh.setFormatter(logging.Formatter(self._log_fmt_str))
+        av_logger = logging.getLogger('libav')
+        av_logger.addHandler(av_fh)
+        av_logger.propagate = False
+        # configure web-server logging
+        http_logfilename = self._logfilename.replace('.log', '_http.log')
+        http_fh = logging.FileHandler(self._log_dir / http_logfilename, mode='w')
+        http_fh.setFormatter(logging.Formatter(self._log_fmt_str))
+        http_logger = logging.getLogger('pixeltable.http.server')
+        http_logger.addHandler(http_fh)
+        http_logger.propagate = False
         # empty tmp dir
         for path in glob.glob(f'{self._tmp_dir}/*'):
             os.remove(path)
@@ -213,6 +271,7 @@ class Env:
             create_database(self.db_url)
             self._sa_engine = sql.create_engine(self.db_url, echo=echo, future=True)
             from pixeltable.metadata import schema
             schema.Base.metadata.create_all(self._sa_engine)
             metadata.create_system_info(self._sa_engine)
             # enable pgvector
@@ -229,11 +288,12 @@ class Env:
         self._set_up_runtime()
         self.log_to_stdout(False)
-    def upgrade_metadata(self) -> None:
+    def _upgrade_metadata(self) -> None:
         metadata.upgrade_md(self._sa_engine)
     def _create_nos_client(self) -> None:
         import nos
         self._logger.info('connecting to NOS')
         nos.init(logging_level=logging.DEBUG)
         self._nos_client = nos.client.InferenceClient()
@@ -242,6 +302,7 @@ class Env:
         # now that we have a client, we can create the module
         import importlib
         try:
             importlib.import_module('pixeltable.functions.nos')
             # it's already been created
@@ -249,6 +310,7 @@ class Env:
         except ImportError:
             pass
         from pixeltable.functions.util import create_nos_modules
         _ = create_nos_modules()
     def get_client(self, name: str, init: Callable, environ: Optional[str] = None) -> Any:
@@ -282,16 +344,13 @@ class Env:
         """
         The http server root is the file system root.
         eg: /home/media/foo.mp4 is located at http://127.0.0.1:{port}/home/media/foo.mp4
+        in windows, the server will translate paths like http://127.0.0.1:{port}/c:/media/foo.mp4
         This arrangement enables serving media hosted within _home,
         as well as external media inserted into pixeltable or produced by pixeltable.
         The port is chosen dynamically to prevent conflicts.
         """
         # Port 0 means OS picks one for us.
-        address = ("127.0.0.1", 0)
-        class FixedRootHandler(http.server.SimpleHTTPRequestHandler):
-            def __init__(self, *args, **kwargs):
-                super().__init__(*args, directory='/', **kwargs)
-        self._httpd = socketserver.TCPServer(address, FixedRootHandler)
+        self._httpd = make_server('127.0.0.1', 0)
         port = self._httpd.server_address[1]
         self._http_address = f'http://127.0.0.1:{port}'
@@ -320,11 +379,14 @@ class Env:
         check('torchvision')
         check('transformers')
         check('sentence_transformers')
+        check('yolox')
         check('boto3')
+        check('fitz')  # pymupdf
         check('pyarrow')
         check('spacy')  # TODO: deal with en-core-web-sm
         if self.is_installed_package('spacy'):
             import spacy
             self._spacy_nlp = spacy.load('en_core_web_sm')
         check('tiktoken')
         check('openai')
@@ -333,6 +395,7 @@ class Env:
         check('nos')
         if self.is_installed_package('nos'):
             self._create_nos_client()
+        check('openpyxl')
     def require_package(self, package: str, min_version: Optional[List[int]] = None) -> None:
         assert package in self._installed_packages
@@ -350,9 +413,12 @@ class Env:
         if len(min_version) < len(installed_version):
             normalized_min_version = min_version + [0] * (len(installed_version) - len(min_version))
         if any([a < b for a, b in zip(installed_version, normalized_min_version)]):
-            raise excs.Error((
-                f'The installed version of package {package} is {".".join([str[v] for v in installed_version])}, '
-                f'but version  >={".".join([str[v] for v in min_version])} is required'))
+            raise excs.Error(
+                (
+                    f'The installed version of package {package} is {".".join([str[v] for v in installed_version])}, '
+                    f'but version  >={".".join([str[v] for v in min_version])} is required'
+                )
+            )
     def num_tmp_files(self) -> int:
         return len(glob.glob(f'{self._tmp_dir}/*'))
@@ -397,4 +463,4 @@ class Env:
     @property
     def spacy_nlp(self) -> Any:
         assert self._spacy_nlp is not None
-        return self._spacy_nlp
+        return self._spacy_nlp

pixeltable/exec/__init__.py CHANGED Viewed

@@ -6,4 +6,5 @@ from .exec_node import ExecNode
 from .expr_eval_node import ExprEvalNode
 from .in_memory_data_node import InMemoryDataNode
 from .sql_scan_node import SqlScanNode
-from .media_validation_node import MediaValidationNode
+from .media_validation_node import MediaValidationNode
+from .data_row_batch import DataRowBatch

pixeltable/exec/cache_prefetch_node.py CHANGED Viewed

@@ -89,7 +89,7 @@ class CachePrefetchNode(ExecNode):
         # preserve the file extension, if there is one
         extension = ''
         if parsed.path != '':
-            p = Path(urllib.parse.unquote(parsed.path))
+            p = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed.path)))
             extension = p.suffix
         tmp_path = env.Env.get().create_tmp_path(extension=extension)
         try:

pixeltable/exec/data_row_batch.py CHANGED Viewed

@@ -14,9 +14,8 @@ class DataRowBatch:
     Contains the metadata needed to initialize DataRows.
     """
-    def __init__(self, tbl: catalog.TableVersion, row_builder: exprs.RowBuilder, len: int = 0):
-        self.tbl_id = tbl.id
-        self.tbl_version = tbl.version
+    def __init__(self, tbl: Optional[catalog.TableVersion], row_builder: exprs.RowBuilder, len: int = 0):
+        self.tbl = tbl
         self.row_builder = row_builder
         self.img_slot_idxs = [e.slot_idx for e in row_builder.unique_exprs if e.col_type.is_image_type()]
         # non-image media slots
@@ -42,9 +41,10 @@ class DataRowBatch:
     def set_row_ids(self, row_ids: List[int]) -> None:
         """Sets pks for rows in batch"""
+        assert self.tbl is not None
         assert len(row_ids) == len(self.rows)
         for row, row_id in zip(self.rows, row_ids):
-            row.set_pk((row_id, self.tbl_version))
+            row.set_pk((row_id, self.tbl))
     def __len__(self) -> int:
         return len(self.rows)
@@ -57,6 +57,7 @@ class DataRowBatch:
             flushed_slot_idxs: Optional[List[int]] = None
     ) -> None:
         """Flushes images in the given range of rows."""
+        assert self.tbl is not None
         if stored_img_info is None:
             stored_img_info = []
         if flushed_slot_idxs is None:
@@ -67,12 +68,10 @@ class DataRowBatch:
             idx_range = slice(0, len(self.rows))
         for row in self.rows[idx_range]:
             for info in stored_img_info:
-                filepath = str(MediaStore.prepare_media_path(self.tbl_id, info.col.id, self.tbl_version))
+                filepath = str(MediaStore.prepare_media_path(self.tbl.id, info.col.id, self.tbl.version))
                 row.flush_img(info.slot_idx, filepath)
             for slot_idx in flushed_slot_idxs:
                 row.flush_img(slot_idx)
-        #_logger.debug(
-            #f'flushed images in range {idx_range}: slot_idxs={flushed_slot_idxs} stored_img_info={stored_img_info}')
     def __iter__(self) -> Iterator[exprs.DataRow]:
         return DataRowBatchIterator(self)

pixeltable/exec/expr_eval_node.py CHANGED Viewed

@@ -1,20 +1,20 @@
-import sys
-import warnings
-from typing import List, Optional, Tuple
-from dataclasses import dataclass, field
 import logging
+import sys
 import time
+import warnings
+from dataclasses import dataclass
+from typing import List, Optional
 from tqdm import tqdm, TqdmWarning
+import pixeltable.exprs as exprs
+from pixeltable.func import CallableFunction
 from .data_row_batch import DataRowBatch
 from .exec_node import ExecNode
-import pixeltable.exprs as exprs
-import pixeltable.func as func
 _logger = logging.getLogger('pixeltable')
 class ExprEvalNode(ExecNode):
     """Materializes expressions
     """
@@ -22,7 +22,7 @@ class ExprEvalNode(ExecNode):
     class Cohort:
         """List of exprs that form an evaluation context and contain calls to at most one external function"""
         exprs: List[exprs.Expr]
-        ext_function: Optional[func.BatchedFunction]
+        batched_fn: Optional[CallableFunction]
         segment_ctxs: List[exprs.RowBuilder.EvalCtx]
         target_slot_idxs: List[int]
         batch_size: int = 8
@@ -63,12 +63,12 @@ class ExprEvalNode(ExecNode):
         if self.pbar is not None:
             self.pbar.close()
-    def _get_batched_fn(self, expr: exprs.Expr) -> Optional[func.BatchedFunction]:
-        if not isinstance(expr, exprs.FunctionCall):
-            return None
-        return expr.fn if isinstance(expr.fn, func.BatchedFunction) else None
+    def _get_batched_fn(self, expr: exprs.Expr) -> Optional[CallableFunction]:
+        if isinstance(expr, exprs.FunctionCall) and isinstance(expr.fn, CallableFunction) and expr.fn.is_batched:
+            return expr.fn
+        return None
-    def _is_ext_call(self, expr: exprs.Expr) -> bool:
+    def _is_batched_fn_call(self, expr: exprs.Expr) -> bool:
         return self._get_batched_fn(expr) is not None
     def _create_cohorts(self) -> None:
@@ -76,14 +76,14 @@ class ExprEvalNode(ExecNode):
         # break up all_exprs into cohorts such that each cohort contains calls to at most one external function;
         # seed the cohorts with only the ext fn calls
         cohorts: List[List[exprs.Expr]] = []
-        current_ext_function: Optional[func.BatchedFunction] = None
+        current_batched_fn: Optional[CallableFunction] = None
         for e in all_exprs:
-            if not self._is_ext_call(e):
+            if not self._is_batched_fn_call(e):
                 continue
-            if current_ext_function is None or current_ext_function != e.fn:
+            if current_batched_fn is None or current_batched_fn != e.fn:
                 # create a new cohort
                 cohorts.append([])
-                current_ext_function = e.fn
+                current_batched_fn = e.fn
             cohorts[-1].append(e)
         # expand the cohorts to include all exprs that are in the same evaluation context as the external calls;
@@ -115,18 +115,18 @@ class ExprEvalNode(ExecNode):
             assert len(cohort) > 0
             # create the first segment here, so we can avoid checking for an empty list in the loop
             segments = [[cohort[0]]]
-            is_ext_segment = self._is_ext_call(cohort[0])
-            ext_fn: Optional[func.BatchedFunction] = self._get_batched_fn(cohort[0])
+            is_batched_segment = self._is_batched_fn_call(cohort[0])
+            batched_fn: Optional[CallableFunction] = self._get_batched_fn(cohort[0])
             for e in cohort[1:]:
-                if self._is_ext_call(e):
+                if self._is_batched_fn_call(e):
                     segments.append([e])
-                    is_ext_segment = True
-                    ext_fn = self._get_batched_fn(e)
+                    is_batched_segment = True
+                    batched_fn = self._get_batched_fn(e)
                 else:
-                    if is_ext_segment:
+                    if is_batched_segment:
                         # start a new segment
                         segments.append([])
-                        is_ext_segment = False
+                        is_batched_segment = False
                     segments[-1].append(e)
             # we create the EvalCtxs manually because create_eval_ctx() would repeat the dependencies of each segment
@@ -135,21 +135,21 @@ class ExprEvalNode(ExecNode):
                     slot_idxs=[e.slot_idx for e in s], exprs=s, target_slot_idxs=[], target_exprs=[])
                 for s in segments
             ]
-            cohort_info = self.Cohort(cohort, ext_fn, segment_ctxs, target_slot_idxs[i])
+            cohort_info = self.Cohort(cohort, batched_fn, segment_ctxs, target_slot_idxs[i])
             self.cohorts.append(cohort_info)
     def _exec_cohort(self, cohort: Cohort, rows: DataRowBatch) -> None:
         """Compute the cohort for the entire input batch by dividing it up into sub-batches"""
         batch_start_idx = 0  # start row of the current sub-batch
         # for multi-resolution models, we re-assess the correct ext fn batch size for each input batch
-        ext_batch_size = cohort.ext_function.get_batch_size() if cohort.ext_function is not None else None
+        ext_batch_size = cohort.batched_fn.get_batch_size() if cohort.batched_fn is not None else None
         if ext_batch_size is not None:
             cohort.batch_size = ext_batch_size
         while batch_start_idx < len(rows):
             num_batch_rows = min(cohort.batch_size, len(rows) - batch_start_idx)
             for segment_ctx in cohort.segment_ctxs:
-                if not self._is_ext_call(segment_ctx.exprs[0]):
+                if not self._is_batched_fn_call(segment_ctx.exprs[0]):
                     # compute batch row-wise
                     for row_idx in range(batch_start_idx, batch_start_idx + num_batch_rows):
                         self.row_builder.eval(
@@ -193,7 +193,7 @@ class ExprEvalNode(ExecNode):
                             for k in kwarg_batches.keys()
                         }
                         start_ts = time.perf_counter()
-                        result_batch = fn_call.fn.invoke(call_args, call_kwargs)
+                        result_batch = fn_call.fn.exec_batch(*call_args, **call_kwargs)
                         self.ctx.profile.eval_time[fn_call.slot_idx] += time.perf_counter() - start_ts
                         self.ctx.profile.eval_count[fn_call.slot_idx] += num_ext_batch_rows

pixeltable/exec/in_memory_data_node.py CHANGED Viewed

@@ -29,18 +29,21 @@ class InMemoryDataNode(ExecNode):
     def _open(self) -> None:
         """Create row batch and populate with self.input_rows"""
-        column_info = {info.col.name: info for info in self.row_builder.output_slot_idxs()}
+        column_info = {info.col.id: info for info in self.row_builder.output_slot_idxs()}
+        # exclude system columns
+        user_column_info = {info.col.name: info for _, info in column_info.items() if info.col.name is not None}
         # stored columns that are not computed
-        inserted_column_names = set([
-            info.col.name for info in self.row_builder.output_slot_idxs()
+        inserted_col_ids = set([
+            info.col.id for info in self.row_builder.output_slot_idxs()
             if info.col.is_stored and not info.col.is_computed
         ])
         self.output_rows = DataRowBatch(self.tbl, self.row_builder, len(self.input_rows))
         for row_idx, input_row in enumerate(self.input_rows):
             # populate the output row with the values provided in the input row
+            input_col_ids: List[int] = []
             for col_name, val in input_row.items():
-                col_info = column_info.get(col_name)
+                col_info = user_column_info.get(col_name)
                 assert col_info is not None
                 if col_info.col.col_type.is_image_type() and isinstance(val, bytes):
@@ -49,11 +52,12 @@ class InMemoryDataNode(ExecNode):
                     open(path, 'wb').write(val)
                     val = path
                 self.output_rows[row_idx][col_info.slot_idx] = val
+                input_col_ids.append(col_info.col.id)
             # set the remaining stored non-computed columns to null
-            null_col_names = inserted_column_names - set(input_row.keys())
-            for col_name in null_col_names:
-                col_info = column_info.get(col_name)
+            null_col_ids = inserted_col_ids - set(input_col_ids)
+            for col_id in null_col_ids:
+                col_info = column_info.get(col_id)
                 assert col_info is not None
                 self.output_rows[row_idx][col_info.slot_idx] = None

pixeltable/exec/sql_scan_node.py CHANGED Viewed

@@ -21,7 +21,6 @@ class SqlScanNode(ExecNode):
             select_list: Iterable[exprs.Expr],
             where_clause: Optional[exprs.Expr] = None, filter: Optional[exprs.Predicate] = None,
             order_by_items: Optional[List[Tuple[exprs.Expr, bool]]] = None,
-            similarity_clause: Optional[exprs.ImageSimilarityPredicate] = None,
             limit: int = 0, set_pk: bool = False, exact_version_only: Optional[List[catalog.TableVersion]] = None
     ):
         """
@@ -77,15 +76,17 @@ class SqlScanNode(ExecNode):
         # the number of tables that need to be joined to the target table
         for rowid_ref in [e for e, _ in order_by_items if isinstance(e, exprs.RowidRef)]:
             rowid_ref.set_tbl(tbl)
-        order_by_clause = [e.sql_expr().desc() if not asc else e.sql_expr() for e, asc in order_by_items]
+        order_by_clause: List[sql.ClauseElement] = []
+        for e, asc in order_by_items:
+            if isinstance(e, exprs.SimilarityExpr):
+                order_by_clause.append(e.as_order_by_clause(asc))
+            else:
+                order_by_clause.append(e.sql_expr().desc() if not asc else e.sql_expr())
         if where_clause is not None:
             sql_where_clause = where_clause.sql_expr()
             assert sql_where_clause is not None
             self.stmt = self.stmt.where(sql_where_clause)
-        if similarity_clause is not None:
-            self.stmt = self.stmt.order_by(
-                similarity_clause.img_col_ref.col.sa_idx_col.l2_distance(similarity_clause.embedding()))
         if len(order_by_clause) > 0:
             self.stmt = self.stmt.order_by(*order_by_clause)
         elif target.id in row_builder.unstored_iter_args:
@@ -201,7 +202,7 @@ class SqlScanNode(ExecNode):
                 self.row_builder.eval(output_row, self.filter_eval_ctx, profile=self.ctx.profile)
                 if output_row[self.filter.slot_idx]:
                     needs_row = True
-                    if self.limit is not None and len(output_batch) >= self.limit:
+                    if self.limit > 0 and len(output_batch) >= self.limit:
                         self.has_more_rows = False
                         break
                 else:

pixeltable/exprs/__init__.py CHANGED Viewed

@@ -6,9 +6,10 @@ from .comparison import Comparison
 from .compound_predicate import CompoundPredicate
 from .data_row import DataRow
 from .expr import Expr
+from .expr_set import ExprSet
 from .function_call import FunctionCall
 from .image_member_access import ImageMemberAccess
-from .image_similarity_predicate import ImageSimilarityPredicate
+from .in_predicate import InPredicate
 from .inline_array import InlineArray
 from .inline_dict import InlineDict
 from .is_null import IsNull
@@ -16,9 +17,9 @@ from .json_mapper import JsonMapper
 from .json_path import RELATIVE_PATH_ROOT, JsonPath
 from .literal import Literal
 from .object_ref import ObjectRef
-from  .variable import Variable
 from .predicate import Predicate
 from .row_builder import RowBuilder, ColumnSlotIdx, ExecProfile
 from .rowid_ref import RowidRef
-from .expr_set import ExprSet
+from .similarity_expr import SimilarityExpr
 from .type_cast import TypeCast
+from .variable import Variable

pixeltable/exprs/column_ref.py CHANGED Viewed

@@ -63,6 +63,15 @@ class ColumnRef(Expr):
         return super().__getattr__(name)
+    def similarity(self, other: Any) -> Expr:
+        if isinstance(other, Expr):
+            raise excs.Error(f'similarity(): requires a string or a PIL.Image.Image object, not an expression')
+        item = Expr.from_object(other)
+        if item is None or not(item.col_type.is_string_type() or item.col_type.is_image_type()):
+            raise excs.Error(f'similarity(): requires a string or a PIL.Image.Image object, not a {type(other)}')
+        from .similarity_expr import SimilarityExpr
+        return SimilarityExpr(self, item)
     def default_column_name(self) -> Optional[str]:
         return str(self)

pixeltable/exprs/comparison.py CHANGED Viewed

@@ -1,14 +1,14 @@
 from __future__ import annotations
 from typing import Optional, List, Any, Dict, Tuple
 import sqlalchemy as sql
-from .globals import ComparisonOperator
+from .data_row import DataRow
 from .expr import Expr
+from .globals import ComparisonOperator
 from .predicate import Predicate
-from .data_row import DataRow
 from .row_builder import RowBuilder
-import pixeltable.catalog as catalog
 class Comparison(Predicate):

pixeltable/exprs/data_row.py CHANGED Viewed

@@ -5,6 +5,8 @@ import urllib.parse
 import urllib.request
 from typing import Optional, List, Any, Tuple
+import sqlalchemy as sql
+import pgvector.sqlalchemy
 import PIL
 import numpy as np
@@ -110,7 +112,7 @@ class DataRow:
         return self.vals[index]
-    def get_stored_val(self, index: object) -> Any:
+    def get_stored_val(self, index: object, sa_col_type: Optional[sql.types.TypeEngine] = None) -> Any:
         """Return the value that gets stored in the db"""
         assert self.excs[index] is None
         if not self.has_val[index]:
@@ -125,6 +127,8 @@ class DataRow:
         if self.vals[index] is not None and index in self.array_slot_idxs:
             assert isinstance(self.vals[index], np.ndarray)
             np_array = self.vals[index]
+            if sa_col_type is not None and isinstance(sa_col_type, pgvector.sqlalchemy.Vector):
+                return np_array
             buffer = io.BytesIO()
             np.save(buffer, np_array)
             return buffer.getvalue()

pixeltable/exprs/expr.py CHANGED Viewed

@@ -60,9 +60,9 @@ class Expr(abc.ABC):
         # index of the expr's value in the data row:
         # - set for all materialized exprs
-        # - -1: not executable
+        # - None: not executable
         # - not set for subexprs that don't need to be materialized because the parent can be materialized via SQL
-        self.slot_idx = -1
+        self.slot_idx: Optional[int] = None
         self.components: List[Expr] = []  # the subexprs that are needed to construct this expr
     def dependencies(self) -> List[Expr]:
@@ -110,6 +110,11 @@ class Expr(abc.ABC):
                 return False
         return self._equals(other)
+    def _equals(self, other: Expr) -> bool:
+        # we already compared the type and components in equals(); subclasses that require additional comparisons
+        # override this
+        return True
     def _id_attrs(self) -> List[Tuple[str, Any]]:
         """Returns attribute name/value pairs that are used to construct the instance id.
@@ -148,7 +153,7 @@ class Expr(abc.ABC):
         cls = self.__class__
         result = cls.__new__(cls)
         result.__dict__.update(self.__dict__)
-        result.slot_idx = -1
+        result.slot_idx = None
         result.components = [c.copy() for c in self.components]
         return result
@@ -313,10 +318,6 @@ class Expr(abc.ABC):
             return InlineArray(tuple(o))
         return None
-    @abc.abstractmethod
-    def _equals(self, other: Expr) -> bool:
-        pass
     @abc.abstractmethod
     def sql_expr(self) -> Optional[sql.ClauseElement]:
         """
@@ -396,6 +397,13 @@ class Expr(abc.ABC):
     def _from_dict(cls, d: Dict, components: List[Expr]) -> Expr:
         assert False, 'not implemented'
+    def isin(self, value_set: Any) -> 'pixeltable.exprs.InPredicate':
+        from .in_predicate import InPredicate
+        if isinstance(value_set, Expr):
+            return InPredicate(self, value_set_expr=value_set)
+        else:
+            return InPredicate(self, value_set_literal=value_set)
     def astype(self, new_type: ts.ColumnType) -> 'pixeltable.exprs.TypeCast':
         from pixeltable.exprs import TypeCast
         return TypeCast(self, new_type)

pixeltable 0.2.4__py3-none-any.whl → 0.2.6__py3-none-any.whl

Potentially problematic release.

pixeltable 0.2.4__py3-none-any.whl → 0.2.6__py3-none-any.whl