pixeltable 0.2.19__py3-none-any.whl → 0.2.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +7 -19
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +7 -7
- pixeltable/catalog/globals.py +3 -0
- pixeltable/catalog/insertable_table.py +9 -7
- pixeltable/catalog/table.py +220 -143
- pixeltable/catalog/table_version.py +36 -18
- pixeltable/catalog/table_version_path.py +0 -8
- pixeltable/catalog/view.py +3 -3
- pixeltable/dataframe.py +9 -24
- pixeltable/env.py +107 -36
- pixeltable/exceptions.py +7 -4
- pixeltable/exec/__init__.py +1 -1
- pixeltable/exec/aggregation_node.py +22 -15
- pixeltable/exec/component_iteration_node.py +62 -41
- pixeltable/exec/data_row_batch.py +7 -7
- pixeltable/exec/exec_node.py +35 -7
- pixeltable/exec/expr_eval_node.py +2 -1
- pixeltable/exec/in_memory_data_node.py +9 -9
- pixeltable/exec/sql_node.py +265 -136
- pixeltable/exprs/__init__.py +1 -0
- pixeltable/exprs/data_row.py +30 -19
- pixeltable/exprs/expr.py +15 -14
- pixeltable/exprs/expr_dict.py +55 -0
- pixeltable/exprs/expr_set.py +21 -15
- pixeltable/exprs/function_call.py +21 -8
- pixeltable/exprs/json_path.py +3 -6
- pixeltable/exprs/rowid_ref.py +2 -2
- pixeltable/exprs/sql_element_cache.py +5 -1
- pixeltable/ext/functions/whisperx.py +7 -2
- pixeltable/func/callable_function.py +2 -2
- pixeltable/func/function_registry.py +6 -7
- pixeltable/func/query_template_function.py +11 -12
- pixeltable/func/signature.py +17 -15
- pixeltable/func/udf.py +0 -4
- pixeltable/functions/__init__.py +1 -1
- pixeltable/functions/audio.py +4 -6
- pixeltable/functions/globals.py +86 -42
- pixeltable/functions/huggingface.py +12 -14
- pixeltable/functions/image.py +59 -45
- pixeltable/functions/json.py +0 -1
- pixeltable/functions/mistralai.py +2 -2
- pixeltable/functions/openai.py +22 -25
- pixeltable/functions/string.py +50 -50
- pixeltable/functions/timestamp.py +20 -20
- pixeltable/functions/together.py +26 -12
- pixeltable/functions/video.py +11 -20
- pixeltable/functions/whisper.py +2 -20
- pixeltable/globals.py +57 -56
- pixeltable/index/base.py +2 -2
- pixeltable/index/btree.py +7 -7
- pixeltable/index/embedding_index.py +8 -10
- pixeltable/io/external_store.py +11 -5
- pixeltable/io/globals.py +3 -1
- pixeltable/io/hf_datasets.py +4 -4
- pixeltable/io/label_studio.py +6 -6
- pixeltable/io/parquet.py +14 -13
- pixeltable/iterators/document.py +10 -8
- pixeltable/iterators/video.py +10 -1
- pixeltable/metadata/__init__.py +3 -2
- pixeltable/metadata/converters/convert_14.py +4 -2
- pixeltable/metadata/converters/convert_15.py +1 -1
- pixeltable/metadata/converters/convert_19.py +1 -0
- pixeltable/metadata/converters/convert_20.py +1 -1
- pixeltable/metadata/converters/util.py +9 -8
- pixeltable/metadata/schema.py +32 -21
- pixeltable/plan.py +136 -154
- pixeltable/store.py +51 -36
- pixeltable/tool/create_test_db_dump.py +7 -7
- pixeltable/tool/doc_plugins/griffe.py +3 -34
- pixeltable/tool/mypy_plugin.py +32 -0
- pixeltable/type_system.py +243 -60
- pixeltable/utils/arrow.py +10 -9
- pixeltable/utils/coco.py +4 -4
- pixeltable/utils/documents.py +1 -1
- pixeltable/utils/filecache.py +131 -84
- pixeltable/utils/formatter.py +1 -1
- pixeltable/utils/http_server.py +2 -5
- pixeltable/utils/media_store.py +6 -6
- pixeltable/utils/pytorch.py +10 -11
- pixeltable/utils/sql.py +2 -1
- {pixeltable-0.2.19.dist-info → pixeltable-0.2.21.dist-info}/METADATA +16 -7
- pixeltable-0.2.21.dist-info/RECORD +148 -0
- pixeltable/utils/help.py +0 -11
- pixeltable-0.2.19.dist-info/RECORD +0 -147
- {pixeltable-0.2.19.dist-info → pixeltable-0.2.21.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.19.dist-info → pixeltable-0.2.21.dist-info}/WHEEL +0 -0
- {pixeltable-0.2.19.dist-info → pixeltable-0.2.21.dist-info}/entry_points.txt +0 -0
pixeltable/utils/coco.py
CHANGED
```diff
@@ -1,12 +1,12 @@
-from typing import List, Dict, Any, Set
-from pathlib import Path
 import json
+from pathlib import Path
+from typing import Any, Dict, List, Set
 
 import PIL
 
+import pixeltable as pxt
 import pixeltable.exceptions as excs
 
-
 format_msg = """
 
 Required format:
@@ -48,7 +48,7 @@ def _verify_input_dict(input_dict: Dict[str, Any]) -> None:
         if not isinstance(annotation['category'], (str, int)):
             raise excs.Error(f'Value for "category" is not a str or int: {annotation}{format_msg}')
 
-def write_coco_dataset(df:
+def write_coco_dataset(df: pxt.DataFrame, dest_path: Path) -> Path:
     """Export a DataFrame result set as a COCO dataset in dest_path and return the path of the data.json file."""
     # TODO: validate schema
     if len(df._select_list_exprs) != 1 or not df._select_list_exprs[0].col_type.is_json_type():
```
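For context, a minimal sketch of how the newly typed entry point might be called. The `films` table and `annotations` column are hypothetical stand-ins, not taken from the diff; the only requirement visible in the code is that the select list contain exactly one JSON-typed expression:

```python
from pathlib import Path

import pixeltable as pxt
from pixeltable.utils.coco import write_coco_dataset

# Hypothetical setup: a table 'films' with a JSON-typed column 'annotations'.
t = pxt.get_table('films')
df = t.select(t.annotations)  # exactly one JSON-typed select-list expression
json_path = write_coco_dataset(df, Path('/tmp/coco_export'))
print(json_path)  # path of the generated data.json
```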
pixeltable/utils/documents.py
CHANGED
pixeltable/utils/filecache.py
CHANGED
```diff
@@ -1,28 +1,33 @@
 from __future__ import annotations
-
-from collections import OrderedDict, defaultdict, namedtuple
-import os
+
 import glob
-
-from time import time
+import hashlib
 import logging
+import os
+import warnings
+from collections import OrderedDict, defaultdict, namedtuple
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Optional
 from uuid import UUID
-import hashlib
 
+import pixeltable.exceptions as excs
 from pixeltable.env import Env
 
-
 _logger = logging.getLogger('pixeltable')
 
+@dataclass
 class CacheEntry:
-    def __init__(self, key: str, tbl_id: UUID, col_id: int, size: int, last_accessed_ts: int, ext: str):
-        self.key = key
-        self.tbl_id = tbl_id
-        self.col_id = col_id
-        self.size = size
-        self.last_accessed_ts = last_accessed_ts
-        self.ext = ext
 
+    key: str
+    tbl_id: UUID
+    col_id: int
+    size: int
+    last_used: datetime
+    ext: str
+
+    @property
     def path(self) -> Path:
         return Env.get().file_cache_dir / f'{self.tbl_id.hex}_{self.col_id}_{self.key}{self.ext}'
 
@@ -34,7 +39,11 @@ class CacheEntry:
         col_id = int(components[1])
         key = components[2]
         file_info = os.stat(str(path))
-        …
+        # We use the last modified time (file_info.st_mtime) as the timestamp; `FileCache` will touch the file
+        # each time it is retrieved, so that the mtime of the file will always represent the last used time of
+        # the cache entry.
+        last_used = datetime.fromtimestamp(file_info.st_mtime, tz=timezone.utc)
+        return cls(key, tbl_id, col_id, file_info.st_size, last_used, path.suffix)
 
 
 class FileCache:
@@ -45,31 +54,60 @@ class FileCache:
     access of a cache entries is its file's mtime.
 
     TODO:
-    - enforce a maximum capacity with LRU eviction
     - implement MRU eviction for queries that exceed the capacity
     """
-    …
-    …
-    …
-    …
+    __instance: Optional[FileCache] = None
+
+    cache: OrderedDict[str, CacheEntry]
+    total_size: int
+    capacity_bytes: int
+    num_requests: int
+    num_hits: int
+    num_evictions: int
+    keys_retrieved: set[str]  # keys retrieved (downloaded or accessed) this session
+    keys_evicted_after_retrieval: set[str]  # keys that were evicted after having been retrieved this session
+
+    # A key is added to this set when it is already present in `keys_evicted_this_session` and is downloaded again.
+    # In other words, for a key to be added to this set, the following sequence of events must occur in this order:
+    # - It is retrieved during this session (either because it was newly downloaded, or because it was in the cache
+    #   at the start of the session and was accessed at some point during the session)
+    # - It is subsequently evicted
+    # - It is subsequently retrieved a second time ("download after a previous retrieval")
+    # The contents of this set will be used to generate a more informative warning.
+    evicted_working_set_keys: set[str]
+    new_redownload_witnessed: bool  # whether a new re-download has occurred since the last time a warning was issued
+
+    FileCacheColumnStats = namedtuple('FileCacheColumnStats', ('tbl_id', 'col_id', 'num_files', 'total_size'))
+    FileCacheStats = namedtuple(
+        'FileCacheStats',
+        ('total_size', 'num_requests', 'num_hits', 'num_evictions', 'column_stats')
+    )
 
     @classmethod
     def get(cls) -> FileCache:
-        if cls.
-        cls.
-        return cls.
+        if cls.__instance is None:
+            cls.init()
+        return cls.__instance
+
+    @classmethod
+    def init(cls) -> None:
+        cls.__instance = cls()
 
     def __init__(self):
-        self.cache
+        self.cache = OrderedDict()
         self.total_size = 0
-        …
+        self.capacity_bytes = int(Env.get()._file_cache_size_g * (1 << 30))
         self.num_requests = 0
         self.num_hits = 0
         self.num_evictions = 0
+        self.keys_retrieved = set()
+        self.keys_evicted_after_retrieval = set()
+        self.evicted_working_set_keys = set()
+        self.new_redownload_witnessed = False
         paths = glob.glob(str(Env.get().file_cache_dir / '*'))
         entries = [CacheEntry.from_file(Path(path_str)) for path_str in paths]
-        # we need to insert entries in order
-        entries.sort(key=lambda e: e.
+        # we need to insert entries in access order
+        entries.sort(key=lambda e: e.last_used)
        for entry in entries:
            self.cache[entry.key] = entry
            self.total_size += entry.size
@@ -82,30 +120,43 @@ class FileCache:
     def num_files(self, tbl_id: Optional[UUID] = None) -> int:
         if tbl_id is None:
             return len(self.cache)
-        …
-        return len(entries)
+        return sum(e.tbl_id == tbl_id for e in self.cache.values())
 
-    def clear(self, tbl_id: Optional[UUID] = None
+    def clear(self, tbl_id: Optional[UUID] = None) -> None:
         """
         For testing purposes: allow resetting capacity and stats.
         """
-        …
-        …
-        …
-        …
-        …
+        if tbl_id is None:
+            # We need to store the entries to remove in a list, because we can't remove items from a dict while iterating
+            entries_to_remove = list(self.cache.values())
+            _logger.debug(f'clearing {self.num_files()} entries from file cache')
+            self.num_requests, self.num_hits, self.num_evictions = 0, 0, 0
+            self.keys_retrieved.clear()
+            self.keys_evicted_after_retrieval.clear()
+            self.new_redownload_witnessed = False
         else:
-            …
-            …
+            entries_to_remove = [e for e in self.cache.values() if e.tbl_id == tbl_id]
+            _logger.debug(f'clearing {self.num_files(tbl_id)} entries from file cache for table {tbl_id}')
+        for entry in entries_to_remove:
+            os.remove(entry.path)
             del self.cache[entry.key]
             self.total_size -= entry.size
-        …
-        …
-        …
-        …
-        …
-        …
-        …
+
+    def emit_eviction_warnings(self) -> None:
+        if self.new_redownload_witnessed:
+            # Compute the additional capacity that would be needed in order to retain all the re-downloaded files
+            extra_capacity_needed = sum(self.cache[key].size for key in self.evicted_working_set_keys)
+            suggested_cache_size = self.capacity_bytes + extra_capacity_needed + (1 << 30)
+            warnings.warn(
+                f'{len(self.evicted_working_set_keys)} media file(s) had to be downloaded multiple times this session, '
+                'because they were evicted\nfrom the file cache after their first access. The total size '
+                f'of the evicted file(s) is {round(extra_capacity_needed / (1 << 30), 1)} GiB.\n'
+                f'Consider increasing the cache size to at least {round(suggested_cache_size / (1 << 30), 1)} GiB '
+                f'(it is currently {round(self.capacity_bytes / (1 << 30), 1)} GiB).\n'
+                f'You can do this by setting the value of `file_cache_size_g` in: {str(Env.get()._config_file)}',
+                excs.PixeltableWarning
+            )
+            self.new_redownload_witnessed = False
 
     def _url_hash(self, url: str) -> str:
         h = hashlib.sha256()
@@ -120,75 +171,71 @@ class FileCache:
             _logger.debug(f'file cache miss for {url}')
             return None
         # update mtime and cache
-        path = entry.path()
+        path = entry.path
         path.touch(exist_ok=True)
         file_info = os.stat(str(path))
-        entry.
+        entry.last_used = datetime.fromtimestamp(file_info.st_mtime)
         self.cache.move_to_end(key, last=True)
         self.num_hits += 1
+        self.keys_retrieved.add(key)
         _logger.debug(f'file cache hit for {url}')
         return path
 
-    # def can_admit(self, query_ts: int) -> bool:
-    #     if self.total_size + self.avg_file_size <= self.capacity:
-    #         return True
-    #     assert len(self.cache) > 0
-    #     # check whether we can evict the current lru entry
-    #     lru_entry = next(iter(self.cache.values()))
-    #     if lru_entry.last_accessed_ts >= query_ts:
-    #         # the current query brought this entry in: we're not going to evict it
-    #         return False
-    #     return True
-
     def add(self, tbl_id: UUID, col_id: int, url: str, path: Path) -> Path:
         """Adds url at 'path' to cache and returns its new path.
         'path' will not be accessible after this call. Retains the extension of 'path'.
         """
         file_info = os.stat(str(path))
-
-        #if self.total_size + file_info.st_size > self.capacity:
-        if False:
-            if len(self.cache) == 0:
-                # nothing to evict
-                return
-            # evict entries until we're below the limit or until we run into entries the current query brought in
-            while True:
-                lru_entry = next(iter(self.cache.values()))
-                if lru_entry.last_accessed_ts >= query_ts:
-                    # the current query brought this entry in: switch to MRU and ignore this put()
-                    _logger.debug('file cache switched to MRU')
-                    return
-                self.cache.popitem(last=False)
-                self.total_size -= lru_entry.size
-                self.num_evictions += 1
-                os.remove(str(lru_entry.path()))
-                _logger.debug(f'evicted entry for cell {lru_entry.cell_id} from file cache')
-                if self.total_size + file_info.st_size <= self.capacity:
-                    break
-
+        self.ensure_capacity(file_info.st_size)
         key = self._url_hash(url)
         assert key not in self.cache
-        …
+        if key in self.keys_evicted_after_retrieval:
+            # This key was evicted after being retrieved earlier this session, and is now being retrieved again.
+            # Add it to `keys_multiply_downloaded` so that we may generate a warning later.
+            self.evicted_working_set_keys.add(key)
+            self.new_redownload_witnessed = True
+        self.keys_retrieved.add(key)
+        entry = CacheEntry(key, tbl_id, col_id, file_info.st_size, datetime.fromtimestamp(file_info.st_mtime), path.suffix)
         self.cache[key] = entry
         self.total_size += entry.size
-        new_path = entry.path()
+        new_path = entry.path
         os.rename(str(path), str(new_path))
+        new_path.touch(exist_ok=True)
         _logger.debug(f'added entry for cell {url} to file cache')
         return new_path
 
-    def
+    def ensure_capacity(self, size: int) -> None:
+        """
+        Evict entries from the cache until there is at least 'size' bytes of free space.
+        """
+        while len(self.cache) > 0 and self.total_size + size > self.capacity_bytes:
+            _, lru_entry = self.cache.popitem(last=False)
+            self.total_size -= lru_entry.size
+            self.num_evictions += 1
+            if lru_entry.key in self.keys_retrieved:
+                # This key was retrieved at some point earlier this session and is now being evicted.
+                # Make a record of the eviction, so that we can generate a warning later if the key is retrieved again.
+                self.keys_evicted_after_retrieval.add(lru_entry.key)
+            os.remove(str(lru_entry.path))
+            _logger.debug(f'evicted entry for cell {lru_entry.key} from file cache (of size {lru_entry.size // (1 << 20)} MiB)')
+
+    def set_capacity(self, capacity_bytes: int) -> None:
+        self.capacity_bytes = capacity_bytes
+        self.ensure_capacity(0)  # evict entries if necessary
+
+    def stats(self) -> FileCacheStats:
         # collect column stats
         # (tbl_id, col_id) -> (num_files, total_size)
-        d:
+        d: dict[tuple[UUID, int], list[int]] = defaultdict(lambda: [0, 0])
         for entry in self.cache.values():
             t = d[(entry.tbl_id, entry.col_id)]
             t[0] += 1
             t[1] += entry.size
         col_stats = [
-            self.
+            self.FileCacheColumnStats(tbl_id, col_id, num_files, size) for (tbl_id, col_id), (num_files, size) in d.items()
        ]
        col_stats.sort(key=lambda e: e[3], reverse=True)
-        return self.
+        return self.FileCacheStats(self.total_size, self.num_requests, self.num_hits, self.num_evictions, col_stats)
 
     def debug_print(self) -> None:
         for entry in self.cache.values():
```
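Taken together, these hunks replace the dead, commented-out eviction code with working LRU eviction: `self.cache` is an `OrderedDict` kept in access order (hits call `move_to_end`), and the new `ensure_capacity` pops from the LRU end until the incoming file fits. A standalone sketch of that pattern, with plain byte counts standing in for `CacheEntry` (this is an illustration, not Pixeltable's API):

```python
from collections import OrderedDict

class LruByteCache:
    """Minimal sketch of the OrderedDict-based LRU scheme used by FileCache."""

    def __init__(self, capacity_bytes: int) -> None:
        self.cache: OrderedDict[str, int] = OrderedDict()  # key -> entry size in bytes
        self.total_size = 0
        self.capacity_bytes = capacity_bytes

    def lookup(self, key: str) -> bool:
        if key not in self.cache:
            return False
        self.cache.move_to_end(key, last=True)  # mark as most recently used
        return True

    def ensure_capacity(self, size: int) -> None:
        # evict from the LRU end until 'size' additional bytes fit
        while len(self.cache) > 0 and self.total_size + size > self.capacity_bytes:
            _, evicted_size = self.cache.popitem(last=False)
            self.total_size -= evicted_size

    def add(self, key: str, size: int) -> None:
        self.ensure_capacity(size)
        self.cache[key] = size
        self.total_size += size
```

Persisting `last_used` in each file's mtime is what lets `__init__` rebuild the same access order after a restart: it sorts the on-disk entries by mtime before re-inserting them.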
pixeltable/utils/formatter.py
CHANGED
```diff
@@ -201,7 +201,7 @@ class Formatter:
         # try generating a thumbnail for different types and use that if successful
         if file_path.lower().endswith('.pdf'):
             try:
-                import fitz
+                import fitz  # type: ignore[import-untyped]
 
                 doc = fitz.open(file_path)
                 p = doc.get_page_pixmap(0)
```
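For reference, the surrounding code path renders page 0 of a PDF with PyMuPDF (`fitz`) and uses it as a thumbnail. A self-contained sketch of that technique — the helper name is ours, and the PIL conversion is one common recipe, not necessarily what `Formatter` does downstream:

```python
import fitz  # type: ignore[import-untyped]  # PyMuPDF
import PIL.Image

def pdf_thumbnail(file_path: str) -> PIL.Image.Image:
    doc = fitz.open(file_path)
    p = doc.get_page_pixmap(0)  # rasterize the first page
    # wrap the raw RGB samples in a PIL image
    return PIL.Image.frombytes('RGB', (p.width, p.height), p.samples)
```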
pixeltable/utils/http_server.py
CHANGED
```diff
@@ -1,11 +1,8 @@
 import http
 import http.server
 import logging
-import urllib
-import posixpath
 import pathlib
-import
-import string
+import urllib
 
 _logger = logging.getLogger('pixeltable.http.server')
 
@@ -43,7 +40,7 @@ class AbsolutePathHandler(http.server.SimpleHTTPRequestHandler):
     def log_message(self, format, *args) -> None:
         """override logging to stderr in http.server.BaseHTTPRequestHandler"""
         message = format % args
-        _logger.info(message.translate(self._control_char_table))
+        _logger.info(message.translate(self._control_char_table))  # type: ignore[attr-defined]
 
 
 class LoggingHTTPServer(http.server.ThreadingHTTPServer):
```
pixeltable/utils/media_store.py
CHANGED
```diff
@@ -3,9 +3,9 @@ import os
 import re
 import shutil
 import uuid
-from typing import Optional, List, Tuple, Dict
-from pathlib import Path
 from collections import defaultdict
+from pathlib import Path
+from typing import Optional
 from uuid import UUID
 
 from pixeltable.env import Env
@@ -46,8 +46,8 @@ class MediaStore:
         else:
             # Remove only the elements for the specified version.
             paths = glob.glob(str(Env.get().media_dir / tbl_id.hex) + f'/**/{tbl_id.hex}_*_{version}_*', recursive=True)
-            for
-            os.remove(
+            for p in paths:
+                os.remove(p)
 
     @classmethod
     def count(cls, tbl_id: UUID) -> int:
@@ -58,10 +58,10 @@ class MediaStore:
         return len(paths)
 
     @classmethod
-    def stats(cls) ->
+    def stats(cls) -> list[tuple[UUID, int, int, int]]:
         paths = glob.glob(str(Env.get().media_dir) + "/**", recursive=True)
         # key: (tbl_id, col_id), value: (num_files, size)
-        d:
+        d: dict[tuple[UUID, int], list[int]] = defaultdict(lambda: [0, 0])
         for p in paths:
             if not os.path.isdir(p):
                 matched = re.match(cls.pattern, Path(p).name)
```
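The newly annotated `d` is the same two-counter aggregation idiom that `FileCache.stats` uses: a `defaultdict` mapping `(tbl_id, col_id)` to a mutable `[num_files, total_size]` pair. A minimal illustration, with strings standing in for table UUIDs and made-up sizes:

```python
from collections import defaultdict

# (tbl_id, col_id, size) triples; sample data for illustration only
files = [('tbl_a', 0, 1024), ('tbl_a', 0, 2048), ('tbl_b', 3, 512)]

d: dict[tuple[str, int], list[int]] = defaultdict(lambda: [0, 0])  # -> [num_files, total_size]
for tbl_id, col_id, size in files:
    t = d[(tbl_id, col_id)]
    t[0] += 1
    t[1] += size

print(dict(d))  # {('tbl_a', 0): [2, 3072], ('tbl_b', 3): [1, 512]}
```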
pixeltable/utils/pytorch.py
CHANGED
```diff
@@ -2,13 +2,13 @@ import datetime
 import io
 import json
 from pathlib import Path
-from typing import Any,
+from typing import Any, Iterator, Sequence
 
 import numpy as np
 import PIL.Image
-import pyarrow as pa
 import torch
 import torch.utils.data
+import torchvision  # type: ignore[import-untyped]
 from pyarrow import parquet
 
 from pixeltable.type_system import ColumnType
@@ -41,7 +41,7 @@ class PixeltablePytorchDataset(torch.utils.data.IterableDataset):
         with column_type_path.open() as f:
             column_types = json.load(f)
         self.column_types = {k: ColumnType.from_dict(v) for k, v in column_types.items()}
-        self.part_metadata = parquet.ParquetDataset(path).files
+        self.part_metadata: list = parquet.ParquetDataset(str(path)).files
 
     def _unmarshall(self, k: str, v: Any) -> Any:
         if self.column_types[k].is_image_type():
@@ -54,7 +54,6 @@ class PixeltablePytorchDataset(torch.utils.data.IterableDataset):
             return arr
 
         assert self.image_format == "pt"
-        import torchvision
 
         # use arr instead of im in ToTensor() to guarantee array input
         # to torch.from_numpy is writable. Using im is a suspected cause of
@@ -77,17 +76,17 @@ class PixeltablePytorchDataset(torch.utils.data.IterableDataset):
         assert not isinstance(v, np.ndarray)  # all array outputs should be handled above
         return v
 
-    def __iter__(self) -> Iterator[
-        …
+    def __iter__(self) -> Iterator[dict[str, Any]]:
+        from pixeltable.utils import arrow
+
         worker_info = torch.utils.data.get_worker_info()
 
-        …
-        …
-        …
-        part_list = [ i for i in part_list if (i % worker_info.num_workers) == worker_info.id ]
+        part_list: Sequence[int] = range(len(self.part_metadata))
+        if worker_info is not None:
+            part_list = [i for i in part_list if (i % worker_info.num_workers) == worker_info.id]
 
         for part_no in part_list:
             pqf = parquet.ParquetFile(self.part_metadata[part_no])
             for batch in pqf.iter_batches():
                 for tup in arrow.iter_tuples(batch):
-                    yield {k: self._unmarshall(k, v) for k, v in tup.items()}
+                    yield {k: self._unmarshall(k, v) for k, v in tup.items()}
```
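The `__iter__` rewrite fixes single-process iteration: `torch.utils.data.get_worker_info()` returns `None` when the dataset is iterated outside a `DataLoader` worker, so the modulo shard filter must be conditional. A standalone sketch of the round-robin sharding scheme (the class name is ours, for illustration):

```python
from typing import Iterator, Sequence

import torch.utils.data

class ShardedParts(torch.utils.data.IterableDataset):
    """Each DataLoader worker k yields parts k, k + num_workers, k + 2*num_workers, ..."""

    def __init__(self, num_parts: int) -> None:
        self.num_parts = num_parts

    def __iter__(self) -> Iterator[int]:
        worker_info = torch.utils.data.get_worker_info()
        part_list: Sequence[int] = range(self.num_parts)
        if worker_info is not None:  # None when iterating without DataLoader workers
            part_list = [i for i in part_list if (i % worker_info.num_workers) == worker_info.id]
        yield from part_list
```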
pixeltable/utils/sql.py
CHANGED
```diff
@@ -1,10 +1,11 @@
 import logging
 
 import sqlalchemy as sql
+from sqlalchemy.dialects import postgresql
 
 
 def log_stmt(logger: logging.Logger, stmt) -> None:
-    logger.debug(f'executing {str(stmt.compile(dialect=
+    logger.debug(f'executing {str(stmt.compile(dialect=postgresql.dialect()))}')
 
 def log_explain(logger: logging.Logger, stmt: sql.sql.ClauseElement, conn: sql.engine.Connection) -> None:
     try:
```
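Compiling against an explicit `postgresql.dialect()` (rather than whatever engine the statement happens to be bound to) renders Postgres-specific SQL even with no live connection. A small demonstration with a made-up table:

```python
import sqlalchemy as sql
from sqlalchemy.dialects import postgresql

t = sql.table('media', sql.column('tbl_id'), sql.column('size'))
stmt = sql.select(t.c.tbl_id).where(t.c.size > 1024)
# literal_binds inlines parameters so the logged statement is self-contained
print(stmt.compile(dialect=postgresql.dialect(), compile_kwargs={'literal_binds': True}))
```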
{pixeltable-0.2.19.dist-info → pixeltable-0.2.21.dist-info}/METADATA
CHANGED
```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: pixeltable
-Version: 0.2.19
+Version: 0.2.21
 Summary: Pixeltable: The Multimodal AI Data Plane
 Author: Pixeltable, Inc.
 Author-email: contact@pixeltable.com
@@ -31,6 +31,7 @@ Requires-Dist: pyyaml (>=6.0.1,<7.0.0)
 Requires-Dist: requests (>=2.31.0,<3.0.0)
 Requires-Dist: sqlalchemy (>=2.0.23,<3.0.0)
 Requires-Dist: tenacity (>=8.2,<9.0)
+Requires-Dist: toml (>=0.10)
 Requires-Dist: tqdm (>=4.64)
 Description-Content-Type: text/markdown
 
@@ -38,18 +39,26 @@ Description-Content-Type: text/markdown
 <img src="https://raw.githubusercontent.com/pixeltable/pixeltable/main/docs/source/data/pixeltable-logo-large.png" alt="Pixeltable" width="50%" />
 <br></br>
 
-[
-…
-](https://opensource.org/licenses/Apache-2.0)
+…
+…
+…
 <br>
 […](https://github.com/pixeltable/pixeltable/actions/workflows/pytest.yml)
 […](https://github.com/pixeltable/pixeltable/actions/workflows/nightly.yml)
-[…](https://pypi.org/project/pixeltable/)
+…
+<a target="_blank" href="https://huggingface.co/Pixeltable"> <img src="https://img.shields.io/badge/🤗-HF Space-F25022" alt="Visit our Hugging Face space"/></a>
 
-[Installation](https://pixeltable.github.io/pixeltable/getting-started/) | [Documentation](https://pixeltable.readme.io/) | [API Reference](https://pixeltable.github.io/pixeltable/) | [Code Samples](https://pixeltable
+[Installation](https://pixeltable.github.io/pixeltable/getting-started/) | [Documentation](https://pixeltable.readme.io/) | [API Reference](https://pixeltable.github.io/pixeltable/) | [Code Samples](https://github.com/pixeltable/pixeltable?tab=readme-ov-file#-code-samples) | [Computer Vision](https://docs.pixeltable.com/docs/object-detection-in-videos) | [LLM](https://docs.pixeltable.com/docs/document-indexing-and-rag)
 </div>
 
-Pixeltable is a Python library providing a declarative interface for multimodal data (text, images, audio, video). It features built-in versioning, lineage tracking, and incremental updates, enabling users to store
+Pixeltable is a Python library providing a declarative interface for multimodal data (text, images, audio, video). It features built-in versioning, lineage tracking, and incremental updates, enabling users to **store**, **transform**, **index**, and **iterate** on data for their ML workflows.
+
+Data transformations, model inference, and custom logic are embedded as **computed columns**.
+- **Load/Query all data types**: Interact with [video data](https://github.com/pixeltable/pixeltable?tab=readme-ov-file#import-media-data-into-pixeltable-videos-images-audio) at the [frame level](https://github.com/pixeltable/pixeltable?tab=readme-ov-file#text-and-image-similarity-search-on-video-frames-with-embedding-indexes) and documents at the [chunk level](https://github.com/pixeltable/pixeltable?tab=readme-ov-file#automate-data-operations-with-views-eg-split-documents-into-chunks)
+- **Incremental updates for data transformation**: Maintain an [embedding index](https://docs.pixeltable.com/docs/embedding-vector-indexes) colocated with your data
+- **Lazy evaluation and cache management**: Eliminates the need for [manual frame extraction](https://docs.pixeltable.com/docs/object-detection-in-videos)
+- **Integrates with any Python libraries**: Use [built-in and custom functions (UDFs)](https://docs.pixeltable.com/docs/user-defined-functions-udfs) without complex pipelines
+- **Data format agnostic and extensibility**: Access tables as Parquet files, [PyTorch datasets](https://pixeltable.github.io/pixeltable/api/data-frame/#pixeltable.DataFrame.to_pytorch_dataset), or [COCO annotations](https://pixeltable.github.io/pixeltable/api/table/#pixeltable.Table.to_coco_dataset)
 
 ## 💾 Installation
 
```