PyPI - pixeltable - Versions diffs - 0.2.26__py3-none-any.whl → 0.5.7__py3-none-any.whl - Mend

pixeltable-0.2.26-py3-none-any.whl → pixeltable-0.5.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (245)

pixeltable/__init__.py +83 -19
pixeltable/_query.py +1444 -0
pixeltable/_version.py +1 -0
pixeltable/catalog/__init__.py +7 -4
pixeltable/catalog/catalog.py +2394 -119
pixeltable/catalog/column.py +225 -104
pixeltable/catalog/dir.py +38 -9
pixeltable/catalog/globals.py +53 -34
pixeltable/catalog/insertable_table.py +265 -115
pixeltable/catalog/path.py +80 -17
pixeltable/catalog/schema_object.py +28 -43
pixeltable/catalog/table.py +1270 -677
pixeltable/catalog/table_metadata.py +103 -0
pixeltable/catalog/table_version.py +1270 -751
pixeltable/catalog/table_version_handle.py +109 -0
pixeltable/catalog/table_version_path.py +137 -42
pixeltable/catalog/tbl_ops.py +53 -0
pixeltable/catalog/update_status.py +191 -0
pixeltable/catalog/view.py +251 -134
pixeltable/config.py +215 -0
pixeltable/env.py +736 -285
pixeltable/exceptions.py +26 -2
pixeltable/exec/__init__.py +7 -2
pixeltable/exec/aggregation_node.py +39 -21
pixeltable/exec/cache_prefetch_node.py +87 -109
pixeltable/exec/cell_materialization_node.py +268 -0
pixeltable/exec/cell_reconstruction_node.py +168 -0
pixeltable/exec/component_iteration_node.py +25 -28
pixeltable/exec/data_row_batch.py +11 -46
pixeltable/exec/exec_context.py +26 -11
pixeltable/exec/exec_node.py +35 -27
pixeltable/exec/expr_eval/__init__.py +3 -0
pixeltable/exec/expr_eval/evaluators.py +365 -0
pixeltable/exec/expr_eval/expr_eval_node.py +413 -0
pixeltable/exec/expr_eval/globals.py +200 -0
pixeltable/exec/expr_eval/row_buffer.py +74 -0
pixeltable/exec/expr_eval/schedulers.py +413 -0
pixeltable/exec/globals.py +35 -0
pixeltable/exec/in_memory_data_node.py +35 -27
pixeltable/exec/object_store_save_node.py +293 -0
pixeltable/exec/row_update_node.py +44 -29
pixeltable/exec/sql_node.py +414 -115
pixeltable/exprs/__init__.py +8 -5
pixeltable/exprs/arithmetic_expr.py +79 -45
pixeltable/exprs/array_slice.py +5 -5
pixeltable/exprs/column_property_ref.py +40 -26
pixeltable/exprs/column_ref.py +254 -61
pixeltable/exprs/comparison.py +14 -9
pixeltable/exprs/compound_predicate.py +9 -10
pixeltable/exprs/data_row.py +213 -72
pixeltable/exprs/expr.py +270 -104
pixeltable/exprs/expr_dict.py +6 -5
pixeltable/exprs/expr_set.py +20 -11
pixeltable/exprs/function_call.py +383 -284
pixeltable/exprs/globals.py +18 -5
pixeltable/exprs/in_predicate.py +7 -7
pixeltable/exprs/inline_expr.py +37 -37
pixeltable/exprs/is_null.py +8 -4
pixeltable/exprs/json_mapper.py +120 -54
pixeltable/exprs/json_path.py +90 -60
pixeltable/exprs/literal.py +61 -16
pixeltable/exprs/method_ref.py +7 -6
pixeltable/exprs/object_ref.py +19 -8
pixeltable/exprs/row_builder.py +238 -75
pixeltable/exprs/rowid_ref.py +53 -15
pixeltable/exprs/similarity_expr.py +65 -50
pixeltable/exprs/sql_element_cache.py +5 -5
pixeltable/exprs/string_op.py +107 -0
pixeltable/exprs/type_cast.py +25 -13
pixeltable/exprs/variable.py +2 -2
pixeltable/func/__init__.py +9 -5
pixeltable/func/aggregate_function.py +197 -92
pixeltable/func/callable_function.py +119 -35
pixeltable/func/expr_template_function.py +101 -48
pixeltable/func/function.py +375 -62
pixeltable/func/function_registry.py +20 -19
pixeltable/func/globals.py +6 -5
pixeltable/func/mcp.py +74 -0
pixeltable/func/query_template_function.py +151 -35
pixeltable/func/signature.py +178 -49
pixeltable/func/tools.py +164 -0
pixeltable/func/udf.py +176 -53
pixeltable/functions/__init__.py +44 -4
pixeltable/functions/anthropic.py +226 -47
pixeltable/functions/audio.py +148 -11
pixeltable/functions/bedrock.py +137 -0
pixeltable/functions/date.py +188 -0
pixeltable/functions/deepseek.py +113 -0
pixeltable/functions/document.py +81 -0
pixeltable/functions/fal.py +76 -0
pixeltable/functions/fireworks.py +72 -20
pixeltable/functions/gemini.py +249 -0
pixeltable/functions/globals.py +208 -53
pixeltable/functions/groq.py +108 -0
pixeltable/functions/huggingface.py +1088 -95
pixeltable/functions/image.py +155 -84
pixeltable/functions/json.py +8 -11
pixeltable/functions/llama_cpp.py +31 -19
pixeltable/functions/math.py +169 -0
pixeltable/functions/mistralai.py +50 -75
pixeltable/functions/net.py +70 -0
pixeltable/functions/ollama.py +29 -36
pixeltable/functions/openai.py +548 -160
pixeltable/functions/openrouter.py +143 -0
pixeltable/functions/replicate.py +15 -14
pixeltable/functions/reve.py +250 -0
pixeltable/functions/string.py +310 -85
pixeltable/functions/timestamp.py +37 -19
pixeltable/functions/together.py +77 -120
pixeltable/functions/twelvelabs.py +188 -0
pixeltable/functions/util.py +7 -2
pixeltable/functions/uuid.py +30 -0
pixeltable/functions/video.py +1528 -117
pixeltable/functions/vision.py +26 -26
pixeltable/functions/voyageai.py +289 -0
pixeltable/functions/whisper.py +19 -10
pixeltable/functions/whisperx.py +179 -0
pixeltable/functions/yolox.py +112 -0
pixeltable/globals.py +716 -236
pixeltable/index/__init__.py +3 -1
pixeltable/index/base.py +17 -21
pixeltable/index/btree.py +32 -22
pixeltable/index/embedding_index.py +155 -92
pixeltable/io/__init__.py +12 -7
pixeltable/io/datarows.py +140 -0
pixeltable/io/external_store.py +83 -125
pixeltable/io/fiftyone.py +24 -33
pixeltable/io/globals.py +47 -182
pixeltable/io/hf_datasets.py +96 -127
pixeltable/io/label_studio.py +171 -156
pixeltable/io/lancedb.py +3 -0
pixeltable/io/pandas.py +136 -115
pixeltable/io/parquet.py +40 -153
pixeltable/io/table_data_conduit.py +702 -0
pixeltable/io/utils.py +100 -0
pixeltable/iterators/__init__.py +8 -4
pixeltable/iterators/audio.py +207 -0
pixeltable/iterators/base.py +9 -3
pixeltable/iterators/document.py +144 -87
pixeltable/iterators/image.py +17 -38
pixeltable/iterators/string.py +15 -12
pixeltable/iterators/video.py +523 -127
pixeltable/metadata/__init__.py +33 -8
pixeltable/metadata/converters/convert_10.py +2 -3
pixeltable/metadata/converters/convert_13.py +2 -2
pixeltable/metadata/converters/convert_15.py +15 -11
pixeltable/metadata/converters/convert_16.py +4 -5
pixeltable/metadata/converters/convert_17.py +4 -5
pixeltable/metadata/converters/convert_18.py +4 -6
pixeltable/metadata/converters/convert_19.py +6 -9
pixeltable/metadata/converters/convert_20.py +3 -6
pixeltable/metadata/converters/convert_21.py +6 -8
pixeltable/metadata/converters/convert_22.py +3 -2
pixeltable/metadata/converters/convert_23.py +33 -0
pixeltable/metadata/converters/convert_24.py +55 -0
pixeltable/metadata/converters/convert_25.py +19 -0
pixeltable/metadata/converters/convert_26.py +23 -0
pixeltable/metadata/converters/convert_27.py +29 -0
pixeltable/metadata/converters/convert_28.py +13 -0
pixeltable/metadata/converters/convert_29.py +110 -0
pixeltable/metadata/converters/convert_30.py +63 -0
pixeltable/metadata/converters/convert_31.py +11 -0
pixeltable/metadata/converters/convert_32.py +15 -0
pixeltable/metadata/converters/convert_33.py +17 -0
pixeltable/metadata/converters/convert_34.py +21 -0
pixeltable/metadata/converters/convert_35.py +9 -0
pixeltable/metadata/converters/convert_36.py +38 -0
pixeltable/metadata/converters/convert_37.py +15 -0
pixeltable/metadata/converters/convert_38.py +39 -0
pixeltable/metadata/converters/convert_39.py +124 -0
pixeltable/metadata/converters/convert_40.py +73 -0
pixeltable/metadata/converters/convert_41.py +12 -0
pixeltable/metadata/converters/convert_42.py +9 -0
pixeltable/metadata/converters/convert_43.py +44 -0
pixeltable/metadata/converters/util.py +44 -18
pixeltable/metadata/notes.py +21 -0
pixeltable/metadata/schema.py +185 -42
pixeltable/metadata/utils.py +74 -0
pixeltable/mypy/__init__.py +3 -0
pixeltable/mypy/mypy_plugin.py +123 -0
pixeltable/plan.py +616 -225
pixeltable/share/__init__.py +3 -0
pixeltable/share/packager.py +797 -0
pixeltable/share/protocol/__init__.py +33 -0
pixeltable/share/protocol/common.py +165 -0
pixeltable/share/protocol/operation_types.py +33 -0
pixeltable/share/protocol/replica.py +119 -0
pixeltable/share/publish.py +349 -0
pixeltable/store.py +398 -232
pixeltable/type_system.py +730 -267
pixeltable/utils/__init__.py +40 -0
pixeltable/utils/arrow.py +201 -29
pixeltable/utils/av.py +298 -0
pixeltable/utils/azure_store.py +346 -0
pixeltable/utils/coco.py +26 -27
pixeltable/utils/code.py +4 -4
pixeltable/utils/console_output.py +46 -0
pixeltable/utils/coroutine.py +24 -0
pixeltable/utils/dbms.py +92 -0
pixeltable/utils/description_helper.py +11 -12
pixeltable/utils/documents.py +60 -61
pixeltable/utils/exception_handler.py +36 -0
pixeltable/utils/filecache.py +38 -22
pixeltable/utils/formatter.py +88 -51
pixeltable/utils/gcs_store.py +295 -0
pixeltable/utils/http.py +133 -0
pixeltable/utils/http_server.py +14 -13
pixeltable/utils/iceberg.py +13 -0
pixeltable/utils/image.py +17 -0
pixeltable/utils/lancedb.py +90 -0
pixeltable/utils/local_store.py +322 -0
pixeltable/utils/misc.py +5 -0
pixeltable/utils/object_stores.py +573 -0
pixeltable/utils/pydantic.py +60 -0
pixeltable/utils/pytorch.py +20 -20
pixeltable/utils/s3_store.py +527 -0
pixeltable/utils/sql.py +32 -5
pixeltable/utils/system.py +30 -0
pixeltable/utils/transactional_directory.py +4 -3
pixeltable-0.5.7.dist-info/METADATA +579 -0
pixeltable-0.5.7.dist-info/RECORD +227 -0
{pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info}/WHEEL +1 -1
pixeltable-0.5.7.dist-info/entry_points.txt +2 -0
pixeltable/__version__.py +0 -3
pixeltable/catalog/named_function.py +0 -36
pixeltable/catalog/path_dict.py +0 -141
pixeltable/dataframe.py +0 -894
pixeltable/exec/expr_eval_node.py +0 -232
pixeltable/ext/__init__.py +0 -14
pixeltable/ext/functions/__init__.py +0 -8
pixeltable/ext/functions/whisperx.py +0 -77
pixeltable/ext/functions/yolox.py +0 -157
pixeltable/tool/create_test_db_dump.py +0 -311
pixeltable/tool/create_test_video.py +0 -81
pixeltable/tool/doc_plugins/griffe.py +0 -50
pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
pixeltable/tool/embed_udf.py +0 -9
pixeltable/tool/mypy_plugin.py +0 -55
pixeltable/utils/media_store.py +0 -76
pixeltable/utils/s3.py +0 -16
pixeltable-0.2.26.dist-info/METADATA +0 -400
pixeltable-0.2.26.dist-info/RECORD +0 -156
pixeltable-0.2.26.dist-info/entry_points.txt +0 -3
{pixeltable-0.2.26.dist-info → pixeltable-0.5.7.dist-info/licenses}/LICENSE +0 -0

pixeltable/utils/description_helper.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import dataclasses
-from typing import Optional, Union
 import pandas as pd
 from pandas.io.formats.style import Styler
@@ -7,11 +6,11 @@ from pandas.io.formats.style import Styler
 @dataclasses.dataclass
 class _Descriptor:
-    body: Union[str, pd.DataFrame]
+    body: str | pd.DataFrame
     # The remaining fields only affect the behavior if `body` is a pd.DataFrame.
     show_index: bool
     show_header: bool
-    styler: Optional[Styler] = None
+    styler: Styler | None = None
 class DescriptionHelper:
@@ -25,6 +24,7 @@ class DescriptionHelper:
     DescriptionHelper can convert a list of descriptors into either HTML or plaintext and do something reasonable
     in each case.
     """
     __descriptors: list[_Descriptor]
     def __init__(self) -> None:
@@ -32,10 +32,10 @@ class DescriptionHelper:
     def append(
         self,
-        descriptor: Union[str, pd.DataFrame],
+        descriptor: str | pd.DataFrame,
         show_index: bool = False,
         show_header: bool = True,
-        styler: Optional[Styler] = None,
+        styler: Styler | None = None,
     ) -> None:
         self.__descriptors.append(_Descriptor(descriptor, show_index, show_header, styler))
@@ -69,18 +69,17 @@ class DescriptionHelper:
             return (
                 # Render the string as a single-cell DataFrame. This will ensure a consistent style of output in
                 # cases where strings appear alongside DataFrames in the same DescriptionHelper.
-                pd.DataFrame([descriptor.body]).style
-                .set_properties(None, **{'white-space': 'pre-wrap', 'text-align': 'left', 'font-weight': 'bold'})
-                .hide(axis='index').hide(axis='columns')
+                pd.DataFrame([descriptor.body])
+                .style.set_properties(None, **{'white-space': 'pre-wrap', 'text-align': 'left', 'font-weight': 'bold'})
+                .hide(axis='index')
+                .hide(axis='columns')
             )
         else:
             styler = descriptor.styler
             if styler is None:
                 styler = descriptor.body.style
-            styler = (
-                styler
-                .set_properties(None, **{'white-space': 'pre-wrap', 'text-align': 'left'})
-                .set_table_styles([dict(selector='th', props=[('text-align', 'left')])])
+            styler = styler.set_properties(None, **{'white-space': 'pre-wrap', 'text-align': 'left'}).set_table_styles(
+                [{'selector': 'th', 'props': [('text-align', 'left')]}]
             )
             if not descriptor.show_header:
                 styler = styler.hide(axis='columns')

pixeltable/utils/documents.py CHANGED Viewed

@@ -1,86 +1,85 @@
 import dataclasses
-from typing import Optional
+import os
 import bs4
-import fitz  # type: ignore[import-untyped]
 import puremagic
+from pypdfium2 import PdfDocument  # type: ignore[import-untyped]
-import pixeltable.type_system as ts
+from pixeltable import exceptions as excs, type_system as ts
 from pixeltable.env import Env
 @dataclasses.dataclass
 class DocumentHandle:
     format: ts.DocumentType.DocumentFormat
-    bs_doc: Optional[bs4.BeautifulSoup] = None
-    md_ast: Optional[dict] = None
-    pdf_doc: Optional[fitz.Document] = None
+    bs_doc: bs4.BeautifulSoup | None = None
+    md_ast: dict | None = None
+    pdf_doc: PdfDocument | None = None
+    txt_doc: str | None = None
-def get_document_handle(path: str) -> Optional[DocumentHandle]:
-    doc_format = puremagic.from_file(path)
+def get_document_handle(path: str) -> DocumentHandle:
+    _, extension = os.path.splitext(path)
+    handle = get_handle_by_extension(path, extension)
+    if handle is not None:
+        return handle
-    if doc_format == '.pdf':
-        pdf_doc = get_pdf_handle(path)
-        if pdf_doc is not None:
-            return DocumentHandle(format=ts.DocumentType.DocumentFormat.PDF, pdf_doc=pdf_doc)
+    # if no extension, use puremagic to determine the type
+    extension = puremagic.from_file(path)
+    handle = get_handle_by_extension(path, extension)
+    if handle is not None:
+        return handle
-    if doc_format == '.html':
-        bs_doc = get_html_handle(path)
-        if bs_doc is not None:
-            return DocumentHandle(format=ts.DocumentType.DocumentFormat.HTML, bs_doc=bs_doc)
+    raise excs.Error(f'Unrecognized document format: {path}')
-    if doc_format == '.md':
-        md_ast = get_markdown_handle(path)
-        if md_ast is not None:
-            return DocumentHandle(format=ts.DocumentType.DocumentFormat.MD, md_ast=md_ast)
-    if doc_format == '.xml':
-        bs_doc = get_xml_handle(path)
-        if bs_doc is not None:
-            return DocumentHandle(format=ts.DocumentType.DocumentFormat.XML, bs_doc=bs_doc)
+def get_handle_by_extension(path: str, extension: str) -> DocumentHandle | None:
+    doc_format = ts.DocumentType.DocumentFormat.from_extension(extension)
-    return None
-def get_pdf_handle(path: str) -> Optional[fitz.Document]:
-    try:
-        doc = fitz.open(path)
-        # check pdf (bc it will work for images)
-        if not doc.is_pdf:
-            return None
-        # try to read one page
-        next(page for page in doc)
-        return doc
-    except Exception:
-        return None
-def get_html_handle(path: str) -> Optional[bs4.BeautifulSoup]:
     try:
-        with open(path, 'r', encoding='utf8') as fp:
-            doc = bs4.BeautifulSoup(fp, 'lxml')
-        return doc if doc.find() is not None else None
-    except Exception:
-        return None
+        if doc_format == ts.DocumentType.DocumentFormat.HTML:
+            return DocumentHandle(doc_format, bs_doc=get_html_handle(path))
+        if doc_format == ts.DocumentType.DocumentFormat.MD:
+            return DocumentHandle(doc_format, md_ast=get_markdown_handle(path))
+        if doc_format == ts.DocumentType.DocumentFormat.PDF:
+            return DocumentHandle(doc_format, pdf_doc=PdfDocument(path))
+        if doc_format == ts.DocumentType.DocumentFormat.XML:
+            return DocumentHandle(doc_format, bs_doc=get_xml_handle(path))
+        if doc_format == ts.DocumentType.DocumentFormat.TXT:
+            return DocumentHandle(doc_format, txt_doc=get_txt(path))
+    except Exception as exc:
+        raise excs.Error(f'An error occurred processing a {doc_format} document: {path}') from exc
+    return None
-def get_xml_handle(path: str) -> Optional[bs4.BeautifulSoup]:
-    try:
-        with open(path, 'r', encoding='utf8') as fp:
-            doc = bs4.BeautifulSoup(fp, 'xml')
-        return doc if doc.find() is not None else None
-    except Exception:
-        return None
+def get_html_handle(path: str) -> bs4.BeautifulSoup:
+    with open(path, 'r', encoding='utf8') as fp:
+        doc = bs4.BeautifulSoup(fp, 'lxml')
+    if doc.find() is None:
+        raise excs.Error(f'Not a valid HTML document: {path}')
+    return doc
-def get_markdown_handle(path: str) -> Optional[dict]:
+def get_markdown_handle(path: str) -> dict:
     Env.get().require_package('mistune', [3, 0])
     import mistune
-    try:
-        with open(path, encoding='utf8') as file:
-            text = file.read()
-        md_ast = mistune.create_markdown(renderer=None)
-        return md_ast(text)
-    except Exception:
-        return None
+    with open(path, encoding='utf8') as file:
+        text = file.read()
+    md_ast = mistune.create_markdown(renderer=None)
+    return md_ast(text)
+def get_xml_handle(path: str) -> bs4.BeautifulSoup:
+    with open(path, 'r', encoding='utf8') as fp:
+        doc = bs4.BeautifulSoup(fp, 'xml')
+    if doc.find() is None:
+        raise excs.Error(f'Not a valid XML document: {path}')
+    return doc
+def get_txt(path: str) -> str:
+    with open(path, 'r', encoding='utf-8') as fp:
+        doc = fp.read()
+    return doc

pixeltable/utils/exception_handler.py ADDED Viewed

@@ -0,0 +1,36 @@
+import logging
+from typing import Any, Callable, TypeVar
+R = TypeVar('R')
+logger = logging.getLogger('pixeltable')
+def run_cleanup(cleanup_func: Callable[..., R], *args: Any, raise_error: bool = True, **kwargs: Any) -> R | None:
+    """
+    Runs a cleanup function. If interrupted, retry cleanup.
+    The `run_cleanup()` function ensures that the `cleanup_func()` function executes at least once.
+    If the `cleanup_func()` is interrupted during execution, it will be retried.
+    Args:
+        cleanup_func: an idempotent function
+        raise_error: raise an exception if an error occurs during cleanup.
+    """
+    try:
+        logger.debug(f'Running cleanup function: {cleanup_func.__name__!r}')
+        return cleanup_func(*args, **kwargs)
+    except KeyboardInterrupt as interrupt:
+        # Save original exception and re-attempt cleanup
+        original_exception = interrupt
+        logger.debug(f'Cleanup {cleanup_func.__name__!r} interrupted, retrying')
+        try:
+            return cleanup_func(*args, **kwargs)
+        except Exception as e:
+            # Suppress this exception
+            logger.error(f'Cleanup {cleanup_func.__name__!r} failed with exception {e.__class__}: {e}')
+        raise KeyboardInterrupt from original_exception
+    except Exception as e:
+        logger.error(f'Cleanup {cleanup_func.__name__!r} failed with exception {e.__class__}: {e}')
+        if raise_error:
+            raise e
+    return None

pixeltable/utils/filecache.py CHANGED Viewed

@@ -5,21 +5,22 @@ import hashlib
 import logging
 import os
 import warnings
-from collections import OrderedDict, defaultdict, namedtuple
+from collections import OrderedDict, defaultdict
 from dataclasses import dataclass
 from datetime import datetime, timezone
 from pathlib import Path
-from typing import Optional
+from typing import NamedTuple
 from uuid import UUID
 import pixeltable.exceptions as excs
+from pixeltable.config import Config
 from pixeltable.env import Env
 _logger = logging.getLogger('pixeltable')
 @dataclass
 class CacheEntry:
     key: str
     tbl_id: UUID
     col_id: int
@@ -56,7 +57,8 @@ class FileCache:
     TODO:
     - implement MRU eviction for queries that exceed the capacity
     """
-    __instance: Optional[FileCache] = None
+    __instance: FileCache | None = None
     cache: OrderedDict[str, CacheEntry]
     total_size: int
@@ -77,11 +79,18 @@ class FileCache:
     evicted_working_set_keys: set[str]
     new_redownload_witnessed: bool  # whether a new re-download has occurred since the last time a warning was issued
-    FileCacheColumnStats = namedtuple('FileCacheColumnStats', ('tbl_id', 'col_id', 'num_files', 'total_size'))
-    FileCacheStats = namedtuple(
-        'FileCacheStats',
-        ('total_size', 'num_requests', 'num_hits', 'num_evictions', 'column_stats')
-    )
+    class FileCacheColumnStats(NamedTuple):
+        tbl_id: UUID
+        col_id: int
+        num_files: int
+        total_size: int
+    class FileCacheStats(NamedTuple):
+        total_size: int
+        num_requests: int
+        num_hits: int
+        num_evictions: int
+        column_stats: list[FileCache.FileCacheColumnStats]
     @classmethod
     def get(cls) -> FileCache:
@@ -93,7 +102,7 @@ class FileCache:
     def init(cls) -> None:
         cls.__instance = cls()
-    def __init__(self):
+    def __init__(self) -> None:
         self.cache = OrderedDict()
         self.total_size = 0
         self.capacity_bytes = int(Env.get()._file_cache_size_g * (1 << 30))
@@ -117,17 +126,18 @@ class FileCache:
             return 0
         return int(self.total_size / len(self.cache))
-    def num_files(self, tbl_id: Optional[UUID] = None) -> int:
+    def num_files(self, tbl_id: UUID | None = None) -> int:
         if tbl_id is None:
             return len(self.cache)
         return sum(e.tbl_id == tbl_id for e in self.cache.values())
-    def clear(self, tbl_id: Optional[UUID] = None) -> None:
+    def clear(self, tbl_id: UUID | None = None) -> None:
         """
         For testing purposes: allow resetting capacity and stats.
         """
         if tbl_id is None:
-            # We need to store the entries to remove in a list, because we can't remove items from a dict while iterating
+            # We need to store the entries to remove in a list, because we can't remove items from a dict
+            # while iterating
             entries_to_remove = list(self.cache.values())
             _logger.debug(f'clearing {self.num_files()} entries from file cache')
             self.num_requests, self.num_hits, self.num_evictions = 0, 0, 0
@@ -153,8 +163,9 @@ class FileCache:
                 f'of the evicted file(s) is {round(extra_capacity_needed / (1 << 30), 1)} GiB.\n'
                 f'Consider increasing the cache size to at least {round(suggested_cache_size / (1 << 30), 1)} GiB '
                 f'(it is currently {round(self.capacity_bytes / (1 << 30), 1)} GiB).\n'
-                f'You can do this by setting the value of `file_cache_size_g` in: {str(Env.get()._config_file)}',
-                excs.PixeltableWarning
+                f'You can do this by setting the value of `file_cache_size_g` in: {Config.get().config_file}',
+                excs.PixeltableWarning,
+                stacklevel=2,
             )
             self.new_redownload_witnessed = False
@@ -163,7 +174,7 @@ class FileCache:
         h.update(url.encode())
         return h.hexdigest()
-    def lookup(self, url: str) -> Optional[Path]:
+    def lookup(self, url: str) -> Path | None:
         self.num_requests += 1
         key = self._url_hash(url)
         entry = self.cache.get(key, None)
@@ -195,13 +206,15 @@ class FileCache:
             self.evicted_working_set_keys.add(key)
             self.new_redownload_witnessed = True
         self.keys_retrieved.add(key)
-        entry = CacheEntry(key, tbl_id, col_id, file_info.st_size, datetime.fromtimestamp(file_info.st_mtime), path.suffix)
+        entry = CacheEntry(
+            key, tbl_id, col_id, file_info.st_size, datetime.fromtimestamp(file_info.st_mtime), path.suffix
+        )
         self.cache[key] = entry
         self.total_size += entry.size
         new_path = entry.path
         os.rename(str(path), str(new_path))
         new_path.touch(exist_ok=True)
-        _logger.debug(f'added entry for cell {url} to file cache')
+        _logger.debug(f'FileCache: cached url {url} with file name {new_path}')
         return new_path
     def ensure_capacity(self, size: int) -> None:
@@ -217,7 +230,9 @@ class FileCache:
                 # Make a record of the eviction, so that we can generate a warning later if the key is retrieved again.
                 self.keys_evicted_after_retrieval.add(lru_entry.key)
             os.remove(str(lru_entry.path))
-            _logger.debug(f'evicted entry for cell {lru_entry.key} from file cache (of size {lru_entry.size // (1 << 20)} MiB)')
+            _logger.debug(
+                f'evicted entry for cell {lru_entry.key} from file cache (of size {lru_entry.size // (1 << 20)} MiB)'
+            )
     def set_capacity(self, capacity_bytes: int) -> None:
         self.capacity_bytes = capacity_bytes
@@ -228,15 +243,16 @@ class FileCache:
         # (tbl_id, col_id) -> (num_files, total_size)
         d: dict[tuple[UUID, int], list[int]] = defaultdict(lambda: [0, 0])
         for entry in self.cache.values():
-            t = d[(entry.tbl_id, entry.col_id)]
+            t = d[entry.tbl_id, entry.col_id]
             t[0] += 1
             t[1] += entry.size
         col_stats = [
-            self.FileCacheColumnStats(tbl_id, col_id, num_files, size) for (tbl_id, col_id), (num_files, size) in d.items()
+            self.FileCacheColumnStats(tbl_id, col_id, num_files, size)
+            for (tbl_id, col_id), (num_files, size) in d.items()
         ]
         col_stats.sort(key=lambda e: e[3], reverse=True)
         return self.FileCacheStats(self.total_size, self.num_requests, self.num_hits, self.num_evictions, col_stats)
     def debug_print(self) -> None:
         for entry in self.cache.values():
-            print(f'CacheEntry: tbl_id={entry.tbl_id}, col_id={entry.col_id}, size={entry.size}')
+            _logger.debug(f'CacheEntry: tbl_id={entry.tbl_id}, col_id={entry.col_id}, size={entry.size}')

pixeltable/utils/formatter.py CHANGED Viewed

@@ -4,12 +4,13 @@ import io
 import json
 import logging
 import mimetypes
-from typing import Any, Callable, Optional
+import uuid
+from typing import Any, Callable
-import av  # type: ignore[import-untyped]
+import av
 import numpy as np
-import PIL
-import PIL.Image as Image
+from PIL import Image
+from pypdfium2 import PdfDocument  # type: ignore[import-untyped]
 import pixeltable.type_system as ts
 from pixeltable.utils.http_server import get_file_uri
@@ -20,11 +21,11 @@ _logger = logging.getLogger('pixeltable')
 class Formatter:
     """
     A factory for constructing HTML formatters for Pixeltable data. The formatters are used to customize
-    the rendering of `DataFrameResultSet`s in notebooks.
+    the rendering of `ResultSet`s in notebooks.
     Args:
-        num_rows: Number of rows in the DataFrame being rendered.
-        num_cols: Number of columns in the DataFrame being rendered.
+        num_rows: Number of rows in the `ResultSet` being rendered.
+        num_cols: Number of columns in the `ResultSet` being rendered.
         http_address: Root address of the Pixeltable HTTP server (used to construct URLs for media references).
     """
@@ -40,9 +41,13 @@ class Formatter:
         self.__num_cols = num_cols
         self.__http_address = http_address
-    def get_pandas_formatter(self, col_type: ts.ColumnType) -> Optional[Callable]:
+    def get_pandas_formatter(self, col_type: ts.ColumnType) -> Callable | None:
         if col_type.is_string_type():
             return self.format_string
+        if col_type.is_uuid_type():
+            return self.format_uuid
+        if col_type.is_binary_type():
+            return self.format_binary
         if col_type.is_float_type():
             return self.format_float
         if col_type.is_json_type():
@@ -64,10 +69,24 @@ class Formatter:
         """
         Escapes special characters in `val`, and abbreviates `val` if its length exceeds `_STRING_MAX_LEN`.
         """
-        return cls.__escape(cls.__abbreviate(val, cls.__STRING_MAX_LEN))
+        return cls.__escape(cls.abbreviate(val))
     @classmethod
-    def __abbreviate(cls, val: str, max_len: int) -> str:
+    def format_uuid(cls, val: uuid.UUID | None) -> str:
+        """
+        Formats a UUID by converting it to a string and applying string formatting.
+        """
+        return '' if val is None else cls.format_string(str(val))
+    @classmethod
+    def format_binary(cls, val: bytes) -> str:
+        """
+        Formats binary data by converting it to an encoded string and applying string formatting.
+        """
+        return cls.format_string(str(val))
+    @classmethod
+    def abbreviate(cls, val: str, max_len: int = __STRING_MAX_LEN) -> str:
         if len(val) > max_len:
             edgeitems = (max_len - len(cls.__STRING_SEP)) // 2
             return f'{val[:edgeitems]}{cls.__STRING_SEP}{val[-edgeitems:]}'
@@ -95,41 +114,45 @@ class Formatter:
         )
     @classmethod
-    def format_json(cls, val: Any) -> str:
+    def format_json(cls, val: Any, escape_strings: bool = True) -> str:
         if isinstance(val, str):
             # JSON-like formatting will be applied to strings that appear nested within a list or dict
             # (quote the string; escape any quotes inside the string; shorter abbreviations).
             # However, if the string appears in top-level position (i.e., the entire JSON value is a
             # string), then we format it like an ordinary string.
-            return cls.format_string(val)
+            return cls.format_string(val) if escape_strings else cls.abbreviate(val)
         # In all other cases, dump the JSON struct recursively.
-        return cls.__format_json_rec(val)
+        return cls.__format_json_rec(val, escape_strings)
     @classmethod
-    def __format_json_rec(cls, val: Any) -> str:
+    def __format_json_rec(cls, val: Any, escape_strings: bool) -> str:
         if isinstance(val, str):
-            return cls.__escape(json.dumps(cls.__abbreviate(val, cls.__NESTED_STRING_MAX_LEN)))
+            formatted = json.dumps(cls.abbreviate(val, cls.__NESTED_STRING_MAX_LEN))
+            return cls.__escape(formatted) if escape_strings else formatted
         if isinstance(val, float):
             return cls.format_float(val)
         if isinstance(val, np.ndarray):
             return cls.format_array(val)
         if isinstance(val, list):
             if len(val) < cls.__LIST_THRESHOLD:
-                components = [cls.__format_json_rec(x) for x in val]
+                components = [cls.__format_json_rec(x, escape_strings) for x in val]
             else:
-                components = [cls.__format_json_rec(x) for x in val[: cls.__LIST_EDGEITEMS]]
+                components = [cls.__format_json_rec(x, escape_strings) for x in val[: cls.__LIST_EDGEITEMS]]
                 components.append('...')
-                components.extend(cls.__format_json_rec(x) for x in val[-cls.__LIST_EDGEITEMS :])
+                components.extend(cls.__format_json_rec(x, escape_strings) for x in val[-cls.__LIST_EDGEITEMS :])
             return '[' + ', '.join(components) + ']'
         if isinstance(val, dict):
-            kv_pairs = (f'{cls.__format_json_rec(k)}: {cls.__format_json_rec(v)}' for k, v in val.items())
+            kv_pairs = (
+                f'{cls.__format_json_rec(k, escape_strings)}: {cls.__format_json_rec(v, escape_strings)}'
+                for k, v in val.items()
+            )
             return '{' + ', '.join(kv_pairs) + '}'
         # Everything else
         try:
             return json.dumps(val)
         except TypeError:  # Not JSON serializable
-            return str(val)
+            return cls.__escape(str(val))
     def format_img(self, img: Image.Image) -> str:
         """
@@ -153,22 +176,19 @@ class Formatter:
             """
     def format_video(self, file_path: str) -> str:
-        thumb_tag = ''
         # Attempt to extract the first frame of the video to use as a thumbnail,
         # so that the notebook can be exported as HTML and viewed in contexts where
         # the video itself is not accessible.
         # TODO(aaron-siegel): If the video is backed by a concrete external URL,
         # should we link to that instead?
-        with av.open(file_path) as container:
-            try:
-                thumb = next(container.decode(video=0)).to_image()
-                assert isinstance(thumb, Image.Image)
-                with io.BytesIO() as buffer:
-                    thumb.save(buffer, 'jpeg')
-                    thumb_base64 = base64.b64encode(buffer.getvalue()).decode()
-                    thumb_tag = f'poster="data:image/jpeg;base64,{thumb_base64}"'
-            except Exception:
-                pass
+        thumb = self.extract_first_video_frame(file_path)
+        if thumb is None:
+            thumb_tag = ''
+        else:
+            with io.BytesIO() as buffer:
+                thumb.save(buffer, 'jpeg')
+                thumb_base64 = base64.b64encode(buffer.getvalue()).decode()
+                thumb_tag = f'poster="data:image/jpeg;base64,{thumb_base64}"'
         if self.__num_rows > 1:
             width = 320
         elif self.__num_cols > 1:
@@ -183,6 +203,16 @@ class Formatter:
         </div>
         """
+    @classmethod
+    def extract_first_video_frame(cls, file_path: str) -> Image.Image | None:
+        with av.open(file_path) as container:
+            try:
+                img = next(container.decode(video=0)).to_image()
+                assert isinstance(img, Image.Image)
+                return img
+            except Exception:
+                return None
     def format_audio(self, file_path: str) -> str:
         return f"""
         <div class="pxt_audio">
@@ -192,29 +222,18 @@ class Formatter:
         </div>
         """
-    def format_document(self, file_path: str) -> str:
-        max_width = max_height = 320
+    def format_document(self, file_path: str, max_width: int = 320, max_height: int = 320) -> str:
         # by default, file path will be shown as a link
         inner_element = file_path
         inner_element = html.escape(inner_element)
-        # try generating a thumbnail for different types and use that if successful
-        if file_path.lower().endswith('.pdf'):
-            try:
-                import fitz  # type: ignore[import-untyped]
-                doc = fitz.open(file_path)
-                p = doc.get_page_pixmap(0)
-                while p.width > max_width or p.height > max_height:
-                    # shrink(1) will halve each dimension
-                    p.shrink(1)
-                data = p.tobytes(output='jpeg')
-                thumb_base64 = base64.b64encode(data).decode()
-                img_src = f'data:image/jpeg;base64,{thumb_base64}'
-                inner_element = f"""
-                    <img style="object-fit: contain; border: 1px solid black;" src="{img_src}" />
-                """
-            except:
-                logging.warning(f'Failed to produce PDF thumbnail {file_path}. Make sure you have PyMuPDF installed.')
+        thumb = self.make_document_thumbnail(file_path, max_width, max_height)
+        if thumb is not None:
+            with io.BytesIO() as buffer:
+                thumb.save(buffer, 'webp')
+                thumb_base64 = base64.b64encode(buffer.getvalue()).decode()
+                thumb_tag = f'data:image/webp;base64,{thumb_base64}'
+            inner_element = f'<img style="object-fit: contain; border: 1px solid black;" src="{thumb_tag}" />'
         return f"""
         <div class="pxt_document" style="width:{max_width}px;">
@@ -224,6 +243,24 @@ class Formatter:
         </div>
         """
+    @classmethod
+    def make_document_thumbnail(cls, file_path: str, max_width: int = 320, max_height: int = 320) -> Image.Image | None:
+        """
+        Returns a thumbnail image of a document.
+        """
+        if file_path.lower().endswith('.pdf'):
+            try:
+                doc = PdfDocument(file_path)
+                if len(doc) == 0:
+                    return None
+                img = doc[0].render().to_pil()
+                img.thumbnail((max_width, max_height), Image.LANCZOS)
+                return img
+            except Exception:
+                logging.warning(f'Failed to produce PDF thumbnail {file_path}. Make sure you have pypdfium2 installed.')
+        return None
     @classmethod
     def __create_source_tag(cls, http_address: str, file_path: str) -> str:
         src_url = get_file_uri(http_address, file_path)

pixeltable 0.2.26__py3-none-any.whl → 0.5.7__py3-none-any.whl

pixeltable 0.2.26__py3-none-any.whl → 0.5.7__py3-none-any.whl