pixeltable 0.2.25__py3-none-any.whl → 0.2.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (51)
  1. pixeltable/__version__.py +2 -2
  2. pixeltable/catalog/table.py +118 -44
  3. pixeltable/catalog/view.py +2 -2
  4. pixeltable/dataframe.py +240 -92
  5. pixeltable/env.py +8 -1
  6. pixeltable/exec/__init__.py +1 -1
  7. pixeltable/exec/exec_node.py +6 -7
  8. pixeltable/exec/sql_node.py +91 -44
  9. pixeltable/exprs/__init__.py +1 -0
  10. pixeltable/exprs/arithmetic_expr.py +1 -1
  11. pixeltable/exprs/array_slice.py +1 -1
  12. pixeltable/exprs/column_property_ref.py +1 -1
  13. pixeltable/exprs/column_ref.py +29 -2
  14. pixeltable/exprs/comparison.py +1 -1
  15. pixeltable/exprs/compound_predicate.py +1 -1
  16. pixeltable/exprs/expr.py +11 -5
  17. pixeltable/exprs/expr_set.py +8 -0
  18. pixeltable/exprs/function_call.py +14 -11
  19. pixeltable/exprs/in_predicate.py +1 -1
  20. pixeltable/exprs/inline_expr.py +3 -3
  21. pixeltable/exprs/is_null.py +1 -1
  22. pixeltable/exprs/json_mapper.py +1 -1
  23. pixeltable/exprs/json_path.py +1 -1
  24. pixeltable/exprs/method_ref.py +1 -1
  25. pixeltable/exprs/rowid_ref.py +1 -1
  26. pixeltable/exprs/similarity_expr.py +1 -1
  27. pixeltable/exprs/sql_element_cache.py +4 -0
  28. pixeltable/exprs/type_cast.py +2 -2
  29. pixeltable/exprs/variable.py +3 -0
  30. pixeltable/func/expr_template_function.py +3 -0
  31. pixeltable/functions/__init__.py +2 -2
  32. pixeltable/functions/gemini.py +85 -0
  33. pixeltable/functions/ollama.py +4 -4
  34. pixeltable/globals.py +4 -1
  35. pixeltable/io/__init__.py +1 -1
  36. pixeltable/io/parquet.py +39 -19
  37. pixeltable/iterators/document.py +12 -0
  38. pixeltable/metadata/__init__.py +1 -1
  39. pixeltable/metadata/converters/convert_22.py +17 -0
  40. pixeltable/metadata/notes.py +1 -0
  41. pixeltable/plan.py +128 -50
  42. pixeltable/store.py +1 -1
  43. pixeltable/type_system.py +2 -1
  44. pixeltable/utils/arrow.py +8 -3
  45. pixeltable/utils/description_helper.py +89 -0
  46. pixeltable/utils/documents.py +14 -0
  47. {pixeltable-0.2.25.dist-info → pixeltable-0.2.27.dist-info}/METADATA +26 -10
  48. {pixeltable-0.2.25.dist-info → pixeltable-0.2.27.dist-info}/RECORD +51 -48
  49. {pixeltable-0.2.25.dist-info → pixeltable-0.2.27.dist-info}/WHEEL +1 -1
  50. {pixeltable-0.2.25.dist-info → pixeltable-0.2.27.dist-info}/LICENSE +0 -0
  51. {pixeltable-0.2.25.dist-info → pixeltable-0.2.27.dist-info}/entry_points.txt +0 -0

pixeltable/functions/gemini.py ADDED
@@ -0,0 +1,85 @@
+"""
+Pixeltable [UDFs](https://pixeltable.readme.io/docs/user-defined-functions-udfs)
+that wrap various endpoints from the Google Gemini API. In order to use them, you must
+first `pip install google-generativeai` and configure your Gemini credentials, as described in
+the [Working with Gemini](https://pixeltable.readme.io/docs/working-with-gemini) tutorial.
+"""
+
+from typing import Optional
+
+import pixeltable as pxt
+from pixeltable import env
+
+
+@env.register_client('gemini')
+def _(api_key: str) -> None:
+    import google.generativeai as genai  # type: ignore[import-untyped]
+    genai.configure(api_key=api_key)
+
+
+def _ensure_loaded() -> None:
+    env.Env.get().get_client('gemini')
+
+
+@pxt.udf
+def generate_content(
+    contents: str,
+    *,
+    model_name: str,
+    candidate_count: Optional[int] = None,
+    stop_sequences: Optional[list[str]] = None,
+    max_output_tokens: Optional[int] = None,
+    temperature: Optional[float] = None,
+    top_p: Optional[float] = None,
+    top_k: Optional[int] = None,
+    response_mime_type: Optional[str] = None,
+    response_schema: Optional[dict] = None,
+    presence_penalty: Optional[float] = None,
+    frequency_penalty: Optional[float] = None,
+    response_logprobs: Optional[bool] = None,
+    logprobs: Optional[int] = None,
+) -> dict:
+    """
+    Generate content from the specified model. For additional details, see:
+    <https://ai.google.dev/gemini-api/docs>
+
+    __Requirements:__
+
+    - `pip install google-generativeai`
+
+    Args:
+        contents: The input content to generate from.
+        model_name: The name of the model to use.
+
+    For details on the other parameters, see: <https://ai.google.dev/gemini-api/docs>
+
+    Returns:
+        A dictionary containing the response and other metadata.
+
+    Examples:
+        Add a computed column that applies the model `gemini-1.5-flash`
+        to an existing Pixeltable column `tbl.prompt` of the table `tbl`:
+
+        >>> tbl['response'] = generate_content(tbl.prompt, model_name='gemini-1.5-flash')
+    """
+    env.Env.get().require_package('google.generativeai')
+    _ensure_loaded()
+    import google.generativeai as genai
+
+    model = genai.GenerativeModel(model_name=model_name)
+    gc = genai.GenerationConfig(
+        candidate_count=candidate_count,
+        stop_sequences=stop_sequences,
+        max_output_tokens=max_output_tokens,
+        temperature=temperature,
+        top_p=top_p,
+        top_k=top_k,
+        response_mime_type=response_mime_type,
+        response_schema=response_schema,
+        presence_penalty=presence_penalty,
+        frequency_penalty=frequency_penalty,
+        response_logprobs=response_logprobs,
+        logprobs=logprobs,
+    )
+    response = model.generate_content(contents, generation_config=gc)
+    return response.to_dict()
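
For orientation, a minimal end-to-end sketch of the new UDF. The table, directory, and column names are hypothetical, and the response-dict layout is an assumption based on the Gemini API's `to_dict()` output; only the computed-column call itself is taken from the docstring above.

```python
import pixeltable as pxt
from pixeltable.functions.gemini import generate_content

# hypothetical table; assumes Gemini credentials are configured per the tutorial
tbl = pxt.create_table('demo.prompts', {'prompt': pxt.String})

# computed column invoking the new UDF; each row stores the full response dict
tbl['response'] = generate_content(tbl.prompt, model_name='gemini-1.5-flash')

# pull the generated text out of the response dict (assumed to_dict() layout)
tbl['answer'] = tbl.response['candidates'][0]['content']['parts'][0]['text']

tbl.insert([{'prompt': 'Write a haiku about code reviews.'}])
print(tbl.select(tbl.answer).collect())
```
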
pixeltable/functions/ollama.py CHANGED
@@ -68,7 +68,7 @@ def generate(
         raw=raw,
         format=format,
         options=options,
-    )  # type: ignore[call-overload]
+    ).dict()  # type: ignore[call-overload]
 
 
 @pxt.udf
@@ -103,7 +103,7 @@ def chat(
         tools=tools,
         format=format,
         options=options,
-    )  # type: ignore[call-overload]
+    ).dict()  # type: ignore[call-overload]
 
 
 @pxt.udf(batch_size=16)
@@ -135,8 +135,8 @@ def embed(
         model=model,
         input=input,
         truncate=truncate,
-        options=options,  # type: ignore[arg-type]
-    )
+        options=options,
+    ).dict()
     return [np.array(data, dtype=np.float64) for data in results['embeddings']]
 
 
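All three ollama changes make the same fix: newer ollama-python clients return typed Pydantic response objects rather than plain dicts, so the UDFs now call `.dict()` to recover the JSON-serializable form their declared `dict` return type promises. A sketch of the behavior being relied on (assumes ollama-python ≥ 0.4 with a running local server; the model name is illustrative):

```python
import ollama

# generate()/chat()/embed() return Pydantic models in newer clients;
# .dict() converts the response into a plain, JSON-serializable dict
resp = ollama.generate(model='llama3.2', prompt='Say hi.')
data = resp.dict()
assert isinstance(data, dict) and 'response' in data
```
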
pixeltable/globals.py CHANGED
@@ -46,6 +46,7 @@ def create_table(
         num_retained_versions: Number of versions of the table to retain.
         comment: An optional comment; its meaning is user-defined.
         media_validation: Media validation policy for the table.
+
             - `'on_read'`: validate media files at query time
             - `'on_write'`: validate media files during insert/update operations
 
@@ -149,7 +150,9 @@ def create_view(
         tbl_version_path = base._tbl_version_path
     elif isinstance(base, DataFrame):
         base._validate_mutable('create_view')
-        tbl_version_path = base.tbl
+        if len(base._from_clause.tbls) > 1:
+            raise excs.Error('Cannot create a view of a join')
+        tbl_version_path = base._from_clause.tbls[0]
         where = base.where_clause
     else:
         raise excs.Error('`base` must be an instance of `Table` or `DataFrame`')
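
`create_view` now explicitly rejects a DataFrame whose from-clause references more than one table, a situation that can arise with the join support introduced alongside `DataFrame.from_clause` (see the metadata changes below). A sketch of the boundary; table and column names are hypothetical, and the join syntax is assumed from this release's `DataFrame` API:

```python
import pixeltable as pxt

orders = pxt.get_table('demo.orders')        # hypothetical tables
customers = pxt.get_table('demo.customers')

# still supported: a view over a single-table DataFrame with a filter
pxt.create_view('demo.big_orders', orders.where(orders.amount > 100))

# now rejected with an explicit error: the from-clause contains two tables
joined = orders.join(customers, on=orders.customer_id == customers.id)
pxt.create_view('demo.joined', joined)  # raises: Cannot create a view of a join
```
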
pixeltable/io/__init__.py CHANGED
@@ -2,7 +2,7 @@ from .external_store import ExternalStore, SyncStatus
 from .globals import create_label_studio_project, export_images_as_fo_dataset, import_json, import_rows
 from .hf_datasets import import_huggingface_dataset
 from .pandas import import_csv, import_excel, import_pandas
-from .parquet import import_parquet
+from .parquet import import_parquet, export_parquet
 
 __default_dir = set(symbol for symbol in dir() if not symbol.startswith('_'))
 __removed_symbols = {'globals', 'hf_datasets', 'pandas', 'parquet'}
pixeltable/io/parquet.py CHANGED
@@ -7,11 +7,14 @@ import random
 import typing
 from collections import deque
 from pathlib import Path
-from typing import Any, Optional
+from typing import Any, Optional, Union
 
 import numpy as np
 import PIL.Image
+import datetime
 
+import pixeltable as pxt
+from pixeltable.env import Env
 import pixeltable.exceptions as exc
 import pixeltable.type_system as ts
 from pixeltable.utils.transactional_directory import transactional_directory
@@ -39,28 +42,44 @@ def _write_batch(value_batch: dict[str, deque], schema: pa.Schema, output_path:
     parquet.write_table(tab, str(output_path))
 
 
-def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int = 100_000_000) -> None:
+def export_parquet(
+    table_or_df: Union[pxt.Table, pxt.DataFrame],
+    parquet_path: Path,
+    partition_size_bytes: int = 100_000_000,
+    inline_images: bool = False
+) -> None:
     """
-    Internal method to stream dataframe data to parquet format.
-    Does not materialize the dataset to memory.
+    Exports a dataframe's data to one or more Parquet files. Requires pyarrow to be installed.
 
-    It preserves pixeltable type metadata in a json file, which would otherwise
+    It additionally writes the pixeltable metadata in a json file, which would otherwise
     not be available in the parquet format.
 
-    Images are stored inline in a compressed format in their parquet file.
-
     Args:
-        df : dataframe to save.
-        dest_path : path to directory to save the parquet files to.
-        partition_size_bytes : maximum target size for each chunk. Default 100_000_000 bytes.
+        table_or_df : Table or Dataframe to export.
+        parquet_path : Path to directory to write the parquet files to.
+        partition_size_bytes : The maximum target size for each chunk. Default 100_000_000 bytes.
+        inline_images : If True, images are stored inline in the parquet file. This is useful
+            for small images, to be imported as pytorch dataset. But can be inefficient
+            for large images, and cannot be imported into pixeltable.
+            If False, will raise an error if the Dataframe has any image column.
+            Default False.
     """
     from pixeltable.utils.arrow import to_arrow_schema
 
+    df: pxt.DataFrame
+    if isinstance(table_or_df, pxt.catalog.Table):
+        df = table_or_df._df()
+    else:
+        df = table_or_df
+
     type_dict = {k: v.as_dict() for k, v in df.schema.items()}
     arrow_schema = to_arrow_schema(df.schema)
 
+    if not inline_images and any(col_type.is_image_type() for col_type in df.schema.values()):
+        raise exc.Error('Cannot export Dataframe with image columns when inline_images is False')
+
     # store the changes atomically
-    with transactional_directory(dest_path) as temp_path:
+    with transactional_directory(parquet_path) as temp_path:
         # dump metadata json file so we can inspect what was the source of the parquet file later on.
         json.dump(df.as_dict(), (temp_path / '.pixeltable.json').open('w'))
         json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w'))  # keep type metadata
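
Usage of the renamed, now-public entry point, following the signature above (table and column names and paths are hypothetical):

```python
from pathlib import Path

import pixeltable as pxt
from pixeltable.io import export_parquet

films = pxt.get_table('demo.films')  # hypothetical table without image columns

# export a whole table...
export_parquet(films, Path('/tmp/films_pq'))

# ...or a filtered DataFrame; image columns require inline_images=True
export_parquet(films.where(films.year >= 2000), Path('/tmp/recent_pq'))
```
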
@@ -111,6 +130,7 @@ def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int =
         elif col_type.is_bool_type():
             length = 1
         elif col_type.is_timestamp_type():
+            val = val.astimezone(datetime.timezone.utc)
             length = 8
         else:
             assert False, f'unknown type {col_type} for {col_name}'
@@ -139,7 +159,7 @@ def parquet_schema_to_pixeltable_schema(parquet_path: str) -> dict[str, Optional
 
 
 def import_parquet(
-    table_path: str,
+    table: str,
     *,
     parquet_path: str,
     schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
@@ -148,7 +168,7 @@ def import_parquet(
     """Creates a new base table from a Parquet file or set of files. Requires pyarrow to be installed.
 
     Args:
-        table_path: Path to the table.
+        table: Fully qualified name of the table to import the data into.
         parquet_path: Path to an individual Parquet file or directory of Parquet files.
         schema_overrides: If specified, then for each (name, type) pair in `schema_overrides`, the column with
             name `name` will be given type `type`, instead of being inferred from the Parquet dataset. The keys in
@@ -157,7 +177,7 @@ def import_parquet(
         kwargs: Additional arguments to pass to `create_table`.
 
     Returns:
-        A handle to the newly created [`Table`][pixeltable.Table].
+        A handle to the newly created table.
     """
     from pyarrow import parquet
 
@@ -176,11 +196,11 @@ def import_parquet(
         if v is None:
             raise exc.Error(f'Could not infer pixeltable type for column {k} from parquet file')
 
-    if table_path in pxt.list_tables():
-        raise exc.Error(f'Table {table_path} already exists')
+    if table in pxt.list_tables():
+        raise exc.Error(f'Table {table} already exists')
 
     try:
-        tmp_name = f'{table_path}_tmp_{random.randint(0, 100000000)}'
+        tmp_name = f'{table}_tmp_{random.randint(0, 100000000)}'
         tab = pxt.create_table(tmp_name, schema, **kwargs)
         for fragment in parquet_dataset.fragments:  # type: ignore[attr-defined]
             for batch in fragment.to_batches():
@@ -190,5 +210,5 @@ def import_parquet(
         _logger.error(f'Error while inserting Parquet file into table: {e}')
         raise e
 
-    pxt.move(tmp_name, table_path)
-    return pxt.get_table(table_path)
+    pxt.move(tmp_name, table)
+    return pxt.get_table(table)
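
The corresponding round trip through the renamed `table` parameter (the destination name is hypothetical; the path matches the export sketch above):

```python
import pixeltable as pxt

# `table` is the fully qualified name of the new table, not a filesystem path
t = pxt.io.import_parquet('demo.films_copy', parquet_path='/tmp/films_pq')
print(t.count())
```
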
pixeltable/iterators/document.py CHANGED
@@ -151,6 +151,9 @@ class DocumentSplitter(ComponentIterator):
         elif self._doc_handle.format == DocumentType.DocumentFormat.PDF:
             assert self._doc_handle.pdf_doc is not None
             self._sections = self._pdf_sections()
+        elif self._doc_handle.format == DocumentType.DocumentFormat.TXT:
+            assert self._doc_handle.txt_doc is not None
+            self._sections = self._txt_sections()
         else:
             assert False, f'Unsupported document format: {self._doc_handle.format}'
 
@@ -389,6 +392,15 @@ class DocumentSplitter(ComponentIterator):
         if accumulated_text and not emit_on_page:
             yield DocumentSection(text=_emit_text(), metadata=DocumentSectionMetadata())
 
+    def _txt_sections(self) -> Iterator[DocumentSection]:
+        """Create DocumentSections for text files.
+
+        Currently, it returns the entire text as a single section.
+        TODO: Add support for paragraphs.
+        """
+        assert self._doc_handle.txt_doc is not None
+        yield DocumentSection(text=ftfy.fix_text(self._doc_handle.txt_doc), metadata=DocumentSectionMetadata())
+
     def _sentence_sections(self, input_sections: Iterable[DocumentSection]) -> Iterator[DocumentSection]:
         """Split the input sections into sentences"""
         for section in input_sections:
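
With TXT handling in place, plain-text documents flow through `DocumentSplitter` like any other format. A sketch (table, view, and file names are hypothetical; `separators='sentence'` is assumed, since a TXT file currently yields a single section):

```python
import pixeltable as pxt
from pixeltable.iterators import DocumentSplitter

docs = pxt.create_table('demo.docs', {'document': pxt.Document})
docs.insert([{'document': '/path/to/notes.txt'}])  # hypothetical .txt file

# one row per sentence of the (single) text section
chunks = pxt.create_view(
    'demo.doc_chunks',
    docs,
    iterator=DocumentSplitter.create(document=docs.document, separators='sentence'),
)
```
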
pixeltable/metadata/__init__.py CHANGED
@@ -10,7 +10,7 @@ import sqlalchemy.orm as orm
 from .schema import SystemInfo, SystemInfoMd
 
 # current version of the metadata; this is incremented whenever the metadata schema changes
-VERSION = 22
+VERSION = 23
 
 
 def create_system_info(engine: sql.engine.Engine) -> None:
pixeltable/metadata/converters/convert_22.py ADDED
@@ -0,0 +1,17 @@
+from typing import Any, Optional
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_md
+
+
+@register_converter(version=22)
+def _(engine: sql.engine.Engine) -> None:
+    convert_table_md(engine, substitution_fn=__substitute_md)
+
+
+def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
+    if isinstance(v, dict) and '_classname' in v and v['_classname'] == 'DataFrame':
+        v['from_clause'] = {'tbls': [v['tbl']], 'join_clauses': []}
+        return k, v
+    return None
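
For reference, the substitution this converter applies, traced on a hypothetical pre-version-23 metadata record (the field values are illustrative; only the `from_clause` rewrite comes from the code above):

```python
# hypothetical stored metadata for a view's base DataFrame (metadata version 22)
md = {'_classname': 'DataFrame', 'tbl': {'id': 'abc123'}, 'where_clause': None}

# __substitute_md rewrites the single-table reference into a one-element
# from_clause with no joins, matching the new DataFrame serialization
md['from_clause'] = {'tbls': [md['tbl']], 'join_clauses': []}
assert md['from_clause'] == {'tbls': [{'id': 'abc123'}], 'join_clauses': []}
```
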
pixeltable/metadata/notes.py CHANGED
@@ -2,6 +2,7 @@
 # rather than as a comment, so that the existence of a description can be enforced by
 # the unit tests when new versions are added.
 VERSION_NOTES = {
+    23: 'DataFrame.from_clause',
     22: 'TableMd/ColumnMd.media_validation',
     21: 'Separate InlineArray and InlineList',
     20: 'Store DB timestamps in UTC',