pixeltable 0.2.3__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic.

@@ -61,7 +61,7 @@ class Column:
             raise excs.Error(f"Invalid column name: '{name}'")
         self.name = name
         if col_type is None and computed_with is None:
-            raise excs.Error(f'Column {name}: col_type is required if computed_with is not specified')
+            raise excs.Error(f'Column `{name}`: col_type is required if computed_with is not specified')
 
         self.value_expr: Optional['Expr'] = None
         self.compute_func: Optional[Callable] = None
pixeltable/client.py CHANGED
@@ -2,12 +2,11 @@ from typing import List, Optional, Dict, Type, Any, Union
 import pandas as pd
 import logging
 import dataclasses
-from uuid import UUID
-from collections import defaultdict
 
 import sqlalchemy as sql
 import sqlalchemy.orm as orm
 
+import pixeltable
 from pixeltable.metadata import schema
 from pixeltable.env import Env
 import pixeltable.func as func
@@ -16,6 +15,10 @@ from pixeltable import exceptions as excs
 from pixeltable.exprs import Predicate
 from pixeltable.iterators import ComponentIterator
 
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    import datasets
+
 __all__ = [
     'Client',
 ]
@@ -155,6 +158,73 @@ class Client:
         _logger.info(f'Created table `{path_str}`.')
         return tbl
 
+    def import_parquet(
+        self,
+        table_path: str,
+        *,
+        parquet_path: str,
+        schema_override: Optional[Dict[str, Any]] = None,
+        **kwargs,
+    ) -> catalog.InsertableTable:
+        """Create a new `InsertableTable` from a Parquet file or set of files. Requires pyarrow to be installed.
+
+        Args:
+            table_path: Path to the table within pixeltable.
+            parquet_path: Path to an individual Parquet file or directory of Parquet files.
+            schema_override: Optional dictionary mapping column names to column types, overriding the default
+                schema inferred from the Parquet file. Each column type should be a pixeltable ColumnType:
+                for example, {'col_vid': VideoType()} rather than {'col_vid': StringType()}.
+                Any columns not listed explicitly map to the types produced by
+                `pixeltable.utils.parquet.parquet_schema_to_pixeltable_schema`.
+            kwargs: Additional arguments to pass to `Client.create_table`.
+
+        Returns:
+            The newly created table, with the data from the Parquet file(s) loaded.
+        """
+        from pixeltable.utils import parquet
+
+        return parquet.import_parquet(
+            self,
+            table_path=table_path,
+            parquet_path=parquet_path,
+            schema_override=schema_override,
+            **kwargs,
+        )
+
+    def import_huggingface_dataset(
+        self,
+        table_path: str,
+        dataset: Union['datasets.Dataset', 'datasets.DatasetDict'],
+        *,
+        column_name_for_split: Optional[str] = 'split',
+        schema_override: Optional[Dict[str, Any]] = None,
+        **kwargs
+    ) -> catalog.InsertableTable:
+        """Create a new `InsertableTable` from a Huggingface dataset, or a dataset dict with multiple splits.
+        Requires the datasets library to be installed.
+
+        Args:
+            table_path: Path to the table within pixeltable.
+            dataset: Huggingface datasets.Dataset or datasets.DatasetDict to insert into the table.
+            column_name_for_split: Column name to use for split information. If None, no split information
+                will be stored.
+            schema_override: Optional dictionary mapping column names to column types, overriding the
+                corresponding defaults from `pixeltable.utils.hf_datasets.huggingface_schema_to_pixeltable_schema`.
+                Each column type should be a pixeltable ColumnType: for example, {'col_vid': VideoType()}
+                rather than {'col_vid': StringType()}.
+            kwargs: Additional arguments to pass to `create_table`.
+
+        Returns:
+            The newly created table, with the data from the dataset loaded.
+        """
+        from pixeltable.utils import hf_datasets
+
+        return hf_datasets.import_huggingface_dataset(
+            self,
+            table_path,
+            dataset,
+            column_name_for_split=column_name_for_split,
+            schema_override=schema_override,
+            **kwargs,
+        )
+
     def create_view(
         self, path_str: str, base: catalog.Table, *, schema: Optional[Dict[str, Any]] = None,
         filter: Optional[Predicate] = None,
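A minimal sketch of how the two new import helpers might be called; the client setup mirrors the rest of the Client API, while the table names, file path, and dataset are purely illustrative and not taken from this diff:

    import pixeltable as pxt
    import datasets

    cl = pxt.Client()
    # Load a directory of Parquet files into a new table; column types are inferred
    # unless overridden via schema_override.
    parquet_tbl = cl.import_parquet('parquet_demo', parquet_path='/tmp/data/')
    # Load a Huggingface dataset; the split name is stored in a 'split' column by default.
    ds = datasets.load_dataset('rotten_tomatoes')
    hf_tbl = cl.import_huggingface_dataset('hf_demo', ds)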
pixeltable/env.py CHANGED
@@ -1,33 +1,28 @@
 from __future__ import annotations
+
 import datetime
-import os
-from typing import Optional, Dict, Any, List
-from pathlib import Path
-import sqlalchemy as sql
-import uuid
+import glob
+import http.server
 import importlib
 import importlib.util
-
-import http.server
+import logging
+import os
 import socketserver
+import sys
 import threading
 import typing
 import uuid
 from pathlib import Path
-from typing import Optional, Dict, Any, List
+from typing import Callable, Optional, Dict, Any, List
 
+import pgserver
+import sqlalchemy as sql
 import yaml
 from sqlalchemy_utils.functions import database_exists, create_database, drop_database
-import pgserver
-import logging
-import sys
-import glob
 
-from pixeltable import metadata
 import pixeltable.exceptions as excs
+from pixeltable import metadata
 
-if typing.TYPE_CHECKING:
-    import openai
 
 class Env:
     """
@@ -59,12 +54,12 @@ class Env:
         # package name -> version; version == []: package is installed, but we haven't determined the version yet
         self._installed_packages: Dict[str, Optional[List[int]]] = {}
         self._nos_client: Optional[Any] = None
-        self._openai_client: Optional['openai.OpenAI'] = None
-        self._has_together_client: bool = False
         self._spacy_nlp: Optional[Any] = None  # spacy.Language
         self._httpd: Optional[socketserver.TCPServer] = None
         self._http_address: Optional[str] = None
 
+        self._registered_clients: dict[str, Any] = {}
+
         # logging-related state
         self._logger = logging.getLogger('pixeltable')
         self._logger.setLevel(logging.DEBUG)  # allow everything to pass, we filter in _log_filter()
@@ -256,31 +251,32 @@ class Env:
         from pixeltable.functions.util import create_nos_modules
         _ = create_nos_modules()
 
-    def _create_openai_client(self) -> None:
-        if not self.is_installed_package('openai'):
-            raise excs.Error('OpenAI client not initialized (cannot find package `openai`: `pip install openai`?)')
-        import openai
-        if 'openai' in self._config and 'api_key' in self._config['openai']:
-            api_key = self._config['openai']['api_key']
-        else:
-            api_key = os.environ.get('OPENAI_API_KEY')
-        if api_key is None or api_key == '':
-            raise excs.Error('OpenAI client not initialized (no API key configured).')
-        self._openai_client = openai.OpenAI(api_key=api_key)
-        self._logger.info('Initialized OpenAI client.')
+    def get_client(self, name: str, init: Callable, environ: Optional[str] = None) -> Any:
+        """
+        Gets the client with the specified name, using `init` to construct one if necessary.
+
+        - name: The name of the client
+        - init: A `Callable` with signature `fn(api_key: str) -> Any` that constructs a client object
+        - environ: The name of the environment variable to use for the API key, if no API key is found in config
+            (defaults to f'{name.upper()}_API_KEY')
+        """
+        if name in self._registered_clients:
+            return self._registered_clients[name]
+
+        if environ is None:
+            environ = f'{name.upper()}_API_KEY'
 
-    def _create_together_client(self) -> None:
-        if 'together' in self._config and 'api_key' in self._config['together']:
-            api_key = self._config['together']['api_key']
+        if name in self._config and 'api_key' in self._config[name]:
+            api_key = self._config[name]['api_key']
         else:
-            api_key = os.environ.get('TOGETHER_API_KEY')
+            api_key = os.environ.get(environ)
         if api_key is None or api_key == '':
-            self._logger.info('Together client not initialized (no API key configured).')
-            return
-        import together
-        self._logger.info('Initializing Together client.')
-        together.api_key = api_key
-        self._has_together_client = True
+            raise excs.Error(f'`{name}` client not initialized (no API key configured).')
+
+        client = init(api_key)
+        self._registered_clients[name] = client
+        self._logger.info(f'Initialized `{name}` client.')
+        return client
 
     def _start_web_server(self) -> None:
         """
@@ -319,6 +315,7 @@ class Env:
             else:
                 self._installed_packages[package] = None
 
+        check('datasets')
         check('torch')
         check('torchvision')
         check('transformers')
@@ -332,8 +329,6 @@ class Env:
         check('tiktoken')
         check('openai')
         check('together')
-        if self.is_installed_package('together'):
-            self._create_together_client()
         check('fireworks')
         check('nos')
         if self.is_installed_package('nos'):
@@ -399,17 +394,6 @@ class Env:
     def nos_client(self) -> Any:
         return self._nos_client
 
-    @property
-    def openai_client(self) -> 'openai.OpenAI':
-        if self._openai_client is None:
-            self._create_openai_client()
-        assert self._openai_client is not None
-        return self._openai_client
-
-    @property
-    def has_together_client(self) -> bool:
-        return self._has_together_client
-
     @property
     def spacy_nlp(self) -> Any:
         assert self._spacy_nlp is not None
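Downstream modules obtain per-provider API clients through this new registry. A minimal sketch for a hypothetical provider 'acme' (acme_sdk and its Client class are invented purely for illustration, mirroring the pattern the fireworks and openai modules use below):

    from pixeltable import env
    import acme_sdk  # hypothetical third-party SDK

    def acme_client() -> 'acme_sdk.Client':
        # get_client() checks config['acme']['api_key'], then the ACME_API_KEY env var,
        # constructs the client once via the lambda, and caches it for subsequent calls.
        return env.Env.get().get_client('acme', lambda api_key: acme_sdk.Client(api_key=api_key))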
@@ -15,7 +15,7 @@ import pixeltable.functions.pil.image
 from pixeltable import exprs
 from pixeltable.type_system import IntType, ColumnType, FloatType, ImageType, VideoType
 # automatically import all submodules so that the udfs get registered
-from . import image, string, video, openai, together, fireworks, huggingface
+from . import image, string, video, huggingface
 
 # TODO: remove and replace calls with astype()
 def cast(expr: exprs.Expr, target_type: ColumnType) -> exprs.Expr:
@@ -1,61 +1,34 @@
-import logging
-import os
 from typing import Optional
 
+import fireworks.client
+
 import pixeltable as pxt
-import pixeltable.exceptions as excs
 from pixeltable import env
 
 
+def fireworks_client() -> fireworks.client.Fireworks:
+    return env.Env.get().get_client('fireworks', lambda api_key: fireworks.client.Fireworks(api_key=api_key))
+
+
 @pxt.udf
 def chat_completions(
-    prompt: str,
-    model: str,
+    messages: list[dict[str, str]],
     *,
+    model: str,
    max_tokens: Optional[int] = None,
-    repetition_penalty: Optional[float] = None,
     top_k: Optional[int] = None,
     top_p: Optional[float] = None,
     temperature: Optional[float] = None
 ) -> dict:
-    initialize()
     kwargs = {
         'max_tokens': max_tokens,
-        'repetition_penalty': repetition_penalty,
         'top_k': top_k,
         'top_p': top_p,
         'temperature': temperature
     }
     kwargs_not_none = dict(filter(lambda x: x[1] is not None, kwargs.items()))
-    import fireworks.client
-    return fireworks.client.Completion.create(
+    return fireworks_client().chat.completions.create(
         model=model,
-        prompt_or_messages=prompt,
+        messages=messages,
         **kwargs_not_none
     ).dict()
-
-
-def initialize():
-    global _is_fireworks_initialized
-    if _is_fireworks_initialized:
-        return
-
-    _logger.info('Initializing Fireworks client.')
-
-    config = pxt.env.Env.get().config
-
-    if 'fireworks' in config and 'api_key' in config['fireworks']:
-        api_key = config['fireworks']['api_key']
-    else:
-        api_key = os.environ.get('FIREWORKS_API_KEY')
-    if api_key is None or api_key == '':
-        raise excs.Error('Fireworks client not initialized (no API key configured).')
-
-    import fireworks.client
-
-    fireworks.client.api_key = api_key
-    _is_fireworks_initialized = True
-
-
-_logger = logging.getLogger('pixeltable')
-_is_fireworks_initialized = False
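With this refactor the Fireworks UDF takes a chat-style message list instead of a single prompt string. A hedged usage sketch, assuming a table with a string column tbl.prompt and an illustrative Fireworks model id (neither appears in this diff):

    from pixeltable.functions import fireworks

    expr = fireworks.chat_completions(
        messages=[{'role': 'user', 'content': tbl.prompt}],
        model='accounts/fireworks/models/mixtral-8x7b-instruct'
    )
    # `expr` can then be used wherever pixeltable accepts an expression, e.g. a computed column.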
@@ -1,9 +1,14 @@
 import base64
 import io
-from typing import Optional
+import pathlib
+import uuid
+from typing import Optional, TypeVar, Union, Callable
 
 import PIL.Image
 import numpy as np
+import openai
+import tenacity
+from openai._types import NOT_GIVEN, NotGiven
 
 import pixeltable as pxt
 import pixeltable.type_system as ts
@@ -11,43 +16,148 @@ from pixeltable import env
 from pixeltable.func import Batch
 
 
+def openai_client() -> openai.OpenAI:
+    return env.Env.get().get_client('openai', lambda api_key: openai.OpenAI(api_key=api_key))
+
+
+# Exponential backoff decorator using tenacity.
+# TODO(aaron-siegel): Right now this hardwires random exponential backoff with defaults suggested
+# by OpenAI. Should we investigate making this more customizable in the future?
+def _retry(fn: Callable) -> Callable:
+    return tenacity.retry(
+        retry=tenacity.retry_if_exception_type(openai.RateLimitError),
+        wait=tenacity.wait_random_exponential(min=1, max=60),
+        stop=tenacity.stop_after_attempt(6)
+    )(fn)
+
+
+#####################################
+# Audio Endpoints
+
+@pxt.udf(return_type=ts.AudioType())
+@_retry
+def speech(
+    input: str,
+    *,
+    model: str,
+    voice: str,
+    response_format: Optional[str] = None,
+    speed: Optional[float] = None
+) -> str:
+    content = openai_client().audio.speech.create(
+        input=input,
+        model=model,
+        voice=voice,
+        response_format=_opt(response_format),
+        speed=_opt(speed)
+    )
+    ext = response_format or 'mp3'
+    output_filename = str(env.Env.get().tmp_dir / f"{uuid.uuid4()}.{ext}")
+    content.stream_to_file(output_filename, chunk_size=1 << 20)
+    return output_filename
+
+
+@pxt.udf(
+    param_types=[ts.AudioType(), ts.StringType(), ts.StringType(nullable=True),
+                 ts.StringType(nullable=True), ts.FloatType(nullable=True)]
+)
+@_retry
+def transcriptions(
+    audio: str,
+    *,
+    model: str,
+    language: Optional[str] = None,
+    prompt: Optional[str] = None,
+    temperature: Optional[float] = None
+) -> dict:
+    file = pathlib.Path(audio)
+    transcription = openai_client().audio.transcriptions.create(
+        file=file,
+        model=model,
+        language=_opt(language),
+        prompt=_opt(prompt),
+        temperature=_opt(temperature)
+    )
+    return transcription.dict()
+
+
+@pxt.udf(
+    param_types=[ts.AudioType(), ts.StringType(), ts.StringType(nullable=True), ts.FloatType(nullable=True)]
+)
+@_retry
+def translations(
+    audio: str,
+    *,
+    model: str,
+    prompt: Optional[str] = None,
+    temperature: Optional[float] = None
+) -> dict:
+    file = pathlib.Path(audio)
+    translation = openai_client().audio.translations.create(
+        file=file,
+        model=model,
+        prompt=_opt(prompt),
+        temperature=_opt(temperature)
+    )
+    return translation.dict()
+
+
+#####################################
+# Chat Endpoints
+
 @pxt.udf
+@_retry
 def chat_completions(
     messages: list,
+    *,
     model: str,
     frequency_penalty: Optional[float] = None,
-    logit_bias: Optional[dict] = None,
+    logit_bias: Optional[dict[str, int]] = None,
+    logprobs: Optional[bool] = None,
+    top_logprobs: Optional[int] = None,
     max_tokens: Optional[int] = None,
     n: Optional[int] = None,
     presence_penalty: Optional[float] = None,
     response_format: Optional[dict] = None,
     seed: Optional[int] = None,
+    stop: Optional[list[str]] = None,
+    temperature: Optional[float] = None,
     top_p: Optional[float] = None,
-    temperature: Optional[float] = None
+    tools: Optional[list[dict]] = None,
+    tool_choice: Optional[dict] = None,
+    user: Optional[str] = None
 ) -> dict:
-    from openai._types import NOT_GIVEN
-    result = env.Env.get().openai_client.chat.completions.create(
+    result = openai_client().chat.completions.create(
         messages=messages,
         model=model,
-        frequency_penalty=frequency_penalty if frequency_penalty is not None else NOT_GIVEN,
-        logit_bias=logit_bias if logit_bias is not None else NOT_GIVEN,
-        max_tokens=max_tokens if max_tokens is not None else NOT_GIVEN,
-        n=n if n is not None else NOT_GIVEN,
-        presence_penalty=presence_penalty if presence_penalty is not None else NOT_GIVEN,
-        response_format=response_format if response_format is not None else NOT_GIVEN,
-        seed=seed if seed is not None else NOT_GIVEN,
-        top_p=top_p if top_p is not None else NOT_GIVEN,
-        temperature=temperature if temperature is not None else NOT_GIVEN
+        frequency_penalty=_opt(frequency_penalty),
+        logit_bias=_opt(logit_bias),
+        logprobs=_opt(logprobs),
+        top_logprobs=_opt(top_logprobs),
+        max_tokens=_opt(max_tokens),
+        n=_opt(n),
+        presence_penalty=_opt(presence_penalty),
+        response_format=_opt(response_format),
+        seed=_opt(seed),
+        stop=_opt(stop),
+        temperature=_opt(temperature),
+        top_p=_opt(top_p),
+        tools=_opt(tools),
+        tool_choice=_opt(tool_choice),
+        user=_opt(user)
     )
     return result.dict()
 
 
 @pxt.udf
+@_retry
 def vision(
     prompt: str,
     image: PIL.Image.Image,
+    *,
     model: str = 'gpt-4-vision-preview'
 ) -> str:
+    # TODO(aaron-siegel): Decompose CPU/GPU ops into separate functions
     bytes_arr = io.BytesIO()
     image.save(bytes_arr, format='png')
     b64_bytes = base64.b64encode(bytes_arr.getvalue())
@@ -61,28 +171,86 @@ def vision(
         }}
     ]}
     ]
-    result = env.Env.get().openai_client.chat.completions.create(
+    result = openai_client().chat.completions.create(
         messages=messages,
         model=model
     )
     return result.choices[0].message.content
 
 
-@pxt.udf
-def moderations(input: str, model: Optional[str] = None) -> dict:
-    result = env.Env().get().openai_client.moderations.create(input=input, model=model)
-    return result.dict()
-
+#####################################
+# Embeddings Endpoints
 
 @pxt.udf(batch_size=32, return_type=ts.ArrayType((None,), dtype=ts.FloatType()))
-def embeddings(input: Batch[str], *, model: str) -> Batch[np.ndarray]:
-    result = env.Env().get().openai_client.embeddings.create(
+@_retry
+def embeddings(
+    input: Batch[str],
+    *,
+    model: str,
+    user: Optional[str] = None
+) -> Batch[np.ndarray]:
+    result = openai_client().embeddings.create(
         input=input,
         model=model,
+        user=_opt(user),
         encoding_format='float'
     )
-    embeddings = [
+    return [
         np.array(data.embedding, dtype=np.float64)
         for data in result.data
     ]
-    return embeddings
+
+
+#####################################
+# Images Endpoints
+
+@pxt.udf
+@_retry
+def image_generations(
+    prompt: str,
+    *,
+    model: Optional[str] = None,
+    quality: Optional[str] = None,
+    size: Optional[str] = None,
+    style: Optional[str] = None,
+    user: Optional[str] = None
+) -> PIL.Image.Image:
+    # TODO(aaron-siegel): Decompose CPU/GPU ops into separate functions
+    result = openai_client().images.generate(
+        prompt=prompt,
+        model=_opt(model),
+        quality=_opt(quality),
+        size=_opt(size),
+        style=_opt(style),
+        user=_opt(user),
+        response_format="b64_json"
+    )
+    b64_str = result.data[0].b64_json
+    b64_bytes = base64.b64decode(b64_str)
+    img = PIL.Image.open(io.BytesIO(b64_bytes))
+    img.load()
+    return img
+
+
+#####################################
+# Moderations Endpoints
+
+@pxt.udf
+@_retry
+def moderations(
+    input: str,
+    *,
+    model: Optional[str] = None
+) -> dict:
+    result = openai_client().moderations.create(
+        input=input,
+        model=_opt(model)
+    )
+    return result.dict()
+
+
+_T = TypeVar('_T')
+
+
+def _opt(arg: _T) -> Union[_T, NotGiven]:
+    return arg if arg is not None else NOT_GIVEN
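All of the OpenAI UDFs above now share the same plumbing: a lazily constructed client (API key from config or OPENAI_API_KEY), retries on openai.RateLimitError via the _retry decorator, and _opt() to map None to NOT_GIVEN so omitted parameters are not sent to the API. A hedged usage sketch, assuming a table with a string column tbl.text and illustrative model names (not taken from this diff):

    from pixeltable.functions import openai

    # Expressions that could back computed columns or appear in a select:
    emb = openai.embeddings(tbl.text, model='text-embedding-3-small')
    chat = openai.chat_completions(
        messages=[{'role': 'user', 'content': tbl.text}],
        model='gpt-3.5-turbo',
        max_tokens=256
    )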