pixeltable 0.2.30__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60)
  1. pixeltable/__init__.py +1 -1
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/table.py +212 -173
  4. pixeltable/catalog/table_version.py +2 -1
  5. pixeltable/catalog/view.py +3 -5
  6. pixeltable/dataframe.py +52 -39
  7. pixeltable/env.py +94 -5
  8. pixeltable/exec/__init__.py +1 -1
  9. pixeltable/exec/aggregation_node.py +3 -3
  10. pixeltable/exec/cache_prefetch_node.py +13 -7
  11. pixeltable/exec/component_iteration_node.py +3 -9
  12. pixeltable/exec/data_row_batch.py +17 -5
  13. pixeltable/exec/exec_node.py +32 -12
  14. pixeltable/exec/expr_eval/__init__.py +1 -0
  15. pixeltable/exec/expr_eval/evaluators.py +245 -0
  16. pixeltable/exec/expr_eval/expr_eval_node.py +404 -0
  17. pixeltable/exec/expr_eval/globals.py +114 -0
  18. pixeltable/exec/expr_eval/row_buffer.py +76 -0
  19. pixeltable/exec/expr_eval/schedulers.py +232 -0
  20. pixeltable/exec/in_memory_data_node.py +2 -2
  21. pixeltable/exec/row_update_node.py +14 -14
  22. pixeltable/exec/sql_node.py +2 -2
  23. pixeltable/exprs/column_ref.py +5 -1
  24. pixeltable/exprs/data_row.py +50 -40
  25. pixeltable/exprs/expr.py +57 -12
  26. pixeltable/exprs/function_call.py +54 -19
  27. pixeltable/exprs/inline_expr.py +12 -21
  28. pixeltable/exprs/literal.py +25 -8
  29. pixeltable/exprs/row_builder.py +23 -0
  30. pixeltable/exprs/similarity_expr.py +4 -4
  31. pixeltable/func/__init__.py +5 -5
  32. pixeltable/func/aggregate_function.py +4 -0
  33. pixeltable/func/callable_function.py +54 -6
  34. pixeltable/func/expr_template_function.py +5 -1
  35. pixeltable/func/function.py +54 -13
  36. pixeltable/func/query_template_function.py +56 -10
  37. pixeltable/func/tools.py +51 -14
  38. pixeltable/func/udf.py +7 -1
  39. pixeltable/functions/__init__.py +1 -1
  40. pixeltable/functions/anthropic.py +108 -21
  41. pixeltable/functions/gemini.py +2 -6
  42. pixeltable/functions/huggingface.py +10 -28
  43. pixeltable/functions/openai.py +225 -28
  44. pixeltable/globals.py +8 -5
  45. pixeltable/index/embedding_index.py +90 -38
  46. pixeltable/io/label_studio.py +1 -1
  47. pixeltable/metadata/__init__.py +1 -1
  48. pixeltable/metadata/converters/convert_24.py +11 -2
  49. pixeltable/metadata/converters/convert_25.py +19 -0
  50. pixeltable/metadata/notes.py +1 -0
  51. pixeltable/plan.py +24 -9
  52. pixeltable/store.py +6 -0
  53. pixeltable/type_system.py +4 -7
  54. pixeltable/utils/arrow.py +3 -3
  55. {pixeltable-0.2.30.dist-info → pixeltable-0.3.1.dist-info}/METADATA +5 -11
  56. {pixeltable-0.2.30.dist-info → pixeltable-0.3.1.dist-info}/RECORD +59 -53
  57. pixeltable/exec/expr_eval_node.py +0 -232
  58. {pixeltable-0.2.30.dist-info → pixeltable-0.3.1.dist-info}/LICENSE +0 -0
  59. {pixeltable-0.2.30.dist-info → pixeltable-0.3.1.dist-info}/WHEEL +0 -0
  60. {pixeltable-0.2.30.dist-info → pixeltable-0.3.1.dist-info}/entry_points.txt +0 -0
pixeltable/functions/huggingface.py CHANGED
@@ -144,9 +144,9 @@ def cross_encoder_list(sentence1: str, sentences2: list, *, model_id: str) -> li
 
 
 @pxt.udf(batch_size=32)
-def clip_text(text: Batch[str], *, model_id: str) -> Batch[pxt.Array[(None,), pxt.Float]]:
+def clip(text: Batch[str], *, model_id: str) -> Batch[pxt.Array[(None,), pxt.Float]]:
     """
-    Computes a CLIP embedding for the specified text. `model_id` should be a reference to a pretrained
+    Computes a CLIP embedding for the specified text or image. `model_id` should be a reference to a pretrained
     [CLIP Model](https://huggingface.co/docs/transformers/model_doc/clip).
 
     __Requirements:__
@@ -164,7 +164,11 @@ def clip_text(text: Batch[str], *, model_id: str) -> Batch[pxt.Array[(None,), px
         Add a computed column that applies the model `openai/clip-vit-base-patch32` to an existing
         Pixeltable column `tbl.text` of the table `tbl`:
 
-        >>> tbl['result'] = clip_text(tbl.text, model_id='openai/clip-vit-base-patch32')
+        >>> tbl.add_computed_column(
+        ...     result=clip(tbl.text, model_id='openai/clip-vit-base-patch32')
+        ... )
+
+        The same would work with an image column `tbl.image` in place of `tbl.text`.
     """
     env.Env.get().require_package('transformers')
     device = resolve_torch_device('auto')
@@ -181,29 +185,8 @@ def clip_text(text: Batch[str], *, model_id: str) -> Batch[pxt.Array[(None,), px
     return [embeddings[i] for i in range(embeddings.shape[0])]
 
 
-@pxt.udf(batch_size=32)
-def clip_image(image: Batch[PIL.Image.Image], *, model_id: str) -> Batch[pxt.Array[(None,), pxt.Float]]:
-    """
-    Computes a CLIP embedding for the specified image. `model_id` should be a reference to a pretrained
-    [CLIP Model](https://huggingface.co/docs/transformers/model_doc/clip).
-
-    __Requirements:__
-
-    - `pip install torch transformers`
-
-    Args:
-        image: The image to embed.
-        model_id: The pretrained model to use for the embedding.
-
-    Returns:
-        An array containing the output of the embedding model.
-
-    Examples:
-        Add a computed column that applies the model `openai/clip-vit-base-patch32` to an existing
-        Pixeltable column `image` of the table `tbl`:
-
-        >>> tbl['result'] = clip_image(tbl.image, model_id='openai/clip-vit-base-patch32')
-    """
+@clip.overload
+def _(image: Batch[PIL.Image.Image], *, model_id: str) -> Batch[pxt.Array[(None,), pxt.Float]]:
     env.Env.get().require_package('transformers')
     device = resolve_torch_device('auto')
     import torch
@@ -219,8 +202,7 @@ def clip_image(image: Batch[PIL.Image.Image], *, model_id: str) -> Batch[pxt.Arr
     return [embeddings[i] for i in range(embeddings.shape[0])]
 
 
-@clip_text.conditional_return_type
-@clip_image.conditional_return_type
+@clip.conditional_return_type
 def _(model_id: str) -> pxt.ArrayType:
     try:
         from transformers import CLIPModel
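
The three hunks above fold `clip_text` and `clip_image` into a single polymorphic `clip` UDF: the image variant is re-registered via `@clip.overload`, and the two `conditional_return_type` hooks collapse into one. A minimal usage sketch (assuming an existing table `tbl` with a string column `text` and an image column `img`; `add_computed_column` is the call shown in the revised docstring):

import pixeltable as pxt
from pixeltable.functions.huggingface import clip

tbl = pxt.get_table('demo')  # hypothetical table with `text` and `img` columns
# one UDF now serves both modalities; overload resolution picks the matching signature
tbl.add_computed_column(text_emb=clip(tbl.text, model_id='openai/clip-vit-base-patch32'))
tbl.add_computed_column(img_emb=clip(tbl.img, model_id='openai/clip-vit-base-patch32'))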
pixeltable/functions/openai.py CHANGED
@@ -6,14 +6,18 @@ the [Working with OpenAI](https://pixeltable.readme.io/docs/working-with-openai)
 """
 
 import base64
+import datetime
 import io
 import json
+import logging
 import pathlib
+import re
 import uuid
-from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union
+from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union, cast, Any, Type
 
-import numpy as np
 import PIL.Image
+import httpx
+import numpy as np
 import tenacity
 
 import pixeltable as pxt
@@ -24,15 +28,28 @@ from pixeltable.utils.code import local_public_names
 if TYPE_CHECKING:
     import openai
 
+_logger = logging.getLogger('pixeltable')
+
 
 @env.register_client('openai')
-def _(api_key: str) -> 'openai.OpenAI':
+def _(api_key: str) -> tuple['openai.OpenAI', 'openai.AsyncOpenAI']:
     import openai
-    return openai.OpenAI(api_key=api_key)
+    return (
+        openai.OpenAI(api_key=api_key),
+        openai.AsyncOpenAI(
+            api_key=api_key,
+            # recommended to increase limits for async client to avoid connection errors
+            http_client=httpx.AsyncClient(limits=httpx.Limits(max_keepalive_connections=100, max_connections=500)),
+        )
+    )
 
 
 def _openai_client() -> 'openai.OpenAI':
-    return env.Env.get().get_client('openai')
+    return env.Env.get().get_client('openai')[0]
+
+
+def _async_openai_client() -> 'openai.AsyncOpenAI':
+    return env.Env.get().get_client('openai')[1]
 
 
 # Exponential backoff decorator using tenacity.
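
The registered OpenAI client is now a (sync, async) pair rather than a single client, and the async client's connection pool is widened well beyond httpx's defaults (100 connections, 20 keep-alive). A standalone sketch of the same construction, with a placeholder key:

import httpx
import openai

# mirrors the registration above; 'sk-...' is a placeholder, not a real key
async_client = openai.AsyncOpenAI(
    api_key='sk-...',
    http_client=httpx.AsyncClient(
        # raised limits so many concurrent async UDF calls don't exhaust the pool
        limits=httpx.Limits(max_keepalive_connections=100, max_connections=500)
    ),
)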
@@ -47,13 +64,128 @@ def _retry(fn: Callable) -> Callable:
     )(fn)
 
 
+# models that share rate limits; see https://platform.openai.com/settings/organization/limits for details
+_shared_rate_limits = {
+    'gpt-4-turbo': [
+        'gpt-4-turbo',
+        'gpt-4-turbo-latest',
+        'gpt-4-turbo-2024-04-09',
+        'gpt-4-turbo-preview',
+        'gpt-4-0125-preview',
+        'gpt-4-1106-preview'
+    ],
+    'gpt-4o': [
+        'gpt-4o',
+        'gpt-4o-latest',
+        'gpt-4o-2024-05-13',
+        'gpt-4o-2024-08-06',
+        'gpt-4o-2024-11-20',
+        'gpt-4o-audio-preview',
+        'gpt-4o-audio-preview-2024-10-01',
+        'gpt-4o-audio-preview-2024-12-17'
+    ],
+    'gpt-4o-mini': [
+        'gpt-4o-mini',
+        'gpt-4o-mini-latest',
+        'gpt-4o-mini-2024-07-18',
+        'gpt-4o-mini-audio-preview',
+        'gpt-4o-mini-audio-preview-2024-12-17'
+    ],
+    'gpt-4o-mini-realtime-preview': [
+        'gpt-4o-mini-realtime-preview',
+        'gpt-4o-mini-realtime-preview-latest',
+        'gpt-4o-mini-realtime-preview-2024-12-17'
+    ]
+}
+
+
+def _resource_pool(model: str) -> str:
+    for model_family, models in _shared_rate_limits.items():
+        if model in models:
+            return f'rate-limits:openai:{model_family}'
+    return f'rate-limits:openai:{model}'
+
+
+class OpenAIRateLimitsInfo(env.RateLimitsInfo):
+    retryable_errors: tuple[Type[Exception], ...]
+
+    def __init__(self, get_request_resources: Callable[..., dict[str, int]]):
+        super().__init__(get_request_resources)
+        import openai
+        self.retryable_errors = (
+            openai.RateLimitError, openai.APITimeoutError, openai.UnprocessableEntityError, openai.InternalServerError
+        )
+
+    def get_retry_delay(self, exc: Exception) -> Optional[float]:
+        import openai
+
+        if not isinstance(exc, self.retryable_errors):
+            return None
+        assert isinstance(exc, openai.APIError)
+        return 1.0
+
+
+# RE pattern for duration in '*-reset' headers;
+# examples: 1d2h3ms, 4m5.6s; fractional seconds can be reported as 0.5s or 500ms
+_header_duration_pattern = re.compile(r'(?:(\d+)d)?(?:(\d+)h)?(?:(\d+)ms)|(?:(\d+)m)?(?:([\d.]+)s)?')
+
+
+def _parse_header_duration(duration_str):
+    match = _header_duration_pattern.match(duration_str)
+    if not match:
+        raise ValueError("Invalid duration format")
+
+    days = int(match.group(1) or 0)
+    hours = int(match.group(2) or 0)
+    milliseconds = int(match.group(3) or 0)
+    minutes = int(match.group(4) or 0)
+    seconds = float(match.group(5) or 0)
+
+    return datetime.timedelta(
+        days=days,
+        hours=hours,
+        minutes=minutes,
+        seconds=seconds,
+        milliseconds=milliseconds
+    )
+
+
+def _get_header_info(
+    headers: httpx.Headers, *, requests: bool = True, tokens: bool = True
+) -> tuple[Optional[tuple[int, int, datetime.datetime]], Optional[tuple[int, int, datetime.datetime]]]:
+    assert requests or tokens
+    now = datetime.datetime.now(tz=datetime.timezone.utc)
+
+    requests_info: Optional[tuple[int, int, datetime.datetime]] = None
+    if requests:
+        requests_limit_str = headers.get('x-ratelimit-limit-requests')
+        requests_limit = int(requests_limit_str) if requests_limit_str is not None else None
+        requests_remaining_str = headers.get('x-ratelimit-remaining-requests')
+        requests_remaining = int(requests_remaining_str) if requests_remaining_str is not None else None
+        requests_reset_str = headers.get('x-ratelimit-reset-requests')
+        requests_reset_ts = now + _parse_header_duration(requests_reset_str)
+        requests_info = (requests_limit, requests_remaining, requests_reset_ts)
+
+    tokens_info: Optional[tuple[int, int, datetime.datetime]] = None
+    if tokens:
+        tokens_limit_str = headers.get('x-ratelimit-limit-tokens')
+        tokens_limit = int(tokens_limit_str) if tokens_limit_str is not None else None
+        tokens_remaining_str = headers.get('x-ratelimit-remaining-tokens')
+        tokens_remaining = int(tokens_remaining_str) if tokens_remaining_str is not None else None
+        tokens_reset_str = headers.get('x-ratelimit-reset-tokens')
+        tokens_reset_ts = now + _parse_header_duration(tokens_reset_str)
+        tokens_info = (tokens_limit, tokens_remaining, tokens_reset_ts)
+
+    return requests_info, tokens_info
+
+
 #####################################
 # Audio Endpoints
 
 @pxt.udf
 def speech(
-    input: str, *, model: str, voice: str, response_format: Optional[str] = None, speed: Optional[float] = None
+    input: str, *, model: str, voice: str, response_format: Optional[str] = None, speed: Optional[float] = None
 ) -> pxt.Audio:
     """
     Generates audio from the input text.
@@ -176,8 +308,24 @@ def translations(
 # Chat Endpoints
 
 
+def _chat_completions_get_request_resources(
+    messages: list, max_tokens: Optional[int], n: Optional[int]
+) -> dict[str, int]:
+    completion_tokens = n * max_tokens
+
+    num_tokens = 0.0
+    for message in messages:
+        num_tokens += 4  # every message follows <im_start>{role/name}\n{content}<im_end>\n
+        for key, value in message.items():
+            num_tokens += len(value) / 4
+            if key == "name":  # if there's a name, the role is omitted
+                num_tokens -= 1  # role is always required and always 1 token
+    num_tokens += 2  # every reply is primed with <im_start>assistant
+    return {'requests': 1, 'tokens': int(num_tokens) + completion_tokens}
+
+
 @pxt.udf
-def chat_completions(
+async def chat_completions(
     messages: list,
     *,
     model: str,
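
`_chat_completions_get_request_resources` implements the familiar ~4-characters-per-token approximation plus OpenAI's per-message overhead, and budgets `n * max_tokens` for the completion; the new non-None defaults for `max_tokens` and `n` (next hunk) keep that product well-defined. A worked example with a hypothetical message list:

messages = [{'role': 'user', 'content': 'What is the capital of France?'}]
# 4 (message overhead) + 4/4 ('user') + 30/4 (content) + 2 (reply primer) = 14.5
# completion budget: n * max_tokens = 1 * 1024
_chat_completions_get_request_resources(messages, max_tokens=1024, n=1)
# -> {'requests': 1, 'tokens': 1038}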
@@ -185,8 +333,8 @@ def chat_completions(
     logit_bias: Optional[dict[str, int]] = None,
     logprobs: Optional[bool] = None,
     top_logprobs: Optional[int] = None,
-    max_tokens: Optional[int] = None,
-    n: Optional[int] = None,
+    max_tokens: Optional[int] = 1024,
+    n: Optional[int] = 1,
     presence_penalty: Optional[float] = None,
     response_format: Optional[dict] = None,
     seed: Optional[int] = None,
@@ -226,7 +374,6 @@ def chat_completions(
         ]
         tbl['response'] = chat_completions(messages, model='gpt-4o-mini')
     """
-
     if tools is not None:
         tools = [
             {
@@ -236,7 +383,25 @@ def chat_completions(
             for tool in tools
         ]
 
-    result = _retry(_openai_client().chat.completions.create)(
+    tool_choice_: Union[str, dict, None] = None
+    if tool_choice is not None:
+        if tool_choice['auto']:
+            tool_choice_ = 'auto'
+        elif tool_choice['required']:
+            tool_choice_ = 'required'
+        else:
+            assert tool_choice['tool'] is not None
+            tool_choice_ = {
+                'type': 'function',
+                'function': {'name': tool_choice['tool']}
+            }
+
+    extra_body: Optional[dict[str, Any]] = None
+    if tool_choice is not None and not tool_choice['parallel_tool_calls']:
+        extra_body = {'parallel_tool_calls': False}
+
+    # cast(Any, ...): avoid mypy errors
+    result = await _async_openai_client().chat.completions.with_raw_response.create(
         messages=messages,
         model=model,
         frequency_penalty=_opt(frequency_penalty),
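
The Pixeltable-level `tool_choice` dict is translated into OpenAI's wire format, with `parallel_tool_calls` traveling via `extra_body`. How the branches above map two hypothetical inputs:

# force one specific tool:
# {'auto': False, 'required': False, 'tool': 'stock_price', 'parallel_tool_calls': True}
#   -> tool_choice_ = {'type': 'function', 'function': {'name': 'stock_price'}}
# let the model decide, but serially:
# {'auto': True, 'required': False, 'tool': None, 'parallel_tool_calls': False}
#   -> tool_choice_ = 'auto'; extra_body = {'parallel_tool_calls': False}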
@@ -246,16 +411,25 @@ def chat_completions(
         max_tokens=_opt(max_tokens),
         n=_opt(n),
         presence_penalty=_opt(presence_penalty),
-        response_format=_opt(response_format),
+        response_format=_opt(cast(Any, response_format)),
         seed=_opt(seed),
         stop=_opt(stop),
         temperature=_opt(temperature),
         top_p=_opt(top_p),
-        tools=_opt(tools),
-        tool_choice=_opt(tool_choice),
+        tools=_opt(cast(Any, tools)),
+        tool_choice=_opt(cast(Any, tool_choice_)),
         user=_opt(user),
+        timeout=10,
+        extra_body=extra_body,
     )
-    return result.dict()
+
+    resource_pool = _resource_pool(model)
+    requests_info, tokens_info = _get_header_info(result.headers)
+    rate_limits_info = env.Env.get().get_resource_pool_info(resource_pool, lambda: OpenAIRateLimitsInfo(
+        _chat_completions_get_request_resources))
+    rate_limits_info.record(requests=requests_info, tokens=tokens_info)
+
+    return json.loads(result.text)
 
 
 @pxt.udf
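
The raw response's `x-ratelimit-*` headers recorded above are decoded by `_get_header_info` and `_parse_header_duration` from the earlier hunk. A few reset-duration inputs in the formats the regex targets:

_parse_header_duration('1d2h3ms')  # timedelta(days=1, hours=2, milliseconds=3)
_parse_header_duration('4m5.6s')   # timedelta(minutes=4, seconds=5.6)
_parse_header_duration('500ms')    # timedelta(milliseconds=500)
_parse_header_duration('0.5s')     # timedelta(seconds=0.5)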
@@ -312,8 +486,13 @@ _embedding_dimensions_cache: dict[str, int] = {
 }
 
 
+def _embeddings_get_request_resources(input: list[str]) -> dict[str, int]:
+    input_len = sum(len(s) for s in input)
+    return {'requests': 1, 'tokens': int(input_len / 4)}
+
+
 @pxt.udf(batch_size=32)
-def embeddings(
+async def embeddings(
     input: Batch[str], *, model: str, dimensions: Optional[int] = None, user: Optional[str] = None
 ) -> Batch[pxt.Array[(None,), pxt.Float]]:
     """
@@ -343,10 +522,16 @@ def embeddings(
 
         >>> tbl['embed'] = embeddings(tbl.text, model='text-embedding-3-small')
     """
-    result = _retry(_openai_client().embeddings.create)(
+    _logger.debug(f'embeddings: batch_size={len(input)}')
+    result = await _async_openai_client().embeddings.with_raw_response.create(
         input=input, model=model, dimensions=_opt(dimensions), user=_opt(user), encoding_format='float'
     )
-    return [np.array(data.embedding, dtype=np.float64) for data in result.data]
+    resource_pool = _resource_pool(model)
+    requests_info, tokens_info = _get_header_info(result.headers)
+    rate_limits_info = env.Env.get().get_resource_pool_info(
+        resource_pool, lambda: OpenAIRateLimitsInfo(_embeddings_get_request_resources))
+    rate_limits_info.record(requests=requests_info, tokens=tokens_info)
+    return [np.array(data['embedding'], dtype=np.float64) for data in json.loads(result.content)['data']]
 
 
 @embeddings.conditional_return_type
@@ -367,7 +552,7 @@ def _(model: str, dimensions: Optional[int] = None) -> pxt.ArrayType:
 def image_generations(
     prompt: str,
     *,
-    model: Optional[str] = None,
+    model: str = 'dall-e-2',
     quality: Optional[str] = None,
     size: Optional[str] = None,
     style: Optional[str] = None,
@@ -423,7 +608,7 @@ def _(size: Optional[str] = None) -> pxt.ImageType:
     if x_pos == -1:
         return pxt.ImageType()
     try:
-        width, height = int(size[:x_pos]), int(size[x_pos + 1 :])
+        width, height = int(size[:x_pos]), int(size[x_pos + 1:])
     except ValueError:
         return pxt.ImageType()
     return pxt.ImageType(size=(width, height))
@@ -434,7 +619,7 @@ def _(size: Optional[str] = None) -> pxt.ImageType:
 
 
 @pxt.udf
-def moderations(input: str, *, model: Optional[str] = None) -> dict:
+def moderations(input: str, *, model: str = 'omni-moderation-latest') -> dict:
     """
     Classifies if text is potentially harmful.
 
@@ -464,6 +649,18 @@ def moderations(input: str, *, model: Optional[str] = None) -> dict:
     return result.dict()
 
 
+# @speech.resource_pool
+# @transcriptions.resource_pool
+# @translations.resource_pool
+@chat_completions.resource_pool
+# @vision.resource_pool
+@embeddings.resource_pool
+# @image_generations.resource_pool
+# @moderations.resource_pool
+def _(model: str) -> str:
+    return _resource_pool(model)
+
+
 def invoke_tools(tools: Tools, response: exprs.Expr) -> exprs.InlineDict:
     """Converts an OpenAI response dict to Pixeltable tool invocation format and calls `tools._invoke()`."""
     return tools._invoke(_openai_response_to_pxt_tool_calls(response))
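
`_openai_response_to_pxt_tool_calls`, restructured in the next hunk, reduces an OpenAI response to the `{tool_name: {'args': ...}}` shape that `tools._invoke()` expects. A worked example on a hypothetical response excerpt:

response = {'choices': [{'message': {'tool_calls': [
    {'function': {'name': 'get_weather', 'arguments': '{"city": "Paris"}'}},
]}}]}
# conversion result: {'get_weather': {'args': {'city': 'Paris'}}}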
@@ -471,15 +668,15 @@ def invoke_tools(tools: Tools, response: exprs.Expr) -> exprs.InlineDict:
 @pxt.udf
 def _openai_response_to_pxt_tool_calls(response: dict) -> Optional[dict]:
+    if 'tool_calls' not in response['choices'][0]['message'] or response['choices'][0]['message']['tool_calls'] is None:
+        return None
     openai_tool_calls = response['choices'][0]['message']['tool_calls']
-    if openai_tool_calls is not None:
-        return {
-            tool_call['function']['name']: {
-                'args': json.loads(tool_call['function']['arguments'])
-            }
-            for tool_call in openai_tool_calls
+    return {
+        tool_call['function']['name']: {
+            'args': json.loads(tool_call['function']['arguments'])
         }
-    return None
+        for tool_call in openai_tool_calls
+    }
 
 
 _T = TypeVar('_T')
pixeltable/globals.py CHANGED
@@ -1,6 +1,6 @@
 import dataclasses
 import logging
-from typing import Any, Iterable, Optional, Union, Literal, Type
+from typing import Any, Iterable, Literal, Optional, Union
 from uuid import UUID
 
 import pandas as pd
@@ -26,7 +26,7 @@ def init() -> None:
 def _get_or_drop_existing_path(
     path_str: str,
-    expected_obj_type: Type[catalog.SchemaObject],
+    expected_obj_type: type[catalog.SchemaObject],
     expected_snapshot: bool,
     if_exists: catalog.IfExistsParam
 ) -> Optional[catalog.SchemaObject]:
  ) -> Optional[catalog.SchemaObject]:
@@ -289,6 +289,11 @@ def create_view(
 
     if additional_columns is None:
         additional_columns = {}
+    else:
+        # additional columns should not be in the base table
+        for col_name in additional_columns.keys():
+            if col_name in [c.name for c in tbl_version_path.columns()]:
+                raise excs.Error(f"Column {col_name!r} already exists in the base table {tbl_version_path.get_column(col_name).tbl.name}.")
     if iterator is None:
         iterator_class, iterator_args = None, None
     else:
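
A hypothetical illustration of the new check, using the error format from the added code:

t = pxt.create_table('docs', {'text': pxt.String})
pxt.create_view('docs_view', t, additional_columns={'text': pxt.String})
# raises: Column 'text' already exists in the base table docs.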
@@ -787,8 +792,6 @@ def tool(fn: func.Function, name: Optional[str] = None, description: Optional[st
     Returns:
         A `Tool` instance that can be passed to an LLM tool-calling API.
     """
-    if fn.self_path is None:
-        raise excs.Error('Only module UDFs can be used as tools (not locally defined UDFs)')
     if isinstance(fn, func.AggregateFunction):
         raise excs.Error('Aggregator UDFs cannot be used as tools')
 
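Dropping the `self_path` check means locally defined UDFs are now accepted by `tool()`. A minimal sketch with a hypothetical local UDF:

@pxt.udf
def get_weather(city: str) -> str:
    """Get the current weather for the given city."""
    return f'sunny in {city}'  # stub

weather_tool = pxt.tool(get_weather)  # previously: Error, only module UDFs allowed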
@@ -814,4 +817,4 @@
 
 
 def array(elements: Iterable) -> exprs.Expr:
-    return exprs.InlineArray(elements)
+    return exprs.Expr.from_array(elements)
pixeltable/index/embedding_index.py CHANGED
@@ -46,32 +46,79 @@ class EmbeddingIndex(IndexBase):
     index_col_type: pgvector.sqlalchemy.Vector
 
     def __init__(
-        self, c: catalog.Column, metric: str, string_embed: Optional[func.Function] = None,
-        image_embed: Optional[func.Function] = None):
+        self,
+        c: catalog.Column,
+        metric: str,
+        embed: Optional[func.Function] = None,
+        string_embed: Optional[func.Function] = None,
+        image_embed: Optional[func.Function] = None,
+    ):
+        if embed is None and string_embed is None and image_embed is None:
+            raise excs.Error('At least one of `embed`, `string_embed`, or `image_embed` must be specified')
         metric_names = [m.name.lower() for m in self.Metric]
         if metric.lower() not in metric_names:
            raise excs.Error(f'Invalid metric {metric}, must be one of {metric_names}')
         if not c.col_type.is_string_type() and not c.col_type.is_image_type():
             raise excs.Error(f'Embedding index requires string or image column')
-        if c.col_type.is_string_type() and string_embed is None:
-            raise excs.Error(f"Text embedding function is required for column {c.name} (parameter 'string_embed')")
-        if c.col_type.is_image_type() and image_embed is None:
-            raise excs.Error(f"Image embedding function is required for column {c.name} (parameter 'image_embed')")
-
-        if string_embed is None:
-            self.string_embed = None
-        else:
-            # verify signature and convert to a monomorphic function
-            self.string_embed = self._validate_embedding_fn(string_embed, 'string_embed', ts.ColumnType.Type.STRING)
 
-        if image_embed is None:
-            self.image_embed = None
-        else:
-            # verify signature and convert to a monomorphic function
-            self.image_embed = self._validate_embedding_fn(image_embed, 'image_embed', ts.ColumnType.Type.IMAGE)
+        self.string_embed = None
+        self.image_embed = None
+
+        # Resolve the specific embedding functions corresponding to the user-provided `string_embed`, `image_embed`,
+        # and/or `embed`. For string embeddings, `string_embed` will be used if specified; otherwise, `embed` will
+        # be used as a fallback, if it has a matching signature. Likewise for image embeddings.
+
+        if string_embed is not None:
+            # `string_embed` is specified; it MUST be valid.
+            self.string_embed = self._resolve_embedding_fn(string_embed, ts.ColumnType.Type.STRING)
+            if self.string_embed is None:
+                raise excs.Error(
+                    f'The function `{string_embed.name}` is not a valid string embedding: '
+                    'it must take a single string parameter'
+                )
+        elif embed is not None:
+            # `embed` is specified; see if it has a string signature.
+            self.string_embed = self._resolve_embedding_fn(embed, ts.ColumnType.Type.STRING)
+
+        if image_embed is not None:
+            # `image_embed` is specified; it MUST be valid.
+            self.image_embed = self._resolve_embedding_fn(image_embed, ts.ColumnType.Type.IMAGE)
+            if self.image_embed is None:
+                raise excs.Error(
+                    f'The function `{image_embed.name}` is not a valid image embedding: '
+                    'it must take a single image parameter'
+                )
+        elif embed is not None:
+            # `embed` is specified; see if it has an image signature.
+            self.image_embed = self._resolve_embedding_fn(embed, ts.ColumnType.Type.IMAGE)
+
+        if self.string_embed is None and self.image_embed is None:
+            # No string OR image signature was found. This can only happen if `embed` was specified and
+            # contains no matching signatures.
+            assert embed is not None
+            raise excs.Error(
+                f'The function `{embed.name}` is not a valid embedding: '
+                'it must take a single string or image parameter'
+            )
+
+        # Now validate the return types of the embedding functions.
+
+        if self.string_embed is not None:
+            self._validate_embedding_fn(self.string_embed, ts.ColumnType.Type.STRING)
+
+        if self.image_embed is not None:
+            self._validate_embedding_fn(self.image_embed, ts.ColumnType.Type.IMAGE)
+
+        if c.col_type.is_string_type() and self.string_embed is None:
+            raise excs.Error(f"Text embedding function is required for column {c.name} (parameter 'string_embed')")
+        if c.col_type.is_image_type() and self.image_embed is None:
+            raise excs.Error(f"Image embedding function is required for column {c.name} (parameter 'image_embed')")
 
         self.metric = self.Metric[metric.upper()]
-        self.value_expr = string_embed(exprs.ColumnRef(c)) if c.col_type.is_string_type() else image_embed(exprs.ColumnRef(c))
+        self.value_expr = (
+            self.string_embed(exprs.ColumnRef(c)) if c.col_type.is_string_type()
+            else self.image_embed(exprs.ColumnRef(c))
+        )
         assert isinstance(self.value_expr.col_type, ts.ArrayType)
         vector_size = self.value_expr.col_type.shape[0]
         assert vector_size is not None
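
With the new `embed` parameter, one polymorphic function (such as the unified `clip` above) can back both modalities, and explicit `string_embed`/`image_embed` still take precedence when given. Assuming the public `add_embedding_index` API forwards these options via its `embedding` parameter (spelling per current Pixeltable docs, not shown in this diff):

from pixeltable.functions.huggingface import clip

# one function with string and image overloads covers either column type
tbl.add_embedding_index('img', embedding=clip.using(model_id='openai/clip-vit-base-patch32'))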
@@ -144,42 +191,47 @@ class EmbeddingIndex(IndexBase):
         return 'embedding'
 
     @classmethod
-    def _validate_embedding_fn(cls, embed_fn: func.Function, name: str, expected_type: ts.ColumnType.Type) -> func.Function:
-        """Validate that the Function has a matching signature, and return the corresponding monomorphic function."""
+    def _resolve_embedding_fn(cls, embed_fn: func.Function, expected_type: ts.ColumnType.Type) -> Optional[func.Function]:
+        """Find an overload resolution for `embed_fn` that matches the given type."""
         assert isinstance(embed_fn, func.Function)
-
-        signature_idx: int = -1
-        for idx, sig in enumerate(embed_fn.signatures):
+        for resolved_fn in embed_fn._resolved_fns:
            # The embedding function must be a 1-ary function of the correct type. But it's ok if the function signature
            # has more than one parameter, as long as it has at most one *required* parameter.
+            sig = resolved_fn.signature
            if (len(sig.parameters) >= 1
                    and len(sig.required_parameters) <= 1
                    and sig.parameters_by_pos[0].col_type.type_enum == expected_type):
-                signature_idx = idx
-                break
-
-        if signature_idx == -1:
-            raise excs.Error(f'{name} must take a single {expected_type.name.lower()} parameter')
+                return resolved_fn
+        return None
 
-        resolved_fn = embed_fn._resolved_fns[signature_idx]
+    @classmethod
+    def _validate_embedding_fn(cls, embed_fn: func.Function, expected_type: ts.ColumnType.Type) -> None:
+        """Validate the given embedding function."""
+        assert not embed_fn.is_polymorphic
+        sig = embed_fn.signature
 
         # validate return type
         param_name = sig.parameters_by_pos[0].name
         if expected_type == ts.ColumnType.Type.STRING:
-            return_type = resolved_fn.call_return_type([], {param_name: 'dummy'})
+            return_type = embed_fn.call_return_type([], {param_name: 'dummy'})
         else:
            assert expected_type == ts.ColumnType.Type.IMAGE
            img = PIL.Image.new('RGB', (512, 512))
-            return_type = resolved_fn.call_return_type([], {param_name: img})
+            return_type = embed_fn.call_return_type([], {param_name: img})
+
         assert return_type is not None
         if not isinstance(return_type, ts.ArrayType):
-            raise excs.Error(f'{name} must return an array, but returns {return_type}')
-        else:
-            shape = return_type.shape
-            if len(shape) != 1 or shape[0] == None:
-                raise excs.Error(f'{name} must return a 1D array of a specific length, but returns {return_type}')
-
-        return resolved_fn
+            raise excs.Error(
+                f'The function `{embed_fn.name}` is not a valid embedding: '
+                f'it must return an array, but returns {return_type}'
+            )
+
+        shape = return_type.shape
+        if len(shape) != 1 or shape[0] == None:
+            raise excs.Error(
+                f'The function `{embed_fn.name}` is not a valid embedding: '
+                f'it must return a 1-dimensional array of a specific length, but returns {return_type}'
+            )
 
     def as_dict(self) -> dict:
         return {
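
A note on why validation calls `call_return_type` with a dummy argument instead of reading the declared signature: an embedding UDF like `clip` declares `pxt.Array[(None,), pxt.Float]`, and only its `conditional_return_type` hook narrows the shape once `model_id` is bound. A hypothetical trace:

# embed_fn: `clip` resolved with model_id='openai/clip-vit-base-patch32'
return_type = embed_fn.call_return_type([], {'text': 'dummy'})
# the conditional_return_type hook consults the CLIP config, yielding
# Array[(512,), Float] instead of the declared Array[(None,), Float],
# so the 1-D fixed-length check above passes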
pixeltable/io/label_studio.py CHANGED
@@ -574,7 +574,7 @@ class LabelStudioProject(Project):
         else:
             local_annotations_column = next(k for k, v in col_mapping.items() if v == ANNOTATIONS_COLUMN)
             if local_annotations_column not in t._schema.keys():
-                t[local_annotations_column] = pxt.JsonType(nullable=True)
+                t.add_columns({local_annotations_column: pxt.JsonType(nullable=True)})
 
         resolved_col_mapping = cls.validate_columns(
             t, config.export_columns, {ANNOTATIONS_COLUMN: pxt.JsonType(nullable=True)}, col_mapping)
pixeltable/metadata/__init__.py CHANGED
@@ -10,7 +10,7 @@ import sqlalchemy.orm as orm
 from .schema import SystemInfo, SystemInfoMd
 
 # current version of the metadata; this is incremented whenever the metadata schema changes
-VERSION = 25
+VERSION = 26
 
 
 def create_system_info(engine: sql.engine.Engine) -> None: