pixeltable 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff reflects the changes between publicly available package versions as they appear in their public registries; it is provided for informational purposes only.

Potentially problematic release: this version of pixeltable might be problematic.

Files changed (56)
  1. pixeltable/__version__.py +2 -2
  2. pixeltable/catalog/insertable_table.py +3 -3
  3. pixeltable/catalog/table.py +2 -2
  4. pixeltable/catalog/table_version.py +3 -2
  5. pixeltable/catalog/view.py +1 -1
  6. pixeltable/dataframe.py +52 -27
  7. pixeltable/env.py +109 -4
  8. pixeltable/exec/__init__.py +1 -1
  9. pixeltable/exec/aggregation_node.py +3 -3
  10. pixeltable/exec/cache_prefetch_node.py +13 -7
  11. pixeltable/exec/component_iteration_node.py +3 -9
  12. pixeltable/exec/data_row_batch.py +17 -5
  13. pixeltable/exec/exec_node.py +32 -12
  14. pixeltable/exec/expr_eval/__init__.py +1 -0
  15. pixeltable/exec/expr_eval/evaluators.py +240 -0
  16. pixeltable/exec/expr_eval/expr_eval_node.py +408 -0
  17. pixeltable/exec/expr_eval/globals.py +113 -0
  18. pixeltable/exec/expr_eval/row_buffer.py +76 -0
  19. pixeltable/exec/expr_eval/schedulers.py +240 -0
  20. pixeltable/exec/in_memory_data_node.py +2 -2
  21. pixeltable/exec/row_update_node.py +14 -14
  22. pixeltable/exec/sql_node.py +2 -2
  23. pixeltable/exprs/column_ref.py +5 -1
  24. pixeltable/exprs/data_row.py +50 -40
  25. pixeltable/exprs/expr.py +57 -12
  26. pixeltable/exprs/function_call.py +54 -19
  27. pixeltable/exprs/inline_expr.py +12 -21
  28. pixeltable/exprs/literal.py +25 -8
  29. pixeltable/exprs/row_builder.py +25 -2
  30. pixeltable/func/aggregate_function.py +4 -0
  31. pixeltable/func/callable_function.py +54 -4
  32. pixeltable/func/expr_template_function.py +5 -1
  33. pixeltable/func/function.py +48 -7
  34. pixeltable/func/query_template_function.py +16 -7
  35. pixeltable/func/udf.py +7 -1
  36. pixeltable/functions/__init__.py +1 -1
  37. pixeltable/functions/anthropic.py +97 -21
  38. pixeltable/functions/gemini.py +2 -6
  39. pixeltable/functions/openai.py +219 -28
  40. pixeltable/globals.py +2 -3
  41. pixeltable/io/hf_datasets.py +1 -1
  42. pixeltable/io/label_studio.py +5 -5
  43. pixeltable/io/parquet.py +1 -1
  44. pixeltable/metadata/__init__.py +2 -1
  45. pixeltable/plan.py +24 -9
  46. pixeltable/store.py +6 -0
  47. pixeltable/type_system.py +73 -36
  48. pixeltable/utils/arrow.py +3 -8
  49. pixeltable/utils/console_output.py +41 -0
  50. pixeltable/utils/filecache.py +1 -1
  51. {pixeltable-0.3.0.dist-info → pixeltable-0.3.2.dist-info}/METADATA +4 -1
  52. {pixeltable-0.3.0.dist-info → pixeltable-0.3.2.dist-info}/RECORD +55 -49
  53. pixeltable/exec/expr_eval_node.py +0 -232
  54. {pixeltable-0.3.0.dist-info → pixeltable-0.3.2.dist-info}/LICENSE +0 -0
  55. {pixeltable-0.3.0.dist-info → pixeltable-0.3.2.dist-info}/WHEEL +0 -0
  56. {pixeltable-0.3.0.dist-info → pixeltable-0.3.2.dist-info}/entry_points.txt +0 -0
pixeltable/functions/openai.py CHANGED
@@ -6,14 +6,18 @@ the [Working with OpenAI](https://pixeltable.readme.io/docs/working-with-openai)
  """
 
  import base64
+ import datetime
  import io
  import json
+ import logging
  import pathlib
+ import re
  import uuid
- from typing import TYPE_CHECKING, Any, Callable, Optional, TypeVar, Union
+ from typing import TYPE_CHECKING, Callable, Optional, TypeVar, Union, cast, Any, Type
 
- import numpy as np
  import PIL.Image
+ import httpx
+ import numpy as np
  import tenacity
 
  import pixeltable as pxt
@@ -24,15 +28,28 @@ from pixeltable.utils.code import local_public_names
  if TYPE_CHECKING:
      import openai
 
+ _logger = logging.getLogger('pixeltable')
+
 
  @env.register_client('openai')
- def _(api_key: str) -> 'openai.OpenAI':
+ def _(api_key: str) -> tuple['openai.OpenAI', 'openai.AsyncOpenAI']:
      import openai
-     return openai.OpenAI(api_key=api_key)
+     return (
+         openai.OpenAI(api_key=api_key),
+         openai.AsyncOpenAI(
+             api_key=api_key,
+             # recommended to increase limits for async client to avoid connection errors
+             http_client=httpx.AsyncClient(limits=httpx.Limits(max_keepalive_connections=100, max_connections=500)),
+         )
+     )
 
 
  def _openai_client() -> 'openai.OpenAI':
-     return env.Env.get().get_client('openai')
+     return env.Env.get().get_client('openai')[0]
+
+
+ def _async_openai_client() -> 'openai.AsyncOpenAI':
+     return env.Env.get().get_client('openai')[1]
 
 
  # Exponential backoff decorator using tenacity.
@@ -47,13 +64,138 @@ def _retry(fn: Callable) -> Callable:
      )(fn)
 
 
+ # models that share rate limits; see https://platform.openai.com/settings/organization/limits for details
+ _shared_rate_limits = {
+     'gpt-4-turbo': [
+         'gpt-4-turbo',
+         'gpt-4-turbo-latest',
+         'gpt-4-turbo-2024-04-09',
+         'gpt-4-turbo-preview',
+         'gpt-4-0125-preview',
+         'gpt-4-1106-preview'
+     ],
+     'gpt-4o': [
+         'gpt-4o',
+         'gpt-4o-latest',
+         'gpt-4o-2024-05-13',
+         'gpt-4o-2024-08-06',
+         'gpt-4o-2024-11-20',
+         'gpt-4o-audio-preview',
+         'gpt-4o-audio-preview-2024-10-01',
+         'gpt-4o-audio-preview-2024-12-17'
+     ],
+     'gpt-4o-mini': [
+         'gpt-4o-mini',
+         'gpt-4o-mini-latest',
+         'gpt-4o-mini-2024-07-18',
+         'gpt-4o-mini-audio-preview',
+         'gpt-4o-mini-audio-preview-2024-12-17'
+     ],
+     'gpt-4o-mini-realtime-preview': [
+         'gpt-4o-mini-realtime-preview',
+         'gpt-4o-mini-realtime-preview-latest',
+         'gpt-4o-mini-realtime-preview-2024-12-17'
+     ]
+ }
+
+
+ def _resource_pool(model: str) -> str:
+     for model_family, models in _shared_rate_limits.items():
+         if model in models:
+             return f'rate-limits:openai:{model_family}'
+     return f'rate-limits:openai:{model}'
+
+
+ class OpenAIRateLimitsInfo(env.RateLimitsInfo):
+     retryable_errors: tuple[Type[Exception], ...]
+
+     def __init__(self, get_request_resources: Callable[..., dict[str, int]]):
+         super().__init__(get_request_resources)
+         import openai
+         self.retryable_errors = (
+             # ConnectionError: we occasionally see this error when the AsyncConnectionPool is trying to close
+             # expired connections
+             # (AsyncConnectionPool._close_expired_connections() fails with ConnectionError when executing
+             # 'await connection.aclose()', which is potentially a bug in AsyncConnectionPool)
+             openai.APIConnectionError,
+
+             # the following errors are retryable according to OpenAI's API documentation
+             openai.RateLimitError,
+             openai.APITimeoutError,
+             openai.UnprocessableEntityError,
+             openai.InternalServerError,
+         )
+
+     def get_retry_delay(self, exc: Exception) -> Optional[float]:
+         import openai
+
+         if not isinstance(exc, self.retryable_errors):
+             return None
+         assert isinstance(exc, openai.APIError)
+         return 1.0
+
+
+ # RE pattern for duration in '*-reset' headers;
+ # examples: 1d2h3ms, 4m5.6s; # fractional seconds can be reported as 0.5s or 500ms
+ _header_duration_pattern = re.compile(r'(?:(\d+)d)?(?:(\d+)h)?(?:(\d+)ms)|(?:(\d+)m)?(?:([\d.]+)s)?')
+
+
+ def _parse_header_duration(duration_str):
+     match = _header_duration_pattern.match(duration_str)
+     if not match:
+         raise ValueError("Invalid duration format")
+
+     days = int(match.group(1) or 0)
+     hours = int(match.group(2) or 0)
+     milliseconds = int(match.group(3) or 0)
+     minutes = int(match.group(4) or 0)
+     seconds = float(match.group(5) or 0)
+
+     return datetime.timedelta(
+         days=days,
+         hours=hours,
+         minutes=minutes,
+         seconds=seconds,
+         milliseconds=milliseconds
+     )
+
+
+ def _get_header_info(
+     headers: httpx.Headers, *, requests: bool = True, tokens: bool = True
+ ) -> tuple[Optional[tuple[int, int, datetime.datetime]], Optional[tuple[int, int, datetime.datetime]]]:
+     assert requests or tokens
+     now = datetime.datetime.now(tz=datetime.timezone.utc)
+
+     requests_info: Optional[tuple[int, int, datetime.datetime]] = None
+     if requests:
+         requests_limit_str = headers.get('x-ratelimit-limit-requests')
+         requests_limit = int(requests_limit_str) if requests_limit_str is not None else None
+         requests_remaining_str = headers.get('x-ratelimit-remaining-requests')
+         requests_remaining = int(requests_remaining_str) if requests_remaining_str is not None else None
+         requests_reset_str = headers.get('x-ratelimit-reset-requests')
+         requests_reset_ts = now + _parse_header_duration(requests_reset_str)
+         requests_info = (requests_limit, requests_remaining, requests_reset_ts)
+
+     tokens_info: Optional[tuple[int, int, datetime.datetime]] = None
+     if tokens:
+         tokens_limit_str = headers.get('x-ratelimit-limit-tokens')
+         tokens_limit = int(tokens_limit_str) if tokens_limit_str is not None else None
+         tokens_remaining_str = headers.get('x-ratelimit-remaining-tokens')
+         tokens_remaining = int(tokens_remaining_str) if tokens_remaining_str is not None else None
+         tokens_reset_str = headers.get('x-ratelimit-reset-tokens')
+         tokens_reset_ts = now + _parse_header_duration(tokens_reset_str)
+         tokens_info = (tokens_limit, tokens_remaining, tokens_reset_ts)
+
+     return requests_info, tokens_info
+
+
  #####################################
  # Audio Endpoints
 
 
  @pxt.udf
  def speech(
-     input: str, *, model: str, voice: str, response_format: Optional[str] = None, speed: Optional[float] = None
+     input: str, *, model: str, voice: str, response_format: Optional[str] = None, speed: Optional[float] = None
  ) -> pxt.Audio:
      """
      Generates audio from the input text.
@@ -176,8 +318,24 @@ def translations(
  # Chat Endpoints
 
 
+ def _chat_completions_get_request_resources(
+     messages: list, max_tokens: Optional[int], n: Optional[int]
+ ) -> dict[str, int]:
+     completion_tokens = n * max_tokens
+
+     num_tokens = 0.0
+     for message in messages:
+         num_tokens += 4  # every message follows <im_start>{role/name}\n{content}<im_end>\n
+         for key, value in message.items():
+             num_tokens += len(value) / 4
+             if key == "name":  # if there's a name, the role is omitted
+                 num_tokens -= 1  # role is always required and always 1 token
+     num_tokens += 2  # every reply is primed with <im_start>assistant
+     return {'requests': 1, 'tokens': int(num_tokens) + completion_tokens}
+
+
  @pxt.udf
- def chat_completions(
+ async def chat_completions(
      messages: list,
      *,
      model: str,
@@ -185,8 +343,8 @@ def chat_completions(
      logit_bias: Optional[dict[str, int]] = None,
      logprobs: Optional[bool] = None,
      top_logprobs: Optional[int] = None,
-     max_tokens: Optional[int] = None,
-     n: Optional[int] = None,
+     max_tokens: Optional[int] = 1024,
+     n: Optional[int] = 1,
      presence_penalty: Optional[float] = None,
      response_format: Optional[dict] = None,
      seed: Optional[int] = None,
@@ -226,7 +384,6 @@ def chat_completions(
      ]
      tbl['response'] = chat_completions(messages, model='gpt-4o-mini')
      """
-
      if tools is not None:
          tools = [
              {
@@ -253,7 +410,13 @@ def chat_completions(
      if tool_choice is not None and not tool_choice['parallel_tool_calls']:
          extra_body = {'parallel_tool_calls': False}
 
-     result = _retry(_openai_client().chat.completions.create)(
+     # make sure the pool info exists prior to making the request
+     resource_pool = _resource_pool(model)
+     rate_limits_info = env.Env.get().get_resource_pool_info(
+         resource_pool, lambda: OpenAIRateLimitsInfo(_chat_completions_get_request_resources))
+
+     # cast(Any, ...): avoid mypy errors
+     result = await _async_openai_client().chat.completions.with_raw_response.create(
          messages=messages,
          model=model,
          frequency_penalty=_opt(frequency_penalty),
@@ -263,17 +426,22 @@ def chat_completions(
          max_tokens=_opt(max_tokens),
          n=_opt(n),
          presence_penalty=_opt(presence_penalty),
-         response_format=_opt(response_format),
+         response_format=_opt(cast(Any, response_format)),
          seed=_opt(seed),
          stop=_opt(stop),
          temperature=_opt(temperature),
          top_p=_opt(top_p),
-         tools=_opt(tools),
-         tool_choice=_opt(tool_choice_),
+         tools=_opt(cast(Any, tools)),
+         tool_choice=_opt(cast(Any, tool_choice_)),
          user=_opt(user),
+         timeout=10,
          extra_body=extra_body,
      )
-     return result.dict()
+
+     requests_info, tokens_info = _get_header_info(result.headers)
+     rate_limits_info.record(requests=requests_info, tokens=tokens_info)
+
+     return json.loads(result.text)
 
 
  @pxt.udf
@@ -330,8 +498,13 @@ _embedding_dimensions_cache: dict[str, int] = {
  }
 
 
+ def _embeddings_get_request_resources(input: list[str]) -> dict[str, int]:
+     input_len = sum(len(s) for s in input)
+     return {'requests': 1, 'tokens': int(input_len / 4)}
+
+
  @pxt.udf(batch_size=32)
- def embeddings(
+ async def embeddings(
      input: Batch[str], *, model: str, dimensions: Optional[int] = None, user: Optional[str] = None
  ) -> Batch[pxt.Array[(None,), pxt.Float]]:
      """
@@ -361,10 +534,16 @@ def embeddings(
 
      >>> tbl['embed'] = embeddings(tbl.text, model='text-embedding-3-small')
      """
-     result = _retry(_openai_client().embeddings.create)(
+     _logger.debug(f'embeddings: batch_size={len(input)}')
+     resource_pool = _resource_pool(model)
+     rate_limits_info = env.Env.get().get_resource_pool_info(
+         resource_pool, lambda: OpenAIRateLimitsInfo(_embeddings_get_request_resources))
+     result = await _async_openai_client().embeddings.with_raw_response.create(
          input=input, model=model, dimensions=_opt(dimensions), user=_opt(user), encoding_format='float'
      )
-     return [np.array(data.embedding, dtype=np.float64) for data in result.data]
+     requests_info, tokens_info = _get_header_info(result.headers)
+     rate_limits_info.record(requests=requests_info, tokens=tokens_info)
+     return [np.array(data['embedding'], dtype=np.float64) for data in json.loads(result.content)['data']]
 
 
  @embeddings.conditional_return_type
@@ -385,7 +564,7 @@ def _(model: str, dimensions: Optional[int] = None) -> pxt.ArrayType:
  def image_generations(
      prompt: str,
      *,
-     model: Optional[str] = None,
+     model: str = 'dall-e-2',
      quality: Optional[str] = None,
      size: Optional[str] = None,
      style: Optional[str] = None,
@@ -441,7 +620,7 @@ def _(size: Optional[str] = None) -> pxt.ImageType:
      if x_pos == -1:
          return pxt.ImageType()
      try:
-         width, height = int(size[:x_pos]), int(size[x_pos + 1 :])
+         width, height = int(size[:x_pos]), int(size[x_pos + 1:])
      except ValueError:
          return pxt.ImageType()
      return pxt.ImageType(size=(width, height))
@@ -452,7 +631,7 @@ def _(size: Optional[str] = None) -> pxt.ImageType:
 
 
  @pxt.udf
- def moderations(input: str, *, model: Optional[str] = None) -> dict:
+ def moderations(input: str, *, model: str = 'omni-moderation-latest') -> dict:
      """
      Classifies if text is potentially harmful.
 
@@ -482,6 +661,18 @@ def moderations(input: str, *, model: Optional[str] = None) -> dict:
      return result.dict()
 
 
+ # @speech.resource_pool
+ # @transcriptions.resource_pool
+ # @translations.resource_pool
+ @chat_completions.resource_pool
+ # @vision.resource_pool
+ @embeddings.resource_pool
+ # @image_generations.resource_pool
+ # @moderations.resource_pool
+ def _(model: str) -> str:
+     return _resource_pool(model)
+
+
  def invoke_tools(tools: Tools, response: exprs.Expr) -> exprs.InlineDict:
      """Converts an OpenAI response dict to Pixeltable tool invocation format and calls `tools._invoke()`."""
      return tools._invoke(_openai_response_to_pxt_tool_calls(response))
@@ -489,15 +680,15 @@ def invoke_tools(tools: Tools, response: exprs.Expr) -> exprs.InlineDict:
 
  @pxt.udf
  def _openai_response_to_pxt_tool_calls(response: dict) -> Optional[dict]:
+     if 'tool_calls' not in response['choices'][0]['message'] or response['choices'][0]['message']['tool_calls'] is None:
+         return None
      openai_tool_calls = response['choices'][0]['message']['tool_calls']
-     if openai_tool_calls is not None:
-         return {
-             tool_call['function']['name']: {
-                 'args': json.loads(tool_call['function']['arguments'])
-             }
-             for tool_call in openai_tool_calls
+     return {
+         tool_call['function']['name']: {
+             'args': json.loads(tool_call['function']['arguments'])
          }
-     return None
+         for tool_call in openai_tool_calls
+     }
 
 
  _T = TypeVar('_T')
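
Editor's note (illustration only, not part of the package): the rate-limit bookkeeping added to pixeltable/functions/openai.py above parses OpenAI's x-ratelimit-*-reset headers, whose values are durations such as 1d2h3ms, 4m5.6s, 0.5s, or 500ms, and turns them into absolute reset timestamps (now + duration). A minimal standalone sketch of that parsing, reusing the regex from the diff; the helper name parse_reset_duration and the driver loop are illustrative, and the sample values are taken from the comment in the diff:

    import datetime
    import re

    # same pattern as _header_duration_pattern in the 0.3.2 diff above
    pattern = re.compile(r'(?:(\d+)d)?(?:(\d+)h)?(?:(\d+)ms)|(?:(\d+)m)?(?:([\d.]+)s)?')

    def parse_reset_duration(value: str) -> datetime.timedelta:
        # '*-reset' header values look like '1d2h3ms', '4m5.6s', '0.5s', or '500ms'
        match = pattern.match(value)
        if not match:
            raise ValueError('Invalid duration format')
        return datetime.timedelta(
            days=int(match.group(1) or 0),
            hours=int(match.group(2) or 0),
            milliseconds=int(match.group(3) or 0),
            minutes=int(match.group(4) or 0),
            seconds=float(match.group(5) or 0),
        )

    # turn each reset duration into an absolute timestamp, as _get_header_info() does
    now = datetime.datetime.now(tz=datetime.timezone.utc)
    for value in ('1d2h3ms', '4m5.6s', '0.5s', '500ms'):
        print(value, '->', now + parse_reset_duration(value))
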
pixeltable/globals.py CHANGED
@@ -606,8 +606,7 @@ def create_dir(path_str: str, if_exists: Literal['error', 'ignore', 'replace', '
          dir = catalog.Dir(dir_record.id, parent._id, path.name)
          cat.paths[path] = dir
          session.commit()
-         _logger.info(f'Created directory `{path_str}`.')
-         print(f'Created directory `{path_str}`.')
+         Env.get().console_logger.info(f'Created directory `{path_str}`.')
          return dir
 
  def drop_dir(path_str: str, force: bool = False, if_not_exists: Literal['error', 'ignore'] = 'error') -> None:
@@ -817,4 +816,4 @@ def configure_logging(
 
 
  def array(elements: Iterable) -> exprs.Expr:
-     return exprs.InlineArray(elements)
+     return exprs.Expr.from_array(elements)
pixeltable/io/hf_datasets.py CHANGED
@@ -13,7 +13,7 @@ from pixeltable import exceptions as excs
  if typing.TYPE_CHECKING:
      import datasets  # type: ignore[import-untyped]
 
- _logger = logging.getLogger(__name__)
+ _logger = logging.getLogger('pixeltable')
 
  # use 100MB as the batch size limit for loading a huggingface dataset into pixeltable.
  # The primary goal is to bound memory use, regardless of dataset size.
pixeltable/io/label_studio.py CHANGED
@@ -230,7 +230,7 @@ class LabelStudioProject(Project):
              self.project.create_predictions(predictions)
              tasks_created += 1
 
-         print(f'Created {tasks_created} new task(s) in {self}.')
+         env.Env.get().console_logger.info(f'Created {tasks_created} new task(s) in {self}.')
 
          sync_status = SyncStatus(external_rows_created=tasks_created)
 
@@ -330,7 +330,7 @@ class LabelStudioProject(Project):
          if len(page) > 0:
              self.project.import_tasks(page)
 
-         print(f'Created {tasks_created} new task(s) and updated {tasks_updated} existing task(s) in {self}.')
+         env.Env.get().console_logger.info(f'Created {tasks_created} new task(s) and updated {tasks_updated} existing task(s) in {self}.')
 
          sync_status = SyncStatus(external_rows_created=tasks_created, external_rows_updated=tasks_updated)
 
@@ -363,7 +363,7 @@ class LabelStudioProject(Project):
 
          if len(tasks_to_delete) > 0:
              self.project.delete_tasks(tasks_to_delete)
-             print(f'Deleted {len(tasks_to_delete)} tasks(s) in {self} that are no longer present in Pixeltable.')
+             env.Env.get().console_logger.info(f'Deleted {len(tasks_to_delete)} tasks(s) in {self} that are no longer present in Pixeltable.')
 
          # Remove them from the `existing_tasks` dict so that future updates are applied correctly
          for rowid in deleted_rowids:
@@ -406,7 +406,7 @@ class LabelStudioProject(Project):
                  assert ancestor._base is not None
                  ancestor = ancestor._base
              update_status = ancestor.batch_update(updates)
-             print(f'Updated annotation(s) from {len(updates)} task(s) in {self}.')
+             env.Env.get().console_logger.info(f'Updated annotation(s) from {len(updates)} task(s) in {self}.')
              return SyncStatus(pxt_rows_updated=update_status.num_rows, num_excs=update_status.num_excs)
          else:
              return SyncStatus.empty()
@@ -529,7 +529,7 @@ class LabelStudioProject(Project):
          """
          title = self.project_title
          _label_studio_client().delete_project(self.project_id)
-         print(f'Deleted Label Studio project: {title}')
+         env.Env.get().console_logger.info(f'Deleted Label Studio project: {title}')
 
      def __eq__(self, other) -> bool:
          return isinstance(other, LabelStudioProject) and self.project_id == other.project_id
pixeltable/io/parquet.py CHANGED
@@ -23,7 +23,7 @@ if typing.TYPE_CHECKING:
      import pyarrow as pa
      import pixeltable as pxt
 
- _logger = logging.getLogger(__name__)
+ _logger = logging.getLogger('pixeltable')
 
 
  def _write_batch(value_batch: dict[str, deque], schema: pa.Schema, output_path: Path) -> None:
pixeltable/metadata/__init__.py CHANGED
@@ -47,7 +47,8 @@ def upgrade_md(engine: sql.engine.Engine) -> None:
      while md_version < VERSION:
          if md_version not in converter_cbs:
              raise RuntimeError(f'No metadata converter for version {md_version}')
-         print(f'Converting metadata from version {md_version} to {md_version + 1}')
+         from pixeltable.env import Env
+         Env.get().console_logger.info(f'Converting metadata from version {md_version} to {md_version + 1}')
          converter_cbs[md_version](engine)
          md_version += 1
      # update system info
pixeltable/plan.py CHANGED
@@ -5,6 +5,7 @@ import enum
  from typing import Any, Iterable, Optional, Sequence, Literal
  from uuid import UUID
 
+
  import sqlalchemy as sql
 
  import pixeltable as pxt
@@ -166,10 +167,13 @@ class Analyzer:
              raise excs.Error(
                  f'Invalid non-aggregate expression in aggregate query: {self.select_list[is_agg_output.index(False)]}')
 
-         # check that filter doesn't contain aggregates
+         # check that Where clause and filter doesn't contain aggregates
+         if self.sql_where_clause is not None:
+             if any(_is_agg_fn_call(e) for e in self.sql_where_clause.subexprs(expr_class=exprs.FunctionCall)):
+                 raise excs.Error(f'where() cannot contain aggregate functions: {self.sql_where_clause}')
          if self.filter is not None:
              if any(_is_agg_fn_call(e) for e in self.filter.subexprs(expr_class=exprs.FunctionCall)):
-                 raise excs.Error(f'Filter cannot contain aggregate functions: {self.filter}')
+                 raise excs.Error(f'where() cannot contain aggregate functions: {self.filter}')
 
          # check that grouping exprs don't contain aggregates and can be expressed as SQL (we perform sort-based
          # aggregation and rely on the SqlScanNode returning data in the correct order)
@@ -283,7 +287,8 @@ class Planner:
          computed_exprs = row_builder.output_exprs - row_builder.input_exprs
          if len(computed_exprs) > 0:
              # add an ExprEvalNode when there are exprs to compute
-             plan = exec.ExprEvalNode(row_builder, computed_exprs, plan.output_exprs, input=plan)
+             plan = exec.ExprEvalNode(
+                 row_builder, computed_exprs, plan.output_exprs, input=plan, maintain_input_order=False)
 
          stored_col_info = row_builder.output_slot_idxs()
          stored_img_col_info = [info for info in stored_col_info if info.col.col_type.is_image_type()]
@@ -548,7 +553,7 @@ class Planner:
          plan = exec.ComponentIterationNode(target, plan)
          if len(view_output_exprs) > 0:
              plan = exec.ExprEvalNode(
-                 row_builder, output_exprs=view_output_exprs, input_exprs=base_output_exprs,input=plan)
+                 row_builder, output_exprs=view_output_exprs, input_exprs=base_output_exprs, input=plan)
 
          stored_img_col_info = [info for info in row_builder.output_slot_idxs() if info.col.col_type.is_image_type()]
          plan.set_stored_img_cols(stored_img_col_info)
@@ -750,10 +755,12 @@ class Planner:
              ctx.batch_size = 16
 
          # do aggregation in SQL if all agg exprs can be translated
-         if (sql_elements.contains_all(analyzer.select_list)
-                 and sql_elements.contains_all(analyzer.grouping_exprs)
-                 and isinstance(plan, exec.SqlNode)
-                 and plan.to_cte() is not None):
+         if (
+             sql_elements.contains_all(analyzer.select_list)
+             and sql_elements.contains_all(analyzer.grouping_exprs)
+             and isinstance(plan, exec.SqlNode)
+             and plan.to_cte() is not None
+         ):
              plan = exec.SqlAggregationNode(
                  row_builder, input=plan, select_list=analyzer.select_list, group_by_items=analyzer.group_by_clause)
          else:
@@ -770,14 +777,22 @@ class Planner:
              # we need an ExprEvalNode to evaluate the remaining output exprs
              plan = exec.ExprEvalNode(row_builder, eval_ctx.target_exprs, sql_exprs, input=plan)
          # we're returning everything to the user, so we might as well do it in a single batch
+         # TODO: return smaller batches in order to increase inter-ExecNode parallelism
          ctx.batch_size = 0
 
+         sql_node = plan.get_node(exec.SqlNode)
          if len(analyzer.order_by_clause) > 0:
              # we have the last SqlNode we created produce the ordering
-             sql_node = plan.get_node(exec.SqlNode)
              assert sql_node is not None
              sql_node.set_order_by(analyzer.order_by_clause)
 
+         # if we don't need an ordered result, tell the ExprEvalNode not to maintain input order (which allows us to
+         # return batches earlier)
+         if sql_node is not None and len(sql_node.order_by_clause) == 0:
+             expr_eval_node = plan.get_node(exec.ExprEvalNode)
+             if expr_eval_node is not None:
+                 expr_eval_node.set_input_order(False)
+
          if limit is not None:
              plan.set_limit(limit)
 
pixeltable/store.py CHANGED
@@ -229,6 +229,7 @@ class StoreBase:
              sql.exc.DBAPIError if there was a SQL error during execution
              excs.Error if on_error='abort' and there was an exception during row evaluation
          """
+         assert col.tbl.id == self.tbl_version.id
          num_excs = 0
          num_rows = 0
 
@@ -249,6 +250,7 @@
 
          try:
              # insert rows from exec_plan into temp table
+             # TODO: unify the table row construction logic with RowBuilder.create_table_row()
              for row_batch in exec_plan:
                  num_rows += len(row_batch)
                  tbl_rows: list[dict[str, Any]] = []
@@ -272,6 +274,10 @@
                          tbl_row[col.sa_errortype_col.name] = error_type
                          tbl_row[col.sa_errormsg_col.name] = error_msg
                      else:
+                         if col.col_type.is_image_type() and result_row.file_urls[value_expr_slot_idx] is None:
+                             # we have yet to store this image
+                             filepath = str(MediaStore.prepare_media_path(col.tbl.id, col.id, col.tbl.version))
+                             result_row.flush_img(value_expr_slot_idx, filepath)
                          val = result_row.get_stored_val(value_expr_slot_idx, col.sa_col.type)
                          if col.col_type.is_media_type():
                              val = self._move_tmp_media_file(val, col, result_row.pk[-1])