pixeltable 0.3.3__py3-none-any.whl → 0.3.5__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registries.

Potentially problematic release: this version of pixeltable might be problematic.
Files changed (60)
  1. pixeltable/__init__.py +1 -0
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/catalog.py +9 -2
  4. pixeltable/catalog/column.py +1 -1
  5. pixeltable/catalog/dir.py +1 -1
  6. pixeltable/catalog/table.py +1 -1
  7. pixeltable/catalog/table_version.py +12 -2
  8. pixeltable/catalog/table_version_path.py +2 -2
  9. pixeltable/catalog/view.py +64 -20
  10. pixeltable/dataframe.py +14 -14
  11. pixeltable/env.py +20 -3
  12. pixeltable/exec/component_iteration_node.py +1 -2
  13. pixeltable/exec/expr_eval/evaluators.py +4 -2
  14. pixeltable/exec/expr_eval/expr_eval_node.py +4 -1
  15. pixeltable/exprs/comparison.py +8 -4
  16. pixeltable/exprs/data_row.py +5 -3
  17. pixeltable/exprs/expr.py +9 -2
  18. pixeltable/exprs/function_call.py +155 -313
  19. pixeltable/func/aggregate_function.py +29 -15
  20. pixeltable/func/callable_function.py +11 -8
  21. pixeltable/func/expr_template_function.py +3 -9
  22. pixeltable/func/function.py +148 -74
  23. pixeltable/func/signature.py +65 -30
  24. pixeltable/func/udf.py +1 -1
  25. pixeltable/functions/__init__.py +1 -0
  26. pixeltable/functions/deepseek.py +121 -0
  27. pixeltable/functions/image.py +7 -7
  28. pixeltable/functions/openai.py +49 -10
  29. pixeltable/functions/video.py +14 -7
  30. pixeltable/globals.py +14 -3
  31. pixeltable/index/embedding_index.py +4 -13
  32. pixeltable/io/globals.py +88 -77
  33. pixeltable/io/hf_datasets.py +34 -34
  34. pixeltable/io/pandas.py +75 -87
  35. pixeltable/io/parquet.py +19 -27
  36. pixeltable/io/utils.py +115 -0
  37. pixeltable/iterators/audio.py +2 -1
  38. pixeltable/iterators/video.py +1 -1
  39. pixeltable/metadata/__init__.py +2 -1
  40. pixeltable/metadata/converters/convert_15.py +18 -8
  41. pixeltable/metadata/converters/convert_27.py +31 -0
  42. pixeltable/metadata/converters/convert_28.py +15 -0
  43. pixeltable/metadata/converters/convert_29.py +111 -0
  44. pixeltable/metadata/converters/util.py +12 -1
  45. pixeltable/metadata/notes.py +3 -0
  46. pixeltable/metadata/schema.py +8 -0
  47. pixeltable/share/__init__.py +1 -0
  48. pixeltable/share/packager.py +246 -0
  49. pixeltable/share/publish.py +97 -0
  50. pixeltable/type_system.py +87 -42
  51. pixeltable/utils/__init__.py +41 -0
  52. pixeltable/utils/arrow.py +45 -12
  53. pixeltable/utils/formatter.py +1 -1
  54. pixeltable/utils/iceberg.py +14 -0
  55. pixeltable/utils/media_store.py +1 -1
  56. {pixeltable-0.3.3.dist-info → pixeltable-0.3.5.dist-info}/METADATA +37 -50
  57. {pixeltable-0.3.3.dist-info → pixeltable-0.3.5.dist-info}/RECORD +60 -51
  58. {pixeltable-0.3.3.dist-info → pixeltable-0.3.5.dist-info}/WHEEL +1 -1
  59. {pixeltable-0.3.3.dist-info → pixeltable-0.3.5.dist-info}/LICENSE +0 -0
  60. {pixeltable-0.3.3.dist-info → pixeltable-0.3.5.dist-info}/entry_points.txt +0 -0
pixeltable/functions/deepseek.py ADDED
@@ -0,0 +1,121 @@
+import json
+from typing import TYPE_CHECKING, Any, Optional, Union, cast
+
+import httpx
+
+import pixeltable as pxt
+from pixeltable import env
+from pixeltable.utils.code import local_public_names
+
+from .openai import _opt
+
+if TYPE_CHECKING:
+    import openai
+
+
+@env.register_client('deepseek')
+def _(api_key: str) -> 'openai.AsyncOpenAI':
+    import openai
+
+    return openai.AsyncOpenAI(
+        api_key=api_key,
+        base_url='https://api.deepseek.com',
+        http_client=httpx.AsyncClient(limits=httpx.Limits(max_keepalive_connections=100, max_connections=500)),
+    )
+
+
+def _deepseek_client() -> 'openai.AsyncOpenAI':
+    return env.Env.get().get_client('deepseek')
+
+
+@pxt.udf
+async def chat_completions(
+    messages: list,
+    *,
+    model: str,
+    frequency_penalty: Optional[float] = None,
+    logprobs: Optional[bool] = None,
+    top_logprobs: Optional[int] = None,
+    max_tokens: Optional[int] = None,
+    presence_penalty: Optional[float] = None,
+    response_format: Optional[dict] = None,
+    stop: Optional[list[str]] = None,
+    temperature: Optional[float] = None,
+    tools: Optional[list[dict]] = None,
+    tool_choice: Optional[dict] = None,
+    top_p: Optional[float] = None,
+) -> dict:
+    """
+    Creates a model response for the given chat conversation.
+
+    Equivalent to the Deepseek `chat/completions` API endpoint.
+    For additional details, see: <https://api-docs.deepseek.com/api/create-chat-completion>
+
+    Deepseek uses the OpenAI SDK, so you will need to install the `openai` package to use this UDF.
+
+    __Requirements:__
+
+    - `pip install openai`
+
+    Args:
+        messages: A list of messages to use for chat completion, as described in the Deepseek API documentation.
+        model: The model to use for chat completion.
+
+    For details on the other parameters, see: <https://api-docs.deepseek.com/api/create-chat-completion>
+
+    Returns:
+        A dictionary containing the response and other metadata.
+
+    Examples:
+        Add a computed column that applies the model `deepseek-chat` to an existing Pixeltable column `tbl.prompt`
+        of the table `tbl`:
+
+        >>> messages = [
+                {'role': 'system', 'content': 'You are a helpful assistant.'},
+                {'role': 'user', 'content': tbl.prompt}
+            ]
+            tbl.add_computed_column(response=chat_completions(messages, model='deepseek-chat'))
+    """
+    if tools is not None:
+        tools = [{'type': 'function', 'function': tool} for tool in tools]
+
+    tool_choice_: Union[str, dict, None] = None
+    if tool_choice is not None:
+        if tool_choice['auto']:
+            tool_choice_ = 'auto'
+        elif tool_choice['required']:
+            tool_choice_ = 'required'
+        else:
+            assert tool_choice['tool'] is not None
+            tool_choice_ = {'type': 'function', 'function': {'name': tool_choice['tool']}}
+
+    extra_body: Optional[dict[str, Any]] = None
+    if tool_choice is not None and not tool_choice['parallel_tool_calls']:
+        extra_body = {'parallel_tool_calls': False}
+
+    # cast(Any, ...): avoid mypy errors
+    result = await _deepseek_client().chat.completions.with_raw_response.create(
+        messages=messages,
+        model=model,
+        frequency_penalty=_opt(frequency_penalty),
+        logprobs=_opt(logprobs),
+        top_logprobs=_opt(top_logprobs),
+        max_tokens=_opt(max_tokens),
+        presence_penalty=_opt(presence_penalty),
+        response_format=_opt(cast(Any, response_format)),
+        stop=_opt(stop),
+        temperature=_opt(temperature),
+        tools=_opt(cast(Any, tools)),
+        tool_choice=_opt(cast(Any, tool_choice_)),
+        top_p=_opt(top_p),
+        extra_body=extra_body,
+    )
+
+    return json.loads(result.text)
+
+
+__all__ = local_public_names(__name__)
+
+
+def __dir__():
+    return __all__
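The new module registers an OpenAI-compatible client against DeepSeek's base URL, so the UDF plugs into Pixeltable's computed columns the same way the OpenAI functions do. A minimal usage sketch following the docstring's example (the table and column names are illustrative, and a configured DeepSeek API key is assumed):

```python
import pixeltable as pxt
from pixeltable.functions.deepseek import chat_completions

# Hypothetical table with a string column `prompt`.
tbl = pxt.create_table('prompts', {'prompt': pxt.String})

messages = [
    {'role': 'system', 'content': 'You are a helpful assistant.'},
    {'role': 'user', 'content': tbl.prompt},
]
# Each inserted row triggers an async chat/completions request; the raw JSON
# response dict lands in the computed column.
tbl.add_computed_column(response=chat_completions(messages, model='deepseek-chat'))
tbl.insert([{'prompt': 'What is Pixeltable?'}])
```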
pixeltable/functions/image.py CHANGED
@@ -131,6 +131,13 @@ def getchannel(self: PIL.Image.Image, channel: int) -> PIL.Image.Image:
     pass
 
 
+@getchannel.conditional_return_type
+def _(self: Expr) -> pxt.ColumnType:
+    input_type = self.col_type
+    assert isinstance(input_type, pxt.ImageType)
+    return pxt.ImageType(size=input_type.size, mode='L', nullable=input_type.nullable)
+
+
 @pxt.udf(is_method=True)
 def get_metadata(self: PIL.Image.Image) -> dict:
     """
@@ -146,13 +153,6 @@ def get_metadata(self: PIL.Image.Image) -> dict:
     }
 
 
-@getchannel.conditional_return_type
-def _(self: Expr) -> pxt.ColumnType:
-    input_type = self.col_type
-    assert isinstance(input_type, pxt.ImageType)
-    return pxt.ImageType(size=input_type.size, mode='L', nullable=input_type.nullable)
-
-
 # Image.point()
 @pxt.udf(is_method=True)
 def point(self: PIL.Image.Image, lut: list[int], mode: Optional[str] = None) -> PIL.Image.Image:
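Moving the rule next to its UDF also makes the effect of `conditional_return_type` easier to spot: the declared type of a `getchannel()` call is narrowed to a single-band image that preserves the input's size and nullability. A sketch of what that buys at the expression level (table, column, and type parameters are hypothetical):

```python
import pixeltable as pxt

imgs = pxt.create_table('imgs', {'img': pxt.Image[(640, 480), 'RGB']})

# The expression's static type comes from the conditional_return_type rule:
# mode 'L', same size and nullability as the input column.
expr = imgs.img.getchannel(0)
print(expr.col_type)  # expected: an ImageType with size=(640, 480), mode='L'
```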
pixeltable/functions/openai.py CHANGED
@@ -14,7 +14,7 @@ import math
 import pathlib
 import re
 import uuid
-from typing import TYPE_CHECKING, Any, Callable, Optional, Type, TypeVar, Union, cast
+from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Type, TypeVar, Union, cast
 
 import httpx
 import numpy as np
@@ -324,10 +324,37 @@ async def translations(
 # Chat Endpoints
 
 
+def _default_max_tokens(model: str) -> int:
+    if (
+        _is_model_family(model, 'gpt-4o-realtime')
+        or _is_model_family(model, 'gpt-4o-mini-realtime')
+        or _is_model_family(model, 'gpt-4-turbo')
+        or _is_model_family(model, 'gpt-3.5-turbo')
+    ):
+        return 4096
+    if _is_model_family(model, 'gpt-4'):
+        return 8192  # All other gpt-4 models (will not match on gpt-4o models)
+    if _is_model_family(model, 'gpt-4o') or _is_model_family(model, 'gpt-4.5-preview'):
+        return 16384  # All other gpt-4o / gpt-4.5 models
+    if _is_model_family(model, 'o1-preview'):
+        return 32768
+    if _is_model_family(model, 'o1-mini'):
+        return 65536
+    if _is_model_family(model, 'o1') or _is_model_family(model, 'o3'):
+        return 100000  # All other o1 / o3 models
+    return 100000  # global default
+
+
+def _is_model_family(model: str, family: str) -> bool:
+    # `model.startswith(family)` would be a simpler match, but increases the risk of false positives.
+    # We use a slightly more complicated criterion to make things a little less error prone.
+    return model == family or model.startswith(f'{family}-')
+
+
 def _chat_completions_get_request_resources(
-    messages: list, max_tokens: Optional[int], n: Optional[int]
+    messages: list, model: str, max_completion_tokens: Optional[int], max_tokens: Optional[int], n: Optional[int]
 ) -> dict[str, int]:
-    completion_tokens = n * max_tokens
+    completion_tokens = (n or 1) * (max_completion_tokens or max_tokens or _default_max_tokens(model))
 
     num_tokens = 0.0
     for message in messages:
@@ -349,16 +376,18 @@ async def chat_completions(
     logit_bias: Optional[dict[str, int]] = None,
     logprobs: Optional[bool] = None,
     top_logprobs: Optional[int] = None,
-    max_tokens: Optional[int] = 1024,
-    n: Optional[int] = 1,
+    max_completion_tokens: Optional[int] = None,
+    max_tokens: Optional[int] = None,
+    n: Optional[int] = None,
     presence_penalty: Optional[float] = None,
+    reasoning_effort: Optional[Literal['low', 'medium', 'high']] = None,
     response_format: Optional[dict] = None,
     seed: Optional[int] = None,
     stop: Optional[list[str]] = None,
     temperature: Optional[float] = None,
-    top_p: Optional[float] = None,
     tools: Optional[list[dict]] = None,
     tool_choice: Optional[dict] = None,
+    top_p: Optional[float] = None,
     user: Optional[str] = None,
     timeout: Optional[float] = None,
 ) -> dict:
@@ -426,16 +455,18 @@ async def chat_completions(
         logit_bias=_opt(logit_bias),
         logprobs=_opt(logprobs),
         top_logprobs=_opt(top_logprobs),
+        max_completion_tokens=_opt(max_completion_tokens),
         max_tokens=_opt(max_tokens),
         n=_opt(n),
         presence_penalty=_opt(presence_penalty),
+        reasoning_effort=_opt(reasoning_effort),
         response_format=_opt(cast(Any, response_format)),
         seed=_opt(seed),
         stop=_opt(stop),
         temperature=_opt(temperature),
-        top_p=_opt(top_p),
         tools=_opt(cast(Any, tools)),
         tool_choice=_opt(cast(Any, tool_choice_)),
+        top_p=_opt(top_p),
         user=_opt(user),
         timeout=_opt(timeout),
         extra_body=extra_body,
@@ -448,9 +479,14 @@ async def chat_completions(
 
 
 def _vision_get_request_resources(
-    prompt: str, image: PIL.Image.Image, max_tokens: Optional[int], n: Optional[int]
+    prompt: str,
+    image: PIL.Image.Image,
+    model: str,
+    max_completion_tokens: Optional[int],
+    max_tokens: Optional[int],
+    n: Optional[int],
 ) -> dict[str, int]:
-    completion_tokens = n * max_tokens
+    completion_tokens = (n or 1) * (max_completion_tokens or max_tokens or _default_max_tokens(model))
     prompt_tokens = len(prompt) / 4
 
     # calculate image tokens based on
@@ -482,7 +518,8 @@ async def vision(
     image: PIL.Image.Image,
     *,
     model: str,
-    max_tokens: Optional[int] = 1024,
+    max_completion_tokens: Optional[int] = None,
+    max_tokens: Optional[int] = None,
     n: Optional[int] = 1,
     timeout: Optional[float] = None,
 ) -> str:
@@ -534,9 +571,11 @@ async def vision(
     rate_limits_info = env.Env.get().get_resource_pool_info(
        resource_pool, lambda: OpenAIRateLimitsInfo(_vision_get_request_resources)
    )
+
    result = await _openai_client().chat.completions.with_raw_response.create(
        messages=messages,  # type: ignore
        model=model,
+        max_completion_tokens=_opt(max_completion_tokens),
        max_tokens=_opt(max_tokens),
        n=_opt(n),
        timeout=_opt(timeout),
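The reason `_is_model_family` checks `model == family or model.startswith(f'{family}-')` rather than a bare prefix match is that model names nest: 'gpt-4o-mini' starts with 'gpt-4' but belongs to a different family with a different token limit. A self-contained restatement of the rule, plus the budget fallback it feeds (limits copied from the diff above):

```python
def is_model_family(model: str, family: str) -> bool:
    # Exact name, or the family name followed by a '-'-separated suffix.
    return model == family or model.startswith(f'{family}-')

assert is_model_family('gpt-4', 'gpt-4')
assert is_model_family('gpt-4-turbo', 'gpt-4')
assert not is_model_family('gpt-4o-mini', 'gpt-4')  # bare startswith() would wrongly match
assert not is_model_family('gpt-45', 'gpt-4')       # likewise

# The resource estimate then falls back per family:
# (n or 1) * (max_completion_tokens or max_tokens or _default_max_tokens(model)),
# so model='gpt-4o' with no explicit limits budgets 1 * 16384 completion tokens.
```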
pixeltable/functions/video.py CHANGED
@@ -14,9 +14,9 @@ t.select(pxt_video.extract_audio(t.video_col)).collect()
 import tempfile
 import uuid
 from pathlib import Path
-from typing import Optional
+from typing import Any, Optional
 
-import av  # type: ignore[import-untyped]
+import av
 import numpy as np
 import PIL.Image
 
@@ -53,10 +53,14 @@ class make_video(pxt.Aggregator):
     Aggregator that creates a video from a sequence of images.
     """
 
+    container: Optional[av.container.OutputContainer]
+    stream: Optional[av.video.stream.VideoStream]
+    fps: int
+
     def __init__(self, fps: int = 25):
         """follows https://pyav.org/docs/develop/cookbook/numpy.html#generating-video"""
-        self.container: Optional[av.container.OutputContainer] = None
-        self.stream: Optional[av.stream.Stream] = None
+        self.container = None
+        self.stream = None
         self.fps = fps
 
     def update(self, frame: PIL.Image.Image) -> None:
@@ -107,9 +111,10 @@ def extract_audio(
 
     with av.open(output_filename, 'w', format=format) as output_container:
         output_stream = output_container.add_stream(codec or default_codec)
+        assert isinstance(output_stream, av.audio.stream.AudioStream)
         for packet in container.demux(audio_stream):
             for frame in packet.decode():
-                output_container.mux(output_stream.encode(frame))
+                output_container.mux(output_stream.encode(frame))  # type: ignore[arg-type]
 
     return output_filename
 
@@ -141,7 +146,7 @@ def __get_stream_metadata(stream: av.stream.Stream) -> dict:
         return {'type': stream.type}  # Currently unsupported
 
     codec_context = stream.codec_context
-    codec_context_md = {
+    codec_context_md: dict[str, Any] = {
         'name': codec_context.name,
         'codec_tag': codec_context.codec_tag.encode('unicode-escape').decode('utf-8'),
         'profile': codec_context.profile,
@@ -160,9 +165,11 @@ def __get_stream_metadata(stream: av.stream.Stream) -> dict:
 
     if stream.type == 'audio':
         # Additional metadata for audio
-        codec_context_md['channels'] = int(codec_context.channels) if codec_context.channels is not None else None
+        channels = getattr(stream.codec_context, 'channels', None)
+        codec_context_md['channels'] = int(channels) if channels is not None else None
     else:
         assert stream.type == 'video'
+        assert isinstance(stream, av.video.stream.VideoStream)
         # Additional metadata for video
         codec_context_md['pix_fmt'] = getattr(stream.codec_context, 'pix_fmt', None)
     metadata.update(
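The new class-level annotations (`av.container.OutputContainer`, `av.video.stream.VideoStream`) pin down the concrete types from the PyAV cookbook recipe that `make_video`'s docstring cites. For reference, that encode loop looks roughly like this as standalone PyAV code (a sketch, not Pixeltable code; output path and frame contents are made up):

```python
import av
import numpy as np

container = av.open('out.mp4', mode='w')        # av.container.OutputContainer
stream = container.add_stream('h264', rate=25)  # av.video.stream.VideoStream
stream.width, stream.height, stream.pix_fmt = 320, 240, 'yuv420p'

for i in range(50):
    img = np.full((240, 320, 3), i * 5, dtype=np.uint8)  # brightening grey frames
    frame = av.VideoFrame.from_ndarray(img, format='rgb24')
    for packet in stream.encode(frame):
        container.mux(packet)

for packet in stream.encode():  # flush buffered packets
    container.mux(packet)
container.close()
```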
pixeltable/globals.py CHANGED
@@ -1,5 +1,6 @@
 import dataclasses
 import logging
+import urllib.parse
 from typing import Any, Iterable, Literal, Optional, Union
 from uuid import UUID
 
@@ -10,7 +11,7 @@ from sqlalchemy.util.preloaded import orm
 
 import pixeltable.exceptions as excs
 import pixeltable.exprs as exprs
-from pixeltable import DataFrame, catalog, func
+from pixeltable import DataFrame, catalog, func, share
 from pixeltable.catalog import Catalog
 from pixeltable.dataframe import DataFrameResultSet
 from pixeltable.env import Env
@@ -279,14 +280,16 @@ def create_view(
         ... view = pxt.create_view('my_view', tbl.where(tbl.col1 > 100), if_exists='replace_force')
     """
     where: Optional[exprs.Expr] = None
+    select_list: Optional[list[tuple[exprs.Expr, Optional[str]]]] = None
     if isinstance(base, catalog.Table):
         tbl_version_path = base._tbl_version_path
     elif isinstance(base, DataFrame):
-        base._validate_mutable('create_view')
+        base._validate_mutable('create_view', allow_select=True)
         if len(base._from_clause.tbls) > 1:
             raise excs.Error('Cannot create a view of a join')
         tbl_version_path = base._from_clause.tbls[0]
         where = base.where_clause
+        select_list = base.select_list
     else:
         raise excs.Error('`base` must be an instance of `Table` or `DataFrame`')
     assert isinstance(base, catalog.Table) or isinstance(base, DataFrame)
@@ -322,6 +325,7 @@ def create_view(
         dir._id,
         path.name,
         base=tbl_version_path,
+        select_list=select_list,
         additional_columns=additional_columns,
         predicate=where,
         is_snapshot=is_snapshot,
@@ -630,7 +634,7 @@ def create_dir(
     parent = cat.paths[path.parent]
     assert parent is not None
     with orm.Session(Env.get().engine, future=True) as session:
-        dir_md = schema.DirMd(name=path.name)
+        dir_md = schema.DirMd(name=path.name, user=None, additional_md={})
         dir_record = schema.Dir(parent_id=parent._id, md=dataclasses.asdict(dir_md))
         session.add(dir_record)
         session.flush()
@@ -723,6 +727,13 @@ def drop_dir(path_str: str, force: bool = False, if_not_exists: Literal['error',
     _logger.info(f'Removed directory `{path_str}`.')
 
 
+def publish_snapshot(dest_uri: str, table: catalog.Table) -> None:
+    parsed_uri = urllib.parse.urlparse(dest_uri)
+    if parsed_uri.scheme != 'pxt':
+        raise excs.Error(f'Invalid Pixeltable URI (does not start with pxt://): {dest_uri}')
+    share.publish_snapshot(dest_uri, table)
+
+
 def list_dirs(path_str: str = '', recursive: bool = True) -> list[str]:
     """List the directories in a directory.
 
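`publish_snapshot` only validates the URI scheme before delegating to the new `pixeltable.share` package. Assuming it is exposed at the package level like the other functions in `globals.py`, usage would look like this (the URI and table path below are made up):

```python
import pixeltable as pxt

tbl = pxt.get_table('my_dir.my_table')

# The scheme must be pxt://; anything else raises
# "Invalid Pixeltable URI (does not start with pxt://): ..."
pxt.publish_snapshot('pxt://my-org/my-snapshot', tbl)
```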
pixeltable/index/embedding_index.py CHANGED
@@ -99,10 +99,10 @@ class EmbeddingIndex(IndexBase):
         # Now validate the return types of the embedding functions.
 
         if self.string_embed is not None:
-            self._validate_embedding_fn(self.string_embed, ts.ColumnType.Type.STRING)
+            self._validate_embedding_fn(self.string_embed)
 
         if self.image_embed is not None:
-            self._validate_embedding_fn(self.image_embed, ts.ColumnType.Type.IMAGE)
+            self._validate_embedding_fn(self.image_embed)
 
         if c.col_type.is_string_type() and self.string_embed is None:
             raise excs.Error(f"Text embedding function is required for column {c.name} (parameter 'string_embed')")
@@ -206,21 +206,12 @@ class EmbeddingIndex(IndexBase):
         return None
 
     @classmethod
-    def _validate_embedding_fn(cls, embed_fn: func.Function, expected_type: ts.ColumnType.Type) -> None:
+    def _validate_embedding_fn(cls, embed_fn: func.Function) -> None:
         """Validate the given embedding function."""
         assert not embed_fn.is_polymorphic
-        sig = embed_fn.signature
 
-        # validate return type
-        param_name = sig.parameters_by_pos[0].name
-        if expected_type == ts.ColumnType.Type.STRING:
-            return_type = embed_fn.call_return_type([], {param_name: 'dummy'})
-        else:
-            assert expected_type == ts.ColumnType.Type.IMAGE
-            img = PIL.Image.new('RGB', (512, 512))
-            return_type = embed_fn.call_return_type([], {param_name: img})
+        return_type = embed_fn.signature.return_type
 
-        assert return_type is not None
         if not isinstance(return_type, ts.ArrayType):
             raise excs.Error(
                 f'The function `{embed_fn.name}` is not a valid embedding: '
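Validation now reads the declared `signature.return_type` instead of probing the function with a dummy string or image, so an embedding UDF needs an explicit array return annotation. A sketch of a UDF that should pass the new check (the shape is illustrative; a real index would wrap an actual embedding model):

```python
import numpy as np
import pixeltable as pxt

@pxt.udf
def toy_embed(text: str) -> pxt.Array[(8,), pxt.Float]:
    # Deterministic stand-in for a real embedding model: the declared
    # ArrayType return annotation is what _validate_embedding_fn inspects.
    rng = np.random.default_rng(abs(hash(text)) % (2**32))
    return rng.random(8, dtype=np.float32)
```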
pixeltable/io/globals.py CHANGED
@@ -1,3 +1,7 @@
+import json
+import urllib.parse
+import urllib.request
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Literal, Optional, Union
 
 import pixeltable as pxt
@@ -5,11 +9,61 @@ import pixeltable.exceptions as excs
 from pixeltable import Table, exprs
 from pixeltable.env import Env
 from pixeltable.io.external_store import SyncStatus
+from pixeltable.utils import parse_local_file_path
 
 if TYPE_CHECKING:
     import fiftyone as fo  # type: ignore[import-untyped]
 
 
+from .utils import find_or_create_table, normalize_import_parameters, normalize_schema_names
+
+
+def _infer_schema_from_rows(
+    rows: list[dict[str, Any]], schema_overrides: dict[str, Any], primary_key: list[str]
+) -> dict[str, pxt.ColumnType]:
+    schema: dict[str, pxt.ColumnType] = {}
+    cols_with_nones: set[str] = set()
+
+    for n, row in enumerate(rows):
+        for col_name, value in row.items():
+            if col_name in schema_overrides:
+                # We do the insertion here; this will ensure that the column order matches the order
+                # in which the column names are encountered in the input data, even if `schema_overrides`
+                # is specified.
+                if col_name not in schema:
+                    schema[col_name] = schema_overrides[col_name]
+            elif value is not None:
+                # If `key` is not in `schema_overrides`, then we infer its type from the data.
+                # The column type will always be nullable by default.
+                col_type = pxt.ColumnType.infer_literal_type(value, nullable=col_name not in primary_key)
+                if col_type is None:
+                    raise excs.Error(
+                        f'Could not infer type for column `{col_name}`; the value in row {n} has an unsupported type: {type(value)}'
+                    )
+                if col_name not in schema:
+                    schema[col_name] = col_type
+                else:
+                    supertype = schema[col_name].supertype(col_type)
+                    if supertype is None:
+                        raise excs.Error(
+                            f'Could not infer type of column `{col_name}`; the value in row {n} does not match preceding type {schema[col_name]}: {value!r}\n'
+                            'Consider specifying the type explicitly in `schema_overrides`.'
+                        )
+                    schema[col_name] = supertype
+            else:
+                cols_with_nones.add(col_name)
+
+    entirely_none_cols = cols_with_nones - schema.keys()
+    if len(entirely_none_cols) > 0:
+        # A column can only end up in `entirely_none_cols` if it was not in `schema_overrides` and
+        # was not encountered in any row with a non-None value.
+        raise excs.Error(
+            f'The following columns have no non-null values: {", ".join(entirely_none_cols)}\n'
+            'Consider specifying the type(s) explicitly in `schema_overrides`.'
+        )
+    return schema
+
+
 def create_label_studio_project(
     t: Table,
     label_config: str,
@@ -140,7 +194,7 @@ def import_rows(
     tbl_path: str,
     rows: list[dict[str, Any]],
     *,
-    schema_overrides: Optional[dict[str, pxt.ColumnType]] = None,
+    schema_overrides: Optional[dict[str, Any]] = None,
     primary_key: Optional[Union[str, list[str]]] = None,
     num_retained_versions: int = 10,
     comment: str = '',
@@ -169,67 +223,22 @@ def import_rows(
     Returns:
         A handle to the newly created [`Table`][pixeltable.Table].
     """
-    if schema_overrides is None:
-        schema_overrides = {}
-    schema: dict[str, pxt.ColumnType] = {}
-    cols_with_nones: set[str] = set()
+    schema_overrides, primary_key = normalize_import_parameters(schema_overrides, primary_key)
+    row_schema = _infer_schema_from_rows(rows, schema_overrides, primary_key)
+    schema, pxt_pk, _ = normalize_schema_names(row_schema, primary_key, schema_overrides, True)
 
-    for n, row in enumerate(rows):
-        for col_name, value in row.items():
-            if col_name in schema_overrides:
-                # We do the insertion here; this will ensure that the column order matches the order
-                # in which the column names are encountered in the input data, even if `schema_overrides`
-                # is specified.
-                if col_name not in schema:
-                    schema[col_name] = schema_overrides[col_name]
-            elif value is not None:
-                # If `key` is not in `schema_overrides`, then we infer its type from the data.
-                # The column type will always be nullable by default.
-                col_type = pxt.ColumnType.infer_literal_type(value, nullable=True)
-                if col_type is None:
-                    raise excs.Error(
-                        f'Could not infer type for column `{col_name}`; the value in row {n} has an unsupported type: {type(value)}'
-                    )
-                if col_name not in schema:
-                    schema[col_name] = col_type
-                else:
-                    supertype = schema[col_name].supertype(col_type)
-                    if supertype is None:
-                        raise excs.Error(
-                            f'Could not infer type of column `{col_name}`; the value in row {n} does not match preceding type {schema[col_name]}: {value!r}\n'
-                            'Consider specifying the type explicitly in `schema_overrides`.'
-                        )
-                    schema[col_name] = supertype
-            else:
-                cols_with_nones.add(col_name)
-
-    extraneous_keys = schema_overrides.keys() - schema.keys()
-    if len(extraneous_keys) > 0:
-        raise excs.Error(
-            f'The following columns specified in `schema_overrides` are not present in the data: {", ".join(extraneous_keys)}'
-        )
-
-    entirely_none_cols = cols_with_nones - schema.keys()
-    if len(entirely_none_cols) > 0:
-        # A column can only end up in `entirely_null_cols` if it was not in `schema_overrides` and
-        # was not encountered in any row with a non-None value.
-        raise excs.Error(
-            f'The following columns have no non-null values: {", ".join(entirely_none_cols)}\n'
-            'Consider specifying the type(s) explicitly in `schema_overrides`.'
-        )
-
-    t = pxt.create_table(
-        tbl_path, schema, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment
+    table = find_or_create_table(
+        tbl_path, schema, primary_key=pxt_pk, num_retained_versions=num_retained_versions, comment=comment
     )
-    t.insert(rows)
-    return t
+    table.insert(rows)
+    return table
 
 
 def import_json(
     tbl_path: str,
     filepath_or_url: str,
     *,
-    schema_overrides: Optional[dict[str, pxt.ColumnType]] = None,
+    schema_overrides: Optional[dict[str, Any]] = None,
    primary_key: Optional[Union[str, list[str]]] = None,
    num_retained_versions: int = 10,
    comment: str = '',
@@ -253,33 +262,35 @@ def import_json(
     Returns:
         A handle to the newly created [`Table`][pixeltable.Table].
     """
-    import json
-    import urllib.parse
-    import urllib.request
-
-    # TODO Consolidate this logic with other places where files/URLs are parsed
-    parsed = urllib.parse.urlparse(filepath_or_url)
-    if len(parsed.scheme) <= 1 or parsed.scheme == 'file':
-        # local file path
-        if len(parsed.scheme) <= 1:
-            filepath = filepath_or_url
-        else:
-            filepath = urllib.parse.unquote(urllib.request.url2pathname(parsed.path))
-        with open(filepath) as fp:
+    path = parse_local_file_path(filepath_or_url)
+    if path is None:  # it's a URL
+        # TODO: This should read from S3 as well.
+        contents = urllib.request.urlopen(filepath_or_url).read()
+    else:
+        with open(path) as fp:
             contents = fp.read()
+
+    rows = json.loads(contents, **kwargs)
+
+    schema_overrides, primary_key = normalize_import_parameters(schema_overrides, primary_key)
+    row_schema = _infer_schema_from_rows(rows, schema_overrides, primary_key)
+    schema, pxt_pk, col_mapping = normalize_schema_names(row_schema, primary_key, schema_overrides, False)
+
+    # Convert all rows to insertable format - not needed, misnamed columns and types are errors in the incoming row format
+    if col_mapping is not None:
+        tbl_rows = [
+            {field if col_mapping is None else col_mapping[field]: val for field, val in row.items()} for row in rows
+        ]
     else:
-        # URL
-        contents = urllib.request.urlopen(filepath_or_url).read()
-    data = json.loads(contents, **kwargs)
-    return import_rows(
-        tbl_path,
-        data,
-        schema_overrides=schema_overrides,
-        primary_key=primary_key,
-        num_retained_versions=num_retained_versions,
-        comment=comment,
+        tbl_rows = rows
+
+    table = find_or_create_table(
+        tbl_path, schema, primary_key=pxt_pk, num_retained_versions=num_retained_versions, comment=comment
     )
 
+    table.insert(tbl_rows)
+    return table
+
 
 def export_images_as_fo_dataset(
     tbl: pxt.Table,
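Taken together, the refactor gives `import_rows` and `import_json` the same inference rules: types come from the first non-None values and are widened to a common supertype across rows, primary-key columns are inferred as non-nullable, and a column that is None in every row is an error unless it appears in `schema_overrides`. A sketch of those rules in action (the table name is arbitrary, and `pixeltable.io.import_rows` is assumed to be the public entry point):

```python
import pixeltable as pxt
from pixeltable.io import import_rows

rows = [
    {'id': 1, 'score': 1,   'note': None},
    {'id': 2, 'score': 2.5, 'note': None},  # int then float -> Float supertype
]
t = import_rows(
    'demo_rows',
    rows,
    primary_key='id',                       # 'id' is inferred as non-nullable
    schema_overrides={'note': pxt.String},  # without this: "no non-null values" error
)
```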