pixeltable 0.2.24__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in a supported public registry. It is provided for informational purposes only.

Potentially problematic release.



Files changed (101)
  1. pixeltable/__init__.py +2 -2
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +1 -1
  4. pixeltable/catalog/dir.py +6 -0
  5. pixeltable/catalog/globals.py +25 -0
  6. pixeltable/catalog/named_function.py +4 -0
  7. pixeltable/catalog/path_dict.py +37 -11
  8. pixeltable/catalog/schema_object.py +6 -0
  9. pixeltable/catalog/table.py +531 -251
  10. pixeltable/catalog/table_version.py +22 -8
  11. pixeltable/catalog/view.py +8 -7
  12. pixeltable/dataframe.py +439 -105
  13. pixeltable/env.py +19 -5
  14. pixeltable/exec/__init__.py +1 -1
  15. pixeltable/exec/exec_node.py +6 -7
  16. pixeltable/exec/expr_eval_node.py +1 -1
  17. pixeltable/exec/sql_node.py +92 -45
  18. pixeltable/exprs/__init__.py +1 -0
  19. pixeltable/exprs/arithmetic_expr.py +1 -1
  20. pixeltable/exprs/array_slice.py +1 -1
  21. pixeltable/exprs/column_property_ref.py +1 -1
  22. pixeltable/exprs/column_ref.py +29 -2
  23. pixeltable/exprs/comparison.py +1 -1
  24. pixeltable/exprs/compound_predicate.py +1 -1
  25. pixeltable/exprs/expr.py +12 -5
  26. pixeltable/exprs/expr_set.py +8 -0
  27. pixeltable/exprs/function_call.py +147 -39
  28. pixeltable/exprs/in_predicate.py +1 -1
  29. pixeltable/exprs/inline_expr.py +25 -5
  30. pixeltable/exprs/is_null.py +1 -1
  31. pixeltable/exprs/json_mapper.py +1 -1
  32. pixeltable/exprs/json_path.py +1 -1
  33. pixeltable/exprs/method_ref.py +1 -1
  34. pixeltable/exprs/row_builder.py +1 -1
  35. pixeltable/exprs/rowid_ref.py +1 -1
  36. pixeltable/exprs/similarity_expr.py +17 -7
  37. pixeltable/exprs/sql_element_cache.py +4 -0
  38. pixeltable/exprs/type_cast.py +2 -2
  39. pixeltable/exprs/variable.py +3 -0
  40. pixeltable/func/__init__.py +5 -4
  41. pixeltable/func/aggregate_function.py +151 -68
  42. pixeltable/func/callable_function.py +48 -16
  43. pixeltable/func/expr_template_function.py +64 -23
  44. pixeltable/func/function.py +227 -23
  45. pixeltable/func/function_registry.py +2 -1
  46. pixeltable/func/query_template_function.py +51 -9
  47. pixeltable/func/signature.py +65 -7
  48. pixeltable/func/tools.py +153 -0
  49. pixeltable/func/udf.py +57 -35
  50. pixeltable/functions/__init__.py +2 -2
  51. pixeltable/functions/anthropic.py +51 -4
  52. pixeltable/functions/gemini.py +85 -0
  53. pixeltable/functions/globals.py +54 -34
  54. pixeltable/functions/huggingface.py +10 -28
  55. pixeltable/functions/json.py +3 -8
  56. pixeltable/functions/math.py +67 -0
  57. pixeltable/functions/mistralai.py +0 -2
  58. pixeltable/functions/ollama.py +8 -8
  59. pixeltable/functions/openai.py +51 -4
  60. pixeltable/functions/timestamp.py +1 -1
  61. pixeltable/functions/video.py +3 -9
  62. pixeltable/functions/vision.py +1 -1
  63. pixeltable/globals.py +374 -89
  64. pixeltable/index/embedding_index.py +106 -29
  65. pixeltable/io/__init__.py +1 -1
  66. pixeltable/io/label_studio.py +1 -1
  67. pixeltable/io/parquet.py +39 -19
  68. pixeltable/iterators/__init__.py +1 -0
  69. pixeltable/iterators/document.py +12 -0
  70. pixeltable/iterators/image.py +100 -0
  71. pixeltable/iterators/video.py +7 -8
  72. pixeltable/metadata/__init__.py +1 -1
  73. pixeltable/metadata/converters/convert_16.py +2 -1
  74. pixeltable/metadata/converters/convert_17.py +2 -1
  75. pixeltable/metadata/converters/convert_22.py +17 -0
  76. pixeltable/metadata/converters/convert_23.py +35 -0
  77. pixeltable/metadata/converters/convert_24.py +56 -0
  78. pixeltable/metadata/converters/convert_25.py +19 -0
  79. pixeltable/metadata/converters/util.py +4 -2
  80. pixeltable/metadata/notes.py +4 -0
  81. pixeltable/metadata/schema.py +1 -0
  82. pixeltable/plan.py +129 -51
  83. pixeltable/store.py +1 -1
  84. pixeltable/type_system.py +196 -54
  85. pixeltable/utils/arrow.py +8 -3
  86. pixeltable/utils/description_helper.py +89 -0
  87. pixeltable/utils/documents.py +14 -0
  88. {pixeltable-0.2.24.dist-info → pixeltable-0.3.0.dist-info}/METADATA +32 -22
  89. pixeltable-0.3.0.dist-info/RECORD +155 -0
  90. {pixeltable-0.2.24.dist-info → pixeltable-0.3.0.dist-info}/WHEEL +1 -1
  91. pixeltable-0.3.0.dist-info/entry_points.txt +3 -0
  92. pixeltable/tool/create_test_db_dump.py +0 -308
  93. pixeltable/tool/create_test_video.py +0 -81
  94. pixeltable/tool/doc_plugins/griffe.py +0 -50
  95. pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
  96. pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
  97. pixeltable/tool/embed_udf.py +0 -9
  98. pixeltable/tool/mypy_plugin.py +0 -55
  99. pixeltable-0.2.24.dist-info/RECORD +0 -153
  100. pixeltable-0.2.24.dist-info/entry_points.txt +0 -3
  101. {pixeltable-0.2.24.dist-info → pixeltable-0.3.0.dist-info}/LICENSE +0 -0
pixeltable/type_system.py CHANGED
@@ -5,7 +5,6 @@ import datetime
 import enum
 import io
 import json
-import types
 import typing
 import urllib.parse
 import urllib.request
@@ -14,7 +13,11 @@ from typing import Any, Iterable, Mapping, Optional, Sequence, Union
 
 import PIL.Image
 import av # type: ignore
+import jsonschema
+import jsonschema.protocols
+import jsonschema.validators
 import numpy as np
+import pydantic
 import sqlalchemy as sql
 from typing import _GenericAlias # type: ignore[attr-defined]
 from typing_extensions import _AnnotatedAlias
@@ -166,7 +169,7 @@ class ColumnType:
         if t == cls.Type.DOCUMENT:
             return DocumentType()
 
-    def __str__(self) -> str:
+    def __repr__(self) -> str:
         return self._to_str(as_schema=False)
 
     def _to_str(self, as_schema: bool) -> str:
@@ -244,7 +247,7 @@ class ColumnType:
         if col_type is not None:
             return col_type
         # this could still be json-serializable
-        if isinstance(val, dict) or isinstance(val, list) or isinstance(val, np.ndarray):
+        if isinstance(val, dict) or isinstance(val, list) or isinstance(val, np.ndarray) or isinstance(val, pydantic.BaseModel):
             try:
                 JsonType().validate_literal(val)
                 return JsonType(nullable=nullable)
@@ -337,7 +340,7 @@ class ColumnType:
             return TimestampType(nullable=nullable_default)
         if t is PIL.Image.Image:
             return ImageType(nullable=nullable_default)
-        if issubclass(t, Sequence) or issubclass(t, Mapping):
+        if issubclass(t, Sequence) or issubclass(t, Mapping) or issubclass(t, pydantic.BaseModel):
            return JsonType(nullable=nullable_default)
        return None
 
@@ -479,6 +482,20 @@ class ColumnType:
         """
         pass
 
+    def to_json_schema(self) -> dict[str, Any]:
+        if self.nullable:
+            return {
+                'anyOf': [
+                    self._to_json_schema(),
+                    {'type': 'null'},
+                ]
+            }
+        else:
+            return self._to_json_schema()
+
+    def _to_json_schema(self) -> dict[str, Any]:
+        raise excs.Error(f'Pixeltable type {self} is not a valid JSON type')
+
 
 
 class InvalidType(ColumnType):
@@ -501,6 +518,9 @@ class StringType(ColumnType):
     def to_sa_type(self) -> sql.types.TypeEngine:
         return sql.String()
 
+    def _to_json_schema(self) -> dict[str, Any]:
+        return {'type': 'string'}
+
     def print_value(self, val: Any) -> str:
         return f"'{val}'"
 
@@ -524,8 +544,13 @@ class IntType(ColumnType):
     def to_sa_type(self) -> sql.types.TypeEngine:
         return sql.BigInteger()
 
+    def _to_json_schema(self) -> dict[str, Any]:
+        return {'type': 'integer'}
+
     def _validate_literal(self, val: Any) -> None:
-        if not isinstance(val, int):
+        # bool is a subclass of int, so we need to check for it
+        # explicitly first
+        if isinstance(val, bool) or not isinstance(val, int):
             raise TypeError(f'Expected int, got {val.__class__.__name__}')
 
 
@@ -536,6 +561,9 @@ class FloatType(ColumnType):
     def to_sa_type(self) -> sql.types.TypeEngine:
         return sql.Float()
 
+    def _to_json_schema(self) -> dict[str, Any]:
+        return {'type': 'number'}
+
     def _validate_literal(self, val: Any) -> None:
         if not isinstance(val, float):
             raise TypeError(f'Expected float, got {val.__class__.__name__}')
@@ -553,6 +581,9 @@ class BoolType(ColumnType):
     def to_sa_type(self) -> sql.types.TypeEngine:
         return sql.Boolean()
 
+    def _to_json_schema(self) -> dict[str, Any]:
+        return {'type': 'boolean'}
+
     def _validate_literal(self, val: Any) -> None:
         if not isinstance(val, bool):
             raise TypeError(f'Expected bool, got {val.__class__.__name__}')
@@ -581,61 +612,44 @@ class TimestampType(ColumnType):
 
 
 class JsonType(ColumnType):
-    # TODO: type_spec also needs to be able to express lists
-    def __init__(self, type_spec: Optional[dict[str, ColumnType]] = None, nullable: bool = False):
+
+    json_schema: Optional[dict[str, Any]]
+    __validator: Optional[jsonschema.protocols.Validator]
+
+    def __init__(self, json_schema: Optional[dict[str, Any]] = None, nullable: bool = False):
         super().__init__(self.Type.JSON, nullable=nullable)
-        self.type_spec = type_spec
+        self.json_schema = json_schema
+        if json_schema is None:
+            self.__validator = None
+        else:
+            validator_cls = jsonschema.validators.validator_for(json_schema)
+            validator_cls.check_schema(json_schema)
+            self.__validator = validator_cls(json_schema)
 
     def copy(self, nullable: bool) -> ColumnType:
-        return JsonType(self.type_spec, nullable=nullable)
+        return JsonType(json_schema=self.json_schema, nullable=nullable)
 
     def matches(self, other: ColumnType) -> bool:
-        return isinstance(other, JsonType) and self.type_spec == other.type_spec
-
-    def supertype(self, other: ColumnType) -> Optional[JsonType]:
-        if not isinstance(other, JsonType):
-            return None
-        if self.type_spec is None:
-            # we don't have a type spec and can accept anything accepted by other
-            return JsonType(nullable=(self.nullable or other.nullable))
-        if other.type_spec is None:
-            # we have a type spec but other doesn't
-            return JsonType(nullable=(self.nullable or other.nullable))
-
-        # we both have type specs; the supertype's type spec is the union of the two
-        type_spec: dict[str, ColumnType] = {}
-        type_spec.update(self.type_spec)
-        for other_field_name, other_field_type in other.type_spec.items():
-            if other_field_name not in type_spec:
-                type_spec[other_field_name] = other_field_type
-            else:
-                # both type specs have this field
-                field_type = type_spec[other_field_name].supertype(other_field_type)
-                if field_type is None:
-                    # conflicting types
-                    return JsonType(nullable=(self.nullable or other.nullable))
-                type_spec[other_field_name] = field_type
-        return JsonType(type_spec, nullable=(self.nullable or other.nullable))
+        return isinstance(other, JsonType) and self.json_schema == other.json_schema
 
     def _as_dict(self) -> dict:
         result = super()._as_dict()
-        if self.type_spec is not None:
-            type_spec_dict = {field_name: field_type.serialize() for field_name, field_type in self.type_spec.items()}
-            result.update({'type_spec': type_spec_dict})
+        if self.json_schema is not None:
+            result.update({'json_schema': self.json_schema})
         return result
 
     @classmethod
     def _from_dict(cls, d: dict) -> ColumnType:
-        type_spec = None
-        if 'type_spec' in d:
-            type_spec = {
-                field_name: cls.deserialize(field_type_dict) for field_name, field_type_dict in d['type_spec'].items()
-            }
-        return cls(type_spec, nullable=d['nullable'])
+        return cls(json_schema=d.get('json_schema'), nullable=d['nullable'])
 
     def to_sa_type(self) -> sql.types.TypeEngine:
         return sql.dialects.postgresql.JSONB()
 
+    def _to_json_schema(self) -> dict[str, Any]:
+        if self.json_schema is None:
+            return {}
+        return self.json_schema
+
     def print_value(self, val: Any) -> str:
         val_type = self.infer_literal_type(val)
         if val_type is None:
@@ -645,27 +659,138 @@ class JsonType(ColumnType):
         return val_type.print_value(val)
 
     def _validate_literal(self, val: Any) -> None:
-        if not isinstance(val, dict) and not isinstance(val, list):
-            # TODO In the future we should accept scalars too, which would enable us to remove this top-level check
-            raise TypeError(f'Expected dict or list, got {val.__class__.__name__}')
-        if not self.__is_valid_literal(val):
+        if not self.__is_valid_json(val):
             raise TypeError(f'That literal is not a valid Pixeltable JSON object: {val}')
+        if self.__validator is not None:
+            self.__validator.validate(val)
 
     @classmethod
-    def __is_valid_literal(cls, val: Any) -> bool:
+    def __is_valid_json(cls, val: Any) -> bool:
         if val is None or isinstance(val, (str, int, float, bool)):
             return True
         if isinstance(val, (list, tuple)):
-            return all(cls.__is_valid_literal(v) for v in val)
+            return all(cls.__is_valid_json(v) for v in val)
         if isinstance(val, dict):
-            return all(isinstance(k, str) and cls.__is_valid_literal(v) for k, v in val.items())
+            return all(isinstance(k, str) and cls.__is_valid_json(v) for k, v in val.items())
         return False
 
     def _create_literal(self, val: Any) -> Any:
         if isinstance(val, tuple):
             val = list(val)
+        if isinstance(val, pydantic.BaseModel):
+            return val.model_dump()
         return val
 
+    def supertype(self, other: ColumnType) -> Optional[JsonType]:
+        # Try using the (much faster) supertype logic in ColumnType first. That will work if, for example, the types
+        # are identical except for nullability. If that doesn't work and both types are JsonType, then we will need to
+        # merge their schemas.
+        basic_supertype = super().supertype(other)
+        if basic_supertype is not None:
+            assert isinstance(basic_supertype, JsonType)
+            return basic_supertype
+
+        if not isinstance(other, JsonType):
+            return None
+
+        if self.json_schema is None or other.json_schema is None:
+            return JsonType(nullable=(self.nullable or other.nullable))
+
+        superschema = self.__superschema(self.json_schema, other.json_schema)
+
+        return JsonType(
+            json_schema=(None if len(superschema) == 0 else superschema),
+            nullable=(self.nullable or other.nullable)
+        )
+
+    @classmethod
+    def __superschema(cls, a: dict[str, Any], b: dict[str, Any]) -> Optional[dict[str, Any]]:
+        # Defining a general type hierarchy over all JSON schemas would be a challenging problem. In order to keep
+        # things manageable, we only define a hierarchy among "conforming" schemas, which provides enough generality
+        # for the most important use cases (unions for type inference, validation of inline exprs). A schema is
+        # considered to be conforming if either:
+        # (i) it is a scalar (string, integer, number, boolean) or dictionary (object) type; or
+        # (ii) it is an "anyOf" schema of one of the above types and the exact schema {'type': 'null'}.
+        # Conforming schemas are organized into a type hierarchy in an internally consistent way. Nonconforming
+        # schemas are allowed, but they are isolates in the type hierarchy: a nonconforming schema has no proper
+        # subtypes, and its only proper supertype is an unconstrained JsonType().
+        #
+        # There is some subtlety in the handling of nullable fields. Nullable fields are represented in JSON
+        # schemas as (for example) {'anyOf': [{'type': 'string'}, {'type': 'null'}]}. When finding the supertype
+        # of schemas that might be nullable, we first unpack the 'anyOf's, find the supertype of the underlyings,
+        # then reapply the 'anyOf' if appropriate. The top-level schema (i.e., JsonType.json_schema) is presumed
+        # to NOT be in this form (since nullability is indicated by the `nullable` field of the JsonType object),
+        # so this subtlety is applicable only to types that occur in subfields.
+        #
+        # There is currently no special handling of lists; distinct schemas with type 'array' will union to the
+        # generic {'type': 'array'} schema. This could be a TODO item if there is a need for it in the future.
+
+        if a == b:
+            return a
+
+        if 'properties' in a and 'properties' in b:
+            a_props = a['properties']
+            b_props = b['properties']
+            a_req = a.get('required', [])
+            b_req = b.get('required', [])
+            super_props = {}
+            super_req = []
+            for key, a_prop_schema in a_props.items():
+                if key in b_props: # in both a and b
+                    prop_schema = cls.__superschema_with_nulls(a_prop_schema, b_props[key])
+                    super_props[key] = prop_schema
+                    if key in a_req and key in b_req:
+                        super_req.append(key)
+                else: # in a but not b
+                    # Add it to the supertype schema as optional (regardless of its status in a)
+                    super_props[key] = a_prop_schema
+            for key, b_prop_schema in b_props.items():
+                if key not in a_props: # in b but not a
+                    super_props[key] = b_prop_schema
+            schema = {'type': 'object', 'properties': super_props}
+            if len(super_req) > 0:
+                schema['required'] = super_req
+            return schema
+
+        a_type = a.get('type')
+        b_type = b.get('type')
+
+        if (a_type in ('string', 'integer', 'number', 'boolean', 'object', 'array') and a_type == b_type):
+            # a and b both have the same type designation, but are not identical. This can happen if
+            # (for example) they have validators or other attributes that differ. In this case, we
+            # generalize to {'type': t}, where t is their shared type, with no other qualifications.
+            return {'type': a_type}
+
+        return {} # Unresolvable type conflict; the supertype is an unrestricted JsonType.
+
+    @classmethod
+    def __superschema_with_nulls(cls, a: dict[str, Any], b: dict[str, Any]) -> Optional[dict[str, Any]]:
+        a, a_nullable = cls.__unpack_null_from_schema(a)
+        b, b_nullable = cls.__unpack_null_from_schema(b)
+
+        result = cls.__superschema(a, b)
+        if len(result) > 0 and (a_nullable or b_nullable):
+            # if len(result) == 0, then null is implicitly accepted; otherwise, we need to explicitly allow it
+            return {'anyOf': [result, {'type': 'null'}]}
+        return result
+
+    @classmethod
+    def __unpack_null_from_schema(cls, s: dict[str, Any]) -> tuple[dict[str, Any], bool]:
+        if 'anyOf' in s and len(s['anyOf']) == 2 and {'type': 'null'} in s['anyOf']:
+            try:
+                return next(s for s in s['anyOf'] if s != {'type': 'null'}), True
+            except StopIteration:
+                pass
        return s, False
+
+    def _to_base_str(self) -> str:
+        if self.json_schema is None:
+            return 'Json'
+        elif 'title' in self.json_schema:
+            return f'Json[{self.json_schema["title"]}]'
+        else:
+            return f'Json[{self.json_schema}]'
+
 
 
 class ArrayType(ColumnType):
     def __init__(self, shape: tuple[Union[int, None], ...], dtype: ColumnType, nullable: bool = False):
@@ -743,6 +868,12 @@ class ArrayType(ColumnType):
             return False
         return val.dtype == self.numpy_dtype()
 
+    def _to_json_schema(self) -> dict[str, Any]:
+        return {
+            'type': 'array',
+            'items': self.pxt_dtype._to_json_schema(),
+        }
+
     def _validate_literal(self, val: Any) -> None:
         if not isinstance(val, np.ndarray):
             raise TypeError(f'Expected numpy.ndarray, got {val.__class__.__name__}')
@@ -752,7 +883,7 @@ class ArrayType(ColumnType):
                 f'got ndarray({val.shape}, dtype={val.dtype})'))
 
     def _create_literal(self, val: Any) -> Any:
-        if isinstance(val, (list,tuple)):
+        if isinstance(val, (list, tuple)):
             # map python float to whichever numpy float is
             # declared for this type, rather than assume float64
             return np.array(val, dtype=self.numpy_dtype())
@@ -902,7 +1033,7 @@ class VideoType(ColumnType):
                 if num_decoded < 2:
                     # this is most likely an image file
                     raise excs.Error(f'Not a valid video: {val}')
-        except av.AVError:
+        except av.FFmpegError:
            raise excs.Error(f'Not a valid video: {val}') from None
 
 
@@ -929,7 +1060,7 @@ class AudioType(ColumnType):
                 for packet in container.demux(audio_stream):
                     for _ in packet.decode():
                         pass
-        except av.AVError as e:
+        except av.FFmpegError as e:
            raise excs.Error(f'Not a valid audio file: {val}\n{e}') from None
 
 
@@ -940,6 +1071,7 @@ class DocumentType(ColumnType):
         MD = 1
         PDF = 2
         XML = 3
+        TXT = 4
 
     def __init__(self, nullable: bool = False, doc_formats: Optional[str] = None):
         super().__init__(self.Type.DOCUMENT, nullable=nullable)
@@ -1016,6 +1148,16 @@ class _PxtType:
 
 
 class Json(_PxtType):
+    def __class_getitem__(cls, item: Any) -> _AnnotatedAlias:
+        """
+        `item` (the type subscript) must be a `dict` representing a valid JSON Schema.
+        """
+        if not isinstance(item, dict):
+            raise TypeError('Json type parameter must be a dict')
+
+        # The JsonType initializer will validate the JSON Schema.
+        return typing.Annotated[Any, JsonType(json_schema=item, nullable=False)]
+
     @classmethod
     def as_col_type(cls, nullable: bool) -> ColumnType:
         return JsonType(nullable=nullable)
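
The changes above replace JsonType's old type_spec with a full JSON Schema: Json[...] now takes a schema dict, literals are validated with jsonschema, scalar and array types gain _to_json_schema(), and supertype() merges two object schemas field by field. A minimal sketch of how this could be exercised, based only on the definitions shown in the diff (the exact public import surface may differ):

    from pixeltable.type_system import IntType, Json, JsonType

    address_schema = {
        'type': 'object',
        'properties': {'city': {'type': 'string'}, 'zip': {'type': 'string'}},
        'required': ['city'],
    }

    # Json[...] takes a JSON Schema dict and resolves to Annotated[Any, JsonType(json_schema=...)]
    AddressJson = Json[address_schema]

    # Literals are checked against the stored schema via jsonschema
    t = JsonType(json_schema=address_schema)
    t.validate_literal({'city': 'Berlin', 'zip': '10115'})  # passes
    # t.validate_literal({'zip': '10115'})                  # would raise: 'city' is required

    # Nullable types wrap their schema in an 'anyOf' with {'type': 'null'}
    assert IntType(nullable=True).to_json_schema() == {'anyOf': [{'type': 'integer'}, {'type': 'null'}]}

    # supertype() merges two object schemas; a field stays required only if both schemas require it
    other = JsonType(json_schema={
        'type': 'object',
        'properties': {'city': {'type': 'string'}, 'country': {'type': 'string'}},
        'required': ['city', 'country'],
    })
    merged = t.supertype(other)
    # Per the merge logic above, merged.json_schema should come out as:
    # {'type': 'object',
    #  'properties': {'city': {'type': 'string'}, 'zip': {'type': 'string'}, 'country': {'type': 'string'}},
    #  'required': ['city']}
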
pixeltable/utils/arrow.py CHANGED
@@ -3,14 +3,17 @@ from typing import Any, Iterator, Optional, Union
 
 import numpy as np
 import pyarrow as pa
+import datetime
 
 import pixeltable.type_system as ts
+from pixeltable.env import Env
+
+_tz_def = Env().get().default_time_zone
 
 _logger = logging.getLogger(__name__)
 
 _pa_to_pt: dict[pa.DataType, ts.ColumnType] = {
     pa.string(): ts.StringType(nullable=True),
-    pa.timestamp('us'): ts.TimestampType(nullable=True),
     pa.bool_(): ts.BoolType(nullable=True),
     pa.uint8(): ts.IntType(nullable=True),
     pa.int8(): ts.IntType(nullable=True),
@@ -23,7 +26,7 @@ _pa_to_pt: dict[pa.DataType, ts.ColumnType] = {
 
 _pt_to_pa: dict[type[ts.ColumnType], pa.DataType] = {
     ts.StringType: pa.string(),
-    ts.TimestampType: pa.timestamp('us'), # postgres timestamp is microseconds
+    ts.TimestampType: pa.timestamp('us', tz=datetime.timezone.utc), # postgres timestamp is microseconds
     ts.BoolType: pa.bool_(),
     ts.IntType: pa.int64(),
     ts.FloatType: pa.float32(),
@@ -39,7 +42,9 @@ def to_pixeltable_type(arrow_type: pa.DataType) -> Optional[ts.ColumnType]:
     """Convert a pyarrow DataType to a pixeltable ColumnType if one is defined.
     Returns None if no conversion is currently implemented.
     """
-    if arrow_type in _pa_to_pt:
+    if isinstance(arrow_type, pa.TimestampType):
+        return ts.TimestampType(nullable=True)
+    elif arrow_type in _pa_to_pt:
         return _pa_to_pt[arrow_type]
     elif isinstance(arrow_type, pa.FixedShapeTensorType):
         dtype = to_pixeltable_type(arrow_type.value_type)
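
With these hunks, any Arrow timestamp type (regardless of unit or time zone) now converts to a Pixeltable TimestampType, and Pixeltable timestamps are exported to Arrow as timezone-aware UTC microsecond timestamps. A small illustrative sketch, assuming the function shown in the diff is importable as written (note that importing the module now reads the default time zone via Env, so this presumes a working Pixeltable environment):

    import pyarrow as pa
    from pixeltable.utils.arrow import to_pixeltable_type

    to_pixeltable_type(pa.timestamp('ms', tz='UTC'))  # TimestampType (previously only pa.timestamp('us') mapped)
    to_pixeltable_type(pa.timestamp('us'))            # TimestampType
    to_pixeltable_type(pa.int64())                    # IntType
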
pixeltable/utils/description_helper.py ADDED
@@ -0,0 +1,89 @@
+import dataclasses
+from typing import Optional, Union
+
+import pandas as pd
+from pandas.io.formats.style import Styler
+
+
+@dataclasses.dataclass
+class _Descriptor:
+    body: Union[str, pd.DataFrame]
+    # The remaining fields only affect the behavior if `body` is a pd.DataFrame.
+    show_index: bool
+    show_header: bool
+    styler: Optional[Styler] = None
+
+
+class DescriptionHelper:
+    """
+    Helper class for rendering long-form descriptions of Pixeltable objects.
+
+    The output is specified as a list of "descriptors", each of which can be either a string or a Pandas DataFrame,
+    in any combination. The descriptors will be rendered in sequence. This is useful for long-form descriptions that
+    include tables with differing schemas or formatting, and/or a combination of tables and text.
+
+    DescriptionHelper can convert a list of descriptors into either HTML or plaintext and do something reasonable
+    in each case.
+    """
+    __descriptors: list[_Descriptor]
+
+    def __init__(self) -> None:
+        self.__descriptors = []
+
+    def append(
+        self,
+        descriptor: Union[str, pd.DataFrame],
+        show_index: bool = False,
+        show_header: bool = True,
+        styler: Optional[Styler] = None,
+    ) -> None:
+        self.__descriptors.append(_Descriptor(descriptor, show_index, show_header, styler))
+
+    def to_string(self) -> str:
+        blocks = [self.__render_text(descriptor) for descriptor in self.__descriptors]
+        return '\n\n'.join(blocks)
+
+    def to_html(self) -> str:
+        html_blocks = [self.__apply_styles(descriptor).to_html() for descriptor in self.__descriptors]
+        return '\n'.join(html_blocks)
+
+    @classmethod
+    def __render_text(cls, descriptor: _Descriptor) -> str:
+        if isinstance(descriptor.body, str):
+            return descriptor.body
+        else:
+            # If `show_index=False`, we get cleaner output (better intercolumn spacing) by setting the index to a
+            # list of empty strings than by setting `index=False` in the call to `df.to_string()`. It's pretty silly
+            # that `index=False` has side effects in Pandas that go beyond simply not displaying the index, but it
+            # is what it is.
+            df = descriptor.body
+            if not descriptor.show_index:
+                df = df.copy()
+                df.index = [''] * len(df) # type: ignore[assignment]
+            # max_colwidth=50 is the identical default that Pandas uses for a DataFrame's __repr__() output.
+            return df.to_string(header=descriptor.show_header, max_colwidth=50)
+
+    @classmethod
+    def __apply_styles(cls, descriptor: _Descriptor) -> Styler:
+        if isinstance(descriptor.body, str):
+            return (
+                # Render the string as a single-cell DataFrame. This will ensure a consistent style of output in
+                # cases where strings appear alongside DataFrames in the same DescriptionHelper.
+                pd.DataFrame([descriptor.body]).style
+                .set_properties(None, **{'white-space': 'pre-wrap', 'text-align': 'left', 'font-weight': 'bold'})
+                .hide(axis='index').hide(axis='columns')
+            )
+        else:
+            styler = descriptor.styler
+            if styler is None:
+                styler = descriptor.body.style
+            styler = (
+                styler
+                .set_properties(None, **{'white-space': 'pre-wrap', 'text-align': 'left'})
+                .set_table_styles([dict(selector='th', props=[('text-align', 'left')])])
+            )
+            if not descriptor.show_header:
+                styler = styler.hide(axis='columns')
+            if not descriptor.show_index:
+                styler = styler.hide(axis='index')
+            return styler
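
DescriptionHelper collects a sequence of strings and DataFrames and renders them as either plaintext or HTML. A brief usage sketch based only on the class as shown above (the module path is taken from the file list; the example data is illustrative):

    import pandas as pd
    from pixeltable.utils.description_helper import DescriptionHelper

    helper = DescriptionHelper()
    helper.append('my_table (snapshot)')  # rendered as a bold, left-aligned text block in HTML output
    helper.append(pd.DataFrame({'Column': ['id', 'img'], 'Type': ['Int', 'Image']}))
    print(helper.to_string())  # plaintext: blocks separated by blank lines
    html = helper.to_html()    # HTML: one styled table per block
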
pixeltable/utils/documents.py CHANGED
@@ -15,6 +15,7 @@ class DocumentHandle:
     bs_doc: Optional[bs4.BeautifulSoup] = None
     md_ast: Optional[dict] = None
     pdf_doc: Optional[fitz.Document] = None
+    txt_doc: Optional[str] = None
 
 
 def get_document_handle(path: str) -> Optional[DocumentHandle]:
@@ -40,6 +41,11 @@ def get_document_handle(path: str) -> Optional[DocumentHandle]:
         if bs_doc is not None:
             return DocumentHandle(format=ts.DocumentType.DocumentFormat.XML, bs_doc=bs_doc)
 
+    if doc_format == '.txt':
+        txt_doc = get_txt(path)
+        if txt_doc is not None:
+            return DocumentHandle(format=ts.DocumentType.DocumentFormat.TXT, txt_doc=txt_doc)
+
     return None
 
 
@@ -84,3 +90,11 @@ def get_markdown_handle(path: str) -> Optional[dict]:
         return md_ast(text)
     except Exception:
         return None
+
+def get_txt(path: str) -> Optional[str]:
+    try:
+        with open(path, "r") as f:
+            doc = f.read()
+            return doc if doc != '' else None
+    except Exception:
+        return None
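
Together with the new DocumentFormat.TXT member in type_system.py, these hunks let plain-text files be treated as documents. A hedged sketch of the resulting behavior, using only the helpers shown above (the file path is illustrative):

    import pixeltable.type_system as ts
    from pixeltable.utils.documents import get_document_handle

    handle = get_document_handle('/tmp/notes.txt')  # hypothetical .txt file
    if handle is not None:
        assert handle.format == ts.DocumentType.DocumentFormat.TXT
        print(handle.txt_doc[:80])  # raw file contents; empty or unreadable files yield no handle
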