pixeltable 0.2.24__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +2 -2
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +1 -1
- pixeltable/catalog/dir.py +6 -0
- pixeltable/catalog/globals.py +25 -0
- pixeltable/catalog/named_function.py +4 -0
- pixeltable/catalog/path_dict.py +37 -11
- pixeltable/catalog/schema_object.py +6 -0
- pixeltable/catalog/table.py +531 -251
- pixeltable/catalog/table_version.py +22 -8
- pixeltable/catalog/view.py +8 -7
- pixeltable/dataframe.py +439 -105
- pixeltable/env.py +19 -5
- pixeltable/exec/__init__.py +1 -1
- pixeltable/exec/exec_node.py +6 -7
- pixeltable/exec/expr_eval_node.py +1 -1
- pixeltable/exec/sql_node.py +92 -45
- pixeltable/exprs/__init__.py +1 -0
- pixeltable/exprs/arithmetic_expr.py +1 -1
- pixeltable/exprs/array_slice.py +1 -1
- pixeltable/exprs/column_property_ref.py +1 -1
- pixeltable/exprs/column_ref.py +29 -2
- pixeltable/exprs/comparison.py +1 -1
- pixeltable/exprs/compound_predicate.py +1 -1
- pixeltable/exprs/expr.py +12 -5
- pixeltable/exprs/expr_set.py +8 -0
- pixeltable/exprs/function_call.py +147 -39
- pixeltable/exprs/in_predicate.py +1 -1
- pixeltable/exprs/inline_expr.py +25 -5
- pixeltable/exprs/is_null.py +1 -1
- pixeltable/exprs/json_mapper.py +1 -1
- pixeltable/exprs/json_path.py +1 -1
- pixeltable/exprs/method_ref.py +1 -1
- pixeltable/exprs/row_builder.py +1 -1
- pixeltable/exprs/rowid_ref.py +1 -1
- pixeltable/exprs/similarity_expr.py +17 -7
- pixeltable/exprs/sql_element_cache.py +4 -0
- pixeltable/exprs/type_cast.py +2 -2
- pixeltable/exprs/variable.py +3 -0
- pixeltable/func/__init__.py +5 -4
- pixeltable/func/aggregate_function.py +151 -68
- pixeltable/func/callable_function.py +48 -16
- pixeltable/func/expr_template_function.py +64 -23
- pixeltable/func/function.py +227 -23
- pixeltable/func/function_registry.py +2 -1
- pixeltable/func/query_template_function.py +51 -9
- pixeltable/func/signature.py +65 -7
- pixeltable/func/tools.py +153 -0
- pixeltable/func/udf.py +57 -35
- pixeltable/functions/__init__.py +2 -2
- pixeltable/functions/anthropic.py +51 -4
- pixeltable/functions/gemini.py +85 -0
- pixeltable/functions/globals.py +54 -34
- pixeltable/functions/huggingface.py +10 -28
- pixeltable/functions/json.py +3 -8
- pixeltable/functions/math.py +67 -0
- pixeltable/functions/mistralai.py +0 -2
- pixeltable/functions/ollama.py +8 -8
- pixeltable/functions/openai.py +51 -4
- pixeltable/functions/timestamp.py +1 -1
- pixeltable/functions/video.py +3 -9
- pixeltable/functions/vision.py +1 -1
- pixeltable/globals.py +374 -89
- pixeltable/index/embedding_index.py +106 -29
- pixeltable/io/__init__.py +1 -1
- pixeltable/io/label_studio.py +1 -1
- pixeltable/io/parquet.py +39 -19
- pixeltable/iterators/__init__.py +1 -0
- pixeltable/iterators/document.py +12 -0
- pixeltable/iterators/image.py +100 -0
- pixeltable/iterators/video.py +7 -8
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_16.py +2 -1
- pixeltable/metadata/converters/convert_17.py +2 -1
- pixeltable/metadata/converters/convert_22.py +17 -0
- pixeltable/metadata/converters/convert_23.py +35 -0
- pixeltable/metadata/converters/convert_24.py +56 -0
- pixeltable/metadata/converters/convert_25.py +19 -0
- pixeltable/metadata/converters/util.py +4 -2
- pixeltable/metadata/notes.py +4 -0
- pixeltable/metadata/schema.py +1 -0
- pixeltable/plan.py +129 -51
- pixeltable/store.py +1 -1
- pixeltable/type_system.py +196 -54
- pixeltable/utils/arrow.py +8 -3
- pixeltable/utils/description_helper.py +89 -0
- pixeltable/utils/documents.py +14 -0
- {pixeltable-0.2.24.dist-info → pixeltable-0.3.0.dist-info}/METADATA +32 -22
- pixeltable-0.3.0.dist-info/RECORD +155 -0
- {pixeltable-0.2.24.dist-info → pixeltable-0.3.0.dist-info}/WHEEL +1 -1
- pixeltable-0.3.0.dist-info/entry_points.txt +3 -0
- pixeltable/tool/create_test_db_dump.py +0 -308
- pixeltable/tool/create_test_video.py +0 -81
- pixeltable/tool/doc_plugins/griffe.py +0 -50
- pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
- pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
- pixeltable/tool/embed_udf.py +0 -9
- pixeltable/tool/mypy_plugin.py +0 -55
- pixeltable-0.2.24.dist-info/RECORD +0 -153
- pixeltable-0.2.24.dist-info/entry_points.txt +0 -3
- {pixeltable-0.2.24.dist-info → pixeltable-0.3.0.dist-info}/LICENSE +0 -0
pixeltable/type_system.py
CHANGED
|
@@ -5,7 +5,6 @@ import datetime
|
|
|
5
5
|
import enum
|
|
6
6
|
import io
|
|
7
7
|
import json
|
|
8
|
-
import types
|
|
9
8
|
import typing
|
|
10
9
|
import urllib.parse
|
|
11
10
|
import urllib.request
|
|
@@ -14,7 +13,11 @@ from typing import Any, Iterable, Mapping, Optional, Sequence, Union
|
|
|
14
13
|
|
|
15
14
|
import PIL.Image
|
|
16
15
|
import av # type: ignore
|
|
16
|
+
import jsonschema
|
|
17
|
+
import jsonschema.protocols
|
|
18
|
+
import jsonschema.validators
|
|
17
19
|
import numpy as np
|
|
20
|
+
import pydantic
|
|
18
21
|
import sqlalchemy as sql
|
|
19
22
|
from typing import _GenericAlias # type: ignore[attr-defined]
|
|
20
23
|
from typing_extensions import _AnnotatedAlias
|
|
@@ -166,7 +169,7 @@ class ColumnType:
|
|
|
166
169
|
if t == cls.Type.DOCUMENT:
|
|
167
170
|
return DocumentType()
|
|
168
171
|
|
|
169
|
-
def
|
|
172
|
+
def __repr__(self) -> str:
|
|
170
173
|
return self._to_str(as_schema=False)
|
|
171
174
|
|
|
172
175
|
def _to_str(self, as_schema: bool) -> str:
|
|
@@ -244,7 +247,7 @@ class ColumnType:
|
|
|
244
247
|
if col_type is not None:
|
|
245
248
|
return col_type
|
|
246
249
|
# this could still be json-serializable
|
|
247
|
-
if isinstance(val, dict) or isinstance(val, list) or isinstance(val, np.ndarray):
|
|
250
|
+
if isinstance(val, dict) or isinstance(val, list) or isinstance(val, np.ndarray) or isinstance(val, pydantic.BaseModel):
|
|
248
251
|
try:
|
|
249
252
|
JsonType().validate_literal(val)
|
|
250
253
|
return JsonType(nullable=nullable)
|
|
@@ -337,7 +340,7 @@ class ColumnType:
|
|
|
337
340
|
return TimestampType(nullable=nullable_default)
|
|
338
341
|
if t is PIL.Image.Image:
|
|
339
342
|
return ImageType(nullable=nullable_default)
|
|
340
|
-
if issubclass(t, Sequence) or issubclass(t, Mapping):
|
|
343
|
+
if issubclass(t, Sequence) or issubclass(t, Mapping) or issubclass(t, pydantic.BaseModel):
|
|
341
344
|
return JsonType(nullable=nullable_default)
|
|
342
345
|
return None
|
|
343
346
|
|
|
@@ -479,6 +482,20 @@ class ColumnType:
|
|
|
479
482
|
"""
|
|
480
483
|
pass
|
|
481
484
|
|
|
485
|
+
def to_json_schema(self) -> dict[str, Any]:
    """
    Return the JSON Schema for this type, taking nullability into account.

    The type-specific schema is obtained from `_to_json_schema()`. If this type is nullable, that schema
    is wrapped in an `anyOf` together with `{'type': 'null'}`; otherwise it is returned as is.
    """
    base_schema = self._to_json_schema()
    if not self.nullable:
        return base_schema
    return {'anyOf': [base_schema, {'type': 'null'}]}
|
|
495
|
+
|
|
496
|
+
def _to_json_schema(self) -> dict[str, Any]:
    """
    Return the JSON Schema for this type, ignoring nullability.

    This base implementation always raises `excs.Error`; it is the behavior for any type that does not
    provide its own `_to_json_schema()`.
    """
    message = f'Pixeltable type {self} is not a valid JSON type'
    raise excs.Error(message)
|
|
498
|
+
|
|
482
499
|
|
|
483
500
|
class InvalidType(ColumnType):
|
|
484
501
|
def __init__(self, nullable: bool = False):
|
|
@@ -501,6 +518,9 @@ class StringType(ColumnType):
|
|
|
501
518
|
def to_sa_type(self) -> sql.types.TypeEngine:
|
|
502
519
|
return sql.String()
|
|
503
520
|
|
|
521
|
+
def _to_json_schema(self) -> dict[str, Any]:
    """Return the (non-nullable) JSON Schema for this type: a JSON string."""
    return {'type': 'string'}
|
|
523
|
+
|
|
504
524
|
def print_value(self, val: Any) -> str:
|
|
505
525
|
return f"'{val}'"
|
|
506
526
|
|
|
@@ -524,8 +544,13 @@ class IntType(ColumnType):
|
|
|
524
544
|
def to_sa_type(self) -> sql.types.TypeEngine:
|
|
525
545
|
return sql.BigInteger()
|
|
526
546
|
|
|
547
|
+
def _to_json_schema(self) -> dict[str, Any]:
    """Return the (non-nullable) JSON Schema for this type: a JSON integer."""
    return {'type': 'integer'}
|
|
549
|
+
|
|
527
550
|
def _validate_literal(self, val: Any) -> None:
|
|
528
|
-
|
|
551
|
+
# bool is a subclass of int, so we need to check for it
|
|
552
|
+
# explicitly first
|
|
553
|
+
if isinstance(val, bool) or not isinstance(val, int):
|
|
529
554
|
raise TypeError(f'Expected int, got {val.__class__.__name__}')
|
|
530
555
|
|
|
531
556
|
|
|
@@ -536,6 +561,9 @@ class FloatType(ColumnType):
|
|
|
536
561
|
def to_sa_type(self) -> sql.types.TypeEngine:
|
|
537
562
|
return sql.Float()
|
|
538
563
|
|
|
564
|
+
def _to_json_schema(self) -> dict[str, Any]:
    """Return the (non-nullable) JSON Schema for this type: a JSON number."""
    return {'type': 'number'}
|
|
566
|
+
|
|
539
567
|
def _validate_literal(self, val: Any) -> None:
|
|
540
568
|
if not isinstance(val, float):
|
|
541
569
|
raise TypeError(f'Expected float, got {val.__class__.__name__}')
|
|
@@ -553,6 +581,9 @@ class BoolType(ColumnType):
|
|
|
553
581
|
def to_sa_type(self) -> sql.types.TypeEngine:
|
|
554
582
|
return sql.Boolean()
|
|
555
583
|
|
|
584
|
+
def _to_json_schema(self) -> dict[str, Any]:
    """Return the (non-nullable) JSON Schema for this type: a JSON boolean."""
    return {'type': 'boolean'}
|
|
586
|
+
|
|
556
587
|
def _validate_literal(self, val: Any) -> None:
|
|
557
588
|
if not isinstance(val, bool):
|
|
558
589
|
raise TypeError(f'Expected bool, got {val.__class__.__name__}')
|
|
@@ -581,61 +612,44 @@ class TimestampType(ColumnType):
|
|
|
581
612
|
|
|
582
613
|
|
|
583
614
|
class JsonType(ColumnType):
|
|
584
|
-
|
|
585
|
-
|
|
615
|
+
|
|
616
|
+
json_schema: Optional[dict[str, Any]]
|
|
617
|
+
__validator: Optional[jsonschema.protocols.Validator]
|
|
618
|
+
|
|
619
|
+
def __init__(self, json_schema: Optional[dict[str, Any]] = None, nullable: bool = False):
|
|
586
620
|
super().__init__(self.Type.JSON, nullable=nullable)
|
|
587
|
-
self.
|
|
621
|
+
self.json_schema = json_schema
|
|
622
|
+
if json_schema is None:
|
|
623
|
+
self.__validator = None
|
|
624
|
+
else:
|
|
625
|
+
validator_cls = jsonschema.validators.validator_for(json_schema)
|
|
626
|
+
validator_cls.check_schema(json_schema)
|
|
627
|
+
self.__validator = validator_cls(json_schema)
|
|
588
628
|
|
|
589
629
|
def copy(self, nullable: bool) -> ColumnType:
|
|
590
|
-
return JsonType(self.
|
|
630
|
+
return JsonType(json_schema=self.json_schema, nullable=nullable)
|
|
591
631
|
|
|
592
632
|
def matches(self, other: ColumnType) -> bool:
|
|
593
|
-
return isinstance(other, JsonType) and self.
|
|
594
|
-
|
|
595
|
-
def supertype(self, other: ColumnType) -> Optional[JsonType]:
|
|
596
|
-
if not isinstance(other, JsonType):
|
|
597
|
-
return None
|
|
598
|
-
if self.type_spec is None:
|
|
599
|
-
# we don't have a type spec and can accept anything accepted by other
|
|
600
|
-
return JsonType(nullable=(self.nullable or other.nullable))
|
|
601
|
-
if other.type_spec is None:
|
|
602
|
-
# we have a type spec but other doesn't
|
|
603
|
-
return JsonType(nullable=(self.nullable or other.nullable))
|
|
604
|
-
|
|
605
|
-
# we both have type specs; the supertype's type spec is the union of the two
|
|
606
|
-
type_spec: dict[str, ColumnType] = {}
|
|
607
|
-
type_spec.update(self.type_spec)
|
|
608
|
-
for other_field_name, other_field_type in other.type_spec.items():
|
|
609
|
-
if other_field_name not in type_spec:
|
|
610
|
-
type_spec[other_field_name] = other_field_type
|
|
611
|
-
else:
|
|
612
|
-
# both type specs have this field
|
|
613
|
-
field_type = type_spec[other_field_name].supertype(other_field_type)
|
|
614
|
-
if field_type is None:
|
|
615
|
-
# conflicting types
|
|
616
|
-
return JsonType(nullable=(self.nullable or other.nullable))
|
|
617
|
-
type_spec[other_field_name] = field_type
|
|
618
|
-
return JsonType(type_spec, nullable=(self.nullable or other.nullable))
|
|
633
|
+
return isinstance(other, JsonType) and self.json_schema == other.json_schema
|
|
619
634
|
|
|
620
635
|
def _as_dict(self) -> dict:
|
|
621
636
|
result = super()._as_dict()
|
|
622
|
-
if self.
|
|
623
|
-
|
|
624
|
-
result.update({'type_spec': type_spec_dict})
|
|
637
|
+
if self.json_schema is not None:
|
|
638
|
+
result.update({'json_schema': self.json_schema})
|
|
625
639
|
return result
|
|
626
640
|
|
|
627
641
|
@classmethod
|
|
628
642
|
def _from_dict(cls, d: dict) -> ColumnType:
|
|
629
|
-
|
|
630
|
-
if 'type_spec' in d:
|
|
631
|
-
type_spec = {
|
|
632
|
-
field_name: cls.deserialize(field_type_dict) for field_name, field_type_dict in d['type_spec'].items()
|
|
633
|
-
}
|
|
634
|
-
return cls(type_spec, nullable=d['nullable'])
|
|
643
|
+
return cls(json_schema=d.get('json_schema'), nullable=d['nullable'])
|
|
635
644
|
|
|
636
645
|
def to_sa_type(self) -> sql.types.TypeEngine:
|
|
637
646
|
return sql.dialects.postgresql.JSONB()
|
|
638
647
|
|
|
648
|
+
def _to_json_schema(self) -> dict[str, Any]:
    """
    Return the JSON Schema attached to this type, ignoring nullability.

    If no schema was given (`json_schema is None`), the empty schema `{}` is returned.
    """
    return {} if self.json_schema is None else self.json_schema
|
|
652
|
+
|
|
639
653
|
def print_value(self, val: Any) -> str:
|
|
640
654
|
val_type = self.infer_literal_type(val)
|
|
641
655
|
if val_type is None:
|
|
@@ -645,27 +659,138 @@ class JsonType(ColumnType):
|
|
|
645
659
|
return val_type.print_value(val)
|
|
646
660
|
|
|
647
661
|
def _validate_literal(self, val: Any) -> None:
|
|
648
|
-
if not
|
|
649
|
-
# TODO In the future we should accept scalars too, which would enable us to remove this top-level check
|
|
650
|
-
raise TypeError(f'Expected dict or list, got {val.__class__.__name__}')
|
|
651
|
-
if not self.__is_valid_literal(val):
|
|
662
|
+
if not self.__is_valid_json(val):
|
|
652
663
|
raise TypeError(f'That literal is not a valid Pixeltable JSON object: {val}')
|
|
664
|
+
if self.__validator is not None:
|
|
665
|
+
self.__validator.validate(val)
|
|
653
666
|
|
|
654
667
|
@classmethod
|
|
655
|
-
def
|
|
668
|
+
def __is_valid_json(cls, val: Any) -> bool:
|
|
656
669
|
if val is None or isinstance(val, (str, int, float, bool)):
|
|
657
670
|
return True
|
|
658
671
|
if isinstance(val, (list, tuple)):
|
|
659
|
-
return all(cls.
|
|
672
|
+
return all(cls.__is_valid_json(v) for v in val)
|
|
660
673
|
if isinstance(val, dict):
|
|
661
|
-
return all(isinstance(k, str) and cls.
|
|
674
|
+
return all(isinstance(k, str) and cls.__is_valid_json(v) for k, v in val.items())
|
|
662
675
|
return False
|
|
663
676
|
|
|
664
677
|
def _create_literal(self, val: Any) -> Any:
    """
    Convert `val` into the form used for JSON literals.

    Tuples are converted to lists and pydantic models are converted via `model_dump()`; any other value
    is returned unchanged.
    """
    literal = list(val) if isinstance(val, tuple) else val
    if isinstance(literal, pydantic.BaseModel):
        return literal.model_dump()
    return literal
|
|
668
683
|
|
|
684
|
+
def supertype(self, other: ColumnType) -> Optional[JsonType]:
    """
    Compute a common supertype of `self` and `other`.

    Returns None if the generic ColumnType logic finds no supertype and `other` is not a JsonType.
    Otherwise the result is nullable if either input is nullable. If either input has no schema, or the
    merged schema is empty, the result is an unconstrained JsonType.
    """
    # Try the (much faster) supertype logic in ColumnType first. That will work if, for example, the types
    # are identical except for nullability. Only if that fails and both types are JsonType do we need to
    # merge their schemas.
    generic_supertype = super().supertype(other)
    if generic_supertype is not None:
        assert isinstance(generic_supertype, JsonType)
        return generic_supertype

    if not isinstance(other, JsonType):
        return None

    either_nullable = self.nullable or other.nullable
    if self.json_schema is None or other.json_schema is None:
        return JsonType(nullable=either_nullable)

    merged_schema = self.__superschema(self.json_schema, other.json_schema)
    # An empty merged schema places no constraints, so it is represented as "no schema".
    return JsonType(json_schema=(merged_schema or None), nullable=either_nullable)
|
|
705
|
+
|
|
706
|
+
@classmethod
def __superschema(cls, a: dict[str, Any], b: dict[str, Any]) -> dict[str, Any]:
    """
    Return a JSON Schema that is a common generalization of schemas `a` and `b`.

    The result is never None. An empty dict signals an unresolvable conflict, i.e., the supertype is an
    unrestricted JsonType.
    """
    # Defining a general type hierarchy over all JSON schemas would be a challenging problem. In order to keep
    # things manageable, we only define a hierarchy among "conforming" schemas, which provides enough generality
    # for the most important use cases (unions for type inference, validation of inline exprs). A schema is
    # considered to be conforming if either:
    # (i) it is a scalar (string, integer, number, boolean) or dictionary (object) type; or
    # (ii) it is an "anyOf" schema of one of the above types and the exact schema {'type': 'null'}.
    # Conforming schemas are organized into a type hierarchy in an internally consistent way. Nonconforming
    # schemas are allowed, but they are isolates in the type hierarchy: a nonconforming schema has no proper
    # subtypes, and its only proper supertype is an unconstrained JsonType().
    #
    # There is some subtlety in the handling of nullable fields. Nullable fields are represented in JSON
    # schemas as (for example) {'anyOf': [{'type': 'string'}, {'type': 'null'}]}. When finding the supertype
    # of schemas that might be nullable, we first unpack the 'anyOf's, find the supertype of the underlyings,
    # then reapply the 'anyOf' if appropriate. The top-level schema (i.e., JsonType.json_schema) is presumed
    # to NOT be in this form (since nullability is indicated by the `nullable` field of the JsonType object),
    # so this subtlety is applicable only to types that occur in subfields.
    #
    # There is currently no special handling of lists; distinct schemas with type 'array' will union to the
    # generic {'type': 'array'} schema. This could be a TODO item if there is a need for it in the future.

    if a == b:
        return a

    if 'properties' in a and 'properties' in b:
        a_props = a['properties']
        b_props = b['properties']
        # A property stays required only if both schemas require it. Computing the intersection once
        # avoids repeated list scans inside the loop below.
        required_in_both = set(a.get('required', [])) & set(b.get('required', []))
        super_props: dict[str, Any] = {}
        super_req: list[str] = []
        for key, a_prop_schema in a_props.items():
            if key in b_props:  # in both a and b
                super_props[key] = cls.__superschema_with_nulls(a_prop_schema, b_props[key])
                if key in required_in_both:
                    super_req.append(key)
            else:  # in a but not b
                # Add it to the supertype schema as optional (regardless of its status in a)
                super_props[key] = a_prop_schema
        for key, b_prop_schema in b_props.items():
            if key not in a_props:  # in b but not a
                super_props[key] = b_prop_schema
        schema: dict[str, Any] = {'type': 'object', 'properties': super_props}
        if super_req:
            schema['required'] = super_req
        return schema

    a_type = a.get('type')
    b_type = b.get('type')

    if a_type in ('string', 'integer', 'number', 'boolean', 'object', 'array') and a_type == b_type:
        # a and b both have the same type designation, but are not identical. This can happen if
        # (for example) they have validators or other attributes that differ. In this case, we
        # generalize to {'type': t}, where t is their shared type, with no other qualifications.
        return {'type': a_type}

    return {}  # Unresolvable type conflict; the supertype is an unrestricted JsonType.
|
|
765
|
+
|
|
766
|
+
@classmethod
def __superschema_with_nulls(cls, a: dict[str, Any], b: dict[str, Any]) -> Optional[dict[str, Any]]:
    """
    Like `__superschema()`, but for subfield schemas that may be nullable.

    Each input is first stripped of its `anyOf`-with-null wrapper (if it has one); the underlying schemas
    are then merged, and the wrapper is reapplied if either input was nullable.
    """
    a_core, a_nullable = cls.__unpack_null_from_schema(a)
    b_core, b_nullable = cls.__unpack_null_from_schema(b)

    merged = cls.__superschema(a_core, b_core)
    if not merged:
        # An empty schema implicitly accepts null, so no wrapper is needed.
        return merged
    if a_nullable or b_nullable:
        # A non-empty schema must allow null explicitly.
        return {'anyOf': [merged, {'type': 'null'}]}
    return merged
|
|
776
|
+
|
|
777
|
+
@classmethod
def __unpack_null_from_schema(cls, s: dict[str, Any]) -> tuple[dict[str, Any], bool]:
    """
    Strip the nullable wrapper from a schema.

    If `s` is a two-element `anyOf` containing `{'type': 'null'}`, return the first element that is not
    the null schema, together with True. In every other case (including an `anyOf` whose two elements are
    both the null schema), return `s` unchanged together with False.
    """
    null_schema = {'type': 'null'}
    if 'anyOf' in s:
        alternatives = s['anyOf']
        if len(alternatives) == 2 and null_schema in alternatives:
            for alternative in alternatives:
                if alternative != null_schema:
                    return alternative, True
    return s, False
|
|
785
|
+
|
|
786
|
+
def _to_base_str(self) -> str:
    """
    Return the display name of this type, without nullability.

    This is 'Json' if there is no schema; otherwise 'Json[...]' containing the schema's 'title' if it
    has one, or else the schema itself.
    """
    schema = self.json_schema
    if schema is None:
        return 'Json'
    label = schema['title'] if 'title' in schema else schema
    return f'Json[{label}]'
|
|
793
|
+
|
|
669
794
|
|
|
670
795
|
class ArrayType(ColumnType):
|
|
671
796
|
def __init__(self, shape: tuple[Union[int, None], ...], dtype: ColumnType, nullable: bool = False):
|
|
@@ -743,6 +868,12 @@ class ArrayType(ColumnType):
|
|
|
743
868
|
return False
|
|
744
869
|
return val.dtype == self.numpy_dtype()
|
|
745
870
|
|
|
871
|
+
def _to_json_schema(self) -> dict[str, Any]:
    """
    Return the (non-nullable) JSON Schema for this type: an array whose items follow the element type's
    schema.
    """
    # NOTE(review): only the element type is encoded here; the array's shape is not part of the schema.
    item_schema = self.pxt_dtype._to_json_schema()
    return {'type': 'array', 'items': item_schema}
|
|
876
|
+
|
|
746
877
|
def _validate_literal(self, val: Any) -> None:
|
|
747
878
|
if not isinstance(val, np.ndarray):
|
|
748
879
|
raise TypeError(f'Expected numpy.ndarray, got {val.__class__.__name__}')
|
|
@@ -752,7 +883,7 @@ class ArrayType(ColumnType):
|
|
|
752
883
|
f'got ndarray({val.shape}, dtype={val.dtype})'))
|
|
753
884
|
|
|
754
885
|
def _create_literal(self, val: Any) -> Any:
|
|
755
|
-
if isinstance(val, (list,tuple)):
|
|
886
|
+
if isinstance(val, (list, tuple)):
|
|
756
887
|
# map python float to whichever numpy float is
|
|
757
888
|
# declared for this type, rather than assume float64
|
|
758
889
|
return np.array(val, dtype=self.numpy_dtype())
|
|
@@ -902,7 +1033,7 @@ class VideoType(ColumnType):
|
|
|
902
1033
|
if num_decoded < 2:
|
|
903
1034
|
# this is most likely an image file
|
|
904
1035
|
raise excs.Error(f'Not a valid video: {val}')
|
|
905
|
-
except av.
|
|
1036
|
+
except av.FFmpegError:
|
|
906
1037
|
raise excs.Error(f'Not a valid video: {val}') from None
|
|
907
1038
|
|
|
908
1039
|
|
|
@@ -929,7 +1060,7 @@ class AudioType(ColumnType):
|
|
|
929
1060
|
for packet in container.demux(audio_stream):
|
|
930
1061
|
for _ in packet.decode():
|
|
931
1062
|
pass
|
|
932
|
-
except av.
|
|
1063
|
+
except av.FFmpegError as e:
|
|
933
1064
|
raise excs.Error(f'Not a valid audio file: {val}\n{e}') from None
|
|
934
1065
|
|
|
935
1066
|
|
|
@@ -940,6 +1071,7 @@ class DocumentType(ColumnType):
|
|
|
940
1071
|
MD = 1
|
|
941
1072
|
PDF = 2
|
|
942
1073
|
XML = 3
|
|
1074
|
+
TXT = 4
|
|
943
1075
|
|
|
944
1076
|
def __init__(self, nullable: bool = False, doc_formats: Optional[str] = None):
|
|
945
1077
|
super().__init__(self.Type.DOCUMENT, nullable=nullable)
|
|
@@ -1016,6 +1148,16 @@ class _PxtType:
|
|
|
1016
1148
|
|
|
1017
1149
|
|
|
1018
1150
|
class Json(_PxtType):
|
|
1151
|
+
def __class_getitem__(cls, item: Any) -> _AnnotatedAlias:
    """
    Support the `Json[<schema>]` subscript syntax.

    `item` (the type subscript) must be a `dict` representing a valid JSON Schema. The result is
    `Any`, annotated with a non-nullable `JsonType` that carries the schema.

    Raises:
        TypeError: if `item` is not a `dict`.
    """
    if isinstance(item, dict):
        # The JsonType initializer will validate the JSON Schema.
        return typing.Annotated[Any, JsonType(json_schema=item, nullable=False)]

    raise TypeError('Json type parameter must be a dict')
|
|
1160
|
+
|
|
1019
1161
|
@classmethod
|
|
1020
1162
|
def as_col_type(cls, nullable: bool) -> ColumnType:
|
|
1021
1163
|
return JsonType(nullable=nullable)
|
pixeltable/utils/arrow.py
CHANGED
|
@@ -3,14 +3,17 @@ from typing import Any, Iterator, Optional, Union
|
|
|
3
3
|
|
|
4
4
|
import numpy as np
|
|
5
5
|
import pyarrow as pa
|
|
6
|
+
import datetime
|
|
6
7
|
|
|
7
8
|
import pixeltable.type_system as ts
|
|
9
|
+
from pixeltable.env import Env
|
|
10
|
+
|
|
11
|
+
_tz_def = Env().get().default_time_zone
|
|
8
12
|
|
|
9
13
|
_logger = logging.getLogger(__name__)
|
|
10
14
|
|
|
11
15
|
_pa_to_pt: dict[pa.DataType, ts.ColumnType] = {
|
|
12
16
|
pa.string(): ts.StringType(nullable=True),
|
|
13
|
-
pa.timestamp('us'): ts.TimestampType(nullable=True),
|
|
14
17
|
pa.bool_(): ts.BoolType(nullable=True),
|
|
15
18
|
pa.uint8(): ts.IntType(nullable=True),
|
|
16
19
|
pa.int8(): ts.IntType(nullable=True),
|
|
@@ -23,7 +26,7 @@ _pa_to_pt: dict[pa.DataType, ts.ColumnType] = {
|
|
|
23
26
|
|
|
24
27
|
_pt_to_pa: dict[type[ts.ColumnType], pa.DataType] = {
|
|
25
28
|
ts.StringType: pa.string(),
|
|
26
|
-
ts.TimestampType: pa.timestamp('us'), # postgres timestamp is microseconds
|
|
29
|
+
ts.TimestampType: pa.timestamp('us', tz=datetime.timezone.utc), # postgres timestamp is microseconds
|
|
27
30
|
ts.BoolType: pa.bool_(),
|
|
28
31
|
ts.IntType: pa.int64(),
|
|
29
32
|
ts.FloatType: pa.float32(),
|
|
@@ -39,7 +42,9 @@ def to_pixeltable_type(arrow_type: pa.DataType) -> Optional[ts.ColumnType]:
|
|
|
39
42
|
"""Convert a pyarrow DataType to a pixeltable ColumnType if one is defined.
|
|
40
43
|
Returns None if no conversion is currently implemented.
|
|
41
44
|
"""
|
|
42
|
-
if arrow_type
|
|
45
|
+
if isinstance(arrow_type, pa.TimestampType):
|
|
46
|
+
return ts.TimestampType(nullable=True)
|
|
47
|
+
elif arrow_type in _pa_to_pt:
|
|
43
48
|
return _pa_to_pt[arrow_type]
|
|
44
49
|
elif isinstance(arrow_type, pa.FixedShapeTensorType):
|
|
45
50
|
dtype = to_pixeltable_type(arrow_type.value_type)
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
import dataclasses
|
|
2
|
+
from typing import Optional, Union
|
|
3
|
+
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from pandas.io.formats.style import Styler
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclasses.dataclass
|
|
9
|
+
class _Descriptor:
|
|
10
|
+
body: Union[str, pd.DataFrame]
|
|
11
|
+
# The remaining fields only affect the behavior if `body` is a pd.DataFrame.
|
|
12
|
+
show_index: bool
|
|
13
|
+
show_header: bool
|
|
14
|
+
styler: Optional[Styler] = None
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class DescriptionHelper:
|
|
18
|
+
"""
|
|
19
|
+
Helper class for rendering long-form descriptions of Pixeltable objects.
|
|
20
|
+
|
|
21
|
+
The output is specified as a list of "descriptors", each of which can be either a string or a Pandas DataFrame,
|
|
22
|
+
in any combination. The descriptors will be rendered in sequence. This is useful for long-form descriptions that
|
|
23
|
+
include tables with differing schemas or formatting, and/or a combination of tables and text.
|
|
24
|
+
|
|
25
|
+
DescriptionHelper can convert a list of descriptors into either HTML or plaintext and do something reasonable
|
|
26
|
+
in each case.
|
|
27
|
+
"""
|
|
28
|
+
__descriptors: list[_Descriptor]
|
|
29
|
+
|
|
30
|
+
def __init__(self) -> None:
|
|
31
|
+
self.__descriptors = []
|
|
32
|
+
|
|
33
|
+
def append(
|
|
34
|
+
self,
|
|
35
|
+
descriptor: Union[str, pd.DataFrame],
|
|
36
|
+
show_index: bool = False,
|
|
37
|
+
show_header: bool = True,
|
|
38
|
+
styler: Optional[Styler] = None,
|
|
39
|
+
) -> None:
|
|
40
|
+
self.__descriptors.append(_Descriptor(descriptor, show_index, show_header, styler))
|
|
41
|
+
|
|
42
|
+
def to_string(self) -> str:
|
|
43
|
+
blocks = [self.__render_text(descriptor) for descriptor in self.__descriptors]
|
|
44
|
+
return '\n\n'.join(blocks)
|
|
45
|
+
|
|
46
|
+
def to_html(self) -> str:
|
|
47
|
+
html_blocks = [self.__apply_styles(descriptor).to_html() for descriptor in self.__descriptors]
|
|
48
|
+
return '\n'.join(html_blocks)
|
|
49
|
+
|
|
50
|
+
@classmethod
|
|
51
|
+
def __render_text(cls, descriptor: _Descriptor) -> str:
|
|
52
|
+
if isinstance(descriptor.body, str):
|
|
53
|
+
return descriptor.body
|
|
54
|
+
else:
|
|
55
|
+
# If `show_index=False`, we get cleaner output (better intercolumn spacing) by setting the index to a
|
|
56
|
+
# list of empty strings than by setting `index=False` in the call to `df.to_string()`. It's pretty silly
|
|
57
|
+
# that `index=False` has side effects in Pandas that go beyond simply not displaying the index, but it
|
|
58
|
+
# is what it is.
|
|
59
|
+
df = descriptor.body
|
|
60
|
+
if not descriptor.show_index:
|
|
61
|
+
df = df.copy()
|
|
62
|
+
df.index = [''] * len(df) # type: ignore[assignment]
|
|
63
|
+
# max_colwidth=50 is the identical default that Pandas uses for a DataFrame's __repr__() output.
|
|
64
|
+
return df.to_string(header=descriptor.show_header, max_colwidth=50)
|
|
65
|
+
|
|
66
|
+
@classmethod
|
|
67
|
+
def __apply_styles(cls, descriptor: _Descriptor) -> Styler:
|
|
68
|
+
if isinstance(descriptor.body, str):
|
|
69
|
+
return (
|
|
70
|
+
# Render the string as a single-cell DataFrame. This will ensure a consistent style of output in
|
|
71
|
+
# cases where strings appear alongside DataFrames in the same DescriptionHelper.
|
|
72
|
+
pd.DataFrame([descriptor.body]).style
|
|
73
|
+
.set_properties(None, **{'white-space': 'pre-wrap', 'text-align': 'left', 'font-weight': 'bold'})
|
|
74
|
+
.hide(axis='index').hide(axis='columns')
|
|
75
|
+
)
|
|
76
|
+
else:
|
|
77
|
+
styler = descriptor.styler
|
|
78
|
+
if styler is None:
|
|
79
|
+
styler = descriptor.body.style
|
|
80
|
+
styler = (
|
|
81
|
+
styler
|
|
82
|
+
.set_properties(None, **{'white-space': 'pre-wrap', 'text-align': 'left'})
|
|
83
|
+
.set_table_styles([dict(selector='th', props=[('text-align', 'left')])])
|
|
84
|
+
)
|
|
85
|
+
if not descriptor.show_header:
|
|
86
|
+
styler = styler.hide(axis='columns')
|
|
87
|
+
if not descriptor.show_index:
|
|
88
|
+
styler = styler.hide(axis='index')
|
|
89
|
+
return styler
|
pixeltable/utils/documents.py
CHANGED
|
@@ -15,6 +15,7 @@ class DocumentHandle:
|
|
|
15
15
|
bs_doc: Optional[bs4.BeautifulSoup] = None
|
|
16
16
|
md_ast: Optional[dict] = None
|
|
17
17
|
pdf_doc: Optional[fitz.Document] = None
|
|
18
|
+
txt_doc: Optional[str] = None
|
|
18
19
|
|
|
19
20
|
|
|
20
21
|
def get_document_handle(path: str) -> Optional[DocumentHandle]:
|
|
@@ -40,6 +41,11 @@ def get_document_handle(path: str) -> Optional[DocumentHandle]:
|
|
|
40
41
|
if bs_doc is not None:
|
|
41
42
|
return DocumentHandle(format=ts.DocumentType.DocumentFormat.XML, bs_doc=bs_doc)
|
|
42
43
|
|
|
44
|
+
if doc_format == '.txt':
|
|
45
|
+
txt_doc = get_txt(path)
|
|
46
|
+
if txt_doc is not None:
|
|
47
|
+
return DocumentHandle(format=ts.DocumentType.DocumentFormat.TXT, txt_doc=txt_doc)
|
|
48
|
+
|
|
43
49
|
return None
|
|
44
50
|
|
|
45
51
|
|
|
@@ -84,3 +90,11 @@ def get_markdown_handle(path: str) -> Optional[dict]:
|
|
|
84
90
|
return md_ast(text)
|
|
85
91
|
except Exception:
|
|
86
92
|
return None
|
|
93
|
+
|
|
94
|
+
def get_txt(path: str) -> Optional[str]:
    """
    Read the plain-text document at `path`.

    Returns the file's contents, or None if the file is empty, cannot be read, or is not valid UTF-8.
    """
    try:
        # Decode explicitly as UTF-8 so the result does not depend on the platform's locale encoding.
        with open(path, 'r', encoding='utf-8') as f:
            doc = f.read()
    except (OSError, ValueError):
        # OSError: missing or unreadable file. ValueError: undecodable bytes (UnicodeDecodeError is a
        # subclass) or an invalid path such as one with an embedded null byte.
        return None
    # An empty file is not treated as a document.
    return doc if doc != '' else None
|