pixeltable 0.2.24__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in a supported public registry. It is provided for informational purposes only.

Potentially problematic release.



Files changed (101)
  1. pixeltable/__init__.py +2 -2
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +1 -1
  4. pixeltable/catalog/dir.py +6 -0
  5. pixeltable/catalog/globals.py +25 -0
  6. pixeltable/catalog/named_function.py +4 -0
  7. pixeltable/catalog/path_dict.py +37 -11
  8. pixeltable/catalog/schema_object.py +6 -0
  9. pixeltable/catalog/table.py +531 -251
  10. pixeltable/catalog/table_version.py +22 -8
  11. pixeltable/catalog/view.py +8 -7
  12. pixeltable/dataframe.py +439 -105
  13. pixeltable/env.py +19 -5
  14. pixeltable/exec/__init__.py +1 -1
  15. pixeltable/exec/exec_node.py +6 -7
  16. pixeltable/exec/expr_eval_node.py +1 -1
  17. pixeltable/exec/sql_node.py +92 -45
  18. pixeltable/exprs/__init__.py +1 -0
  19. pixeltable/exprs/arithmetic_expr.py +1 -1
  20. pixeltable/exprs/array_slice.py +1 -1
  21. pixeltable/exprs/column_property_ref.py +1 -1
  22. pixeltable/exprs/column_ref.py +29 -2
  23. pixeltable/exprs/comparison.py +1 -1
  24. pixeltable/exprs/compound_predicate.py +1 -1
  25. pixeltable/exprs/expr.py +12 -5
  26. pixeltable/exprs/expr_set.py +8 -0
  27. pixeltable/exprs/function_call.py +147 -39
  28. pixeltable/exprs/in_predicate.py +1 -1
  29. pixeltable/exprs/inline_expr.py +25 -5
  30. pixeltable/exprs/is_null.py +1 -1
  31. pixeltable/exprs/json_mapper.py +1 -1
  32. pixeltable/exprs/json_path.py +1 -1
  33. pixeltable/exprs/method_ref.py +1 -1
  34. pixeltable/exprs/row_builder.py +1 -1
  35. pixeltable/exprs/rowid_ref.py +1 -1
  36. pixeltable/exprs/similarity_expr.py +17 -7
  37. pixeltable/exprs/sql_element_cache.py +4 -0
  38. pixeltable/exprs/type_cast.py +2 -2
  39. pixeltable/exprs/variable.py +3 -0
  40. pixeltable/func/__init__.py +5 -4
  41. pixeltable/func/aggregate_function.py +151 -68
  42. pixeltable/func/callable_function.py +48 -16
  43. pixeltable/func/expr_template_function.py +64 -23
  44. pixeltable/func/function.py +227 -23
  45. pixeltable/func/function_registry.py +2 -1
  46. pixeltable/func/query_template_function.py +51 -9
  47. pixeltable/func/signature.py +65 -7
  48. pixeltable/func/tools.py +153 -0
  49. pixeltable/func/udf.py +57 -35
  50. pixeltable/functions/__init__.py +2 -2
  51. pixeltable/functions/anthropic.py +51 -4
  52. pixeltable/functions/gemini.py +85 -0
  53. pixeltable/functions/globals.py +54 -34
  54. pixeltable/functions/huggingface.py +10 -28
  55. pixeltable/functions/json.py +3 -8
  56. pixeltable/functions/math.py +67 -0
  57. pixeltable/functions/mistralai.py +0 -2
  58. pixeltable/functions/ollama.py +8 -8
  59. pixeltable/functions/openai.py +51 -4
  60. pixeltable/functions/timestamp.py +1 -1
  61. pixeltable/functions/video.py +3 -9
  62. pixeltable/functions/vision.py +1 -1
  63. pixeltable/globals.py +374 -89
  64. pixeltable/index/embedding_index.py +106 -29
  65. pixeltable/io/__init__.py +1 -1
  66. pixeltable/io/label_studio.py +1 -1
  67. pixeltable/io/parquet.py +39 -19
  68. pixeltable/iterators/__init__.py +1 -0
  69. pixeltable/iterators/document.py +12 -0
  70. pixeltable/iterators/image.py +100 -0
  71. pixeltable/iterators/video.py +7 -8
  72. pixeltable/metadata/__init__.py +1 -1
  73. pixeltable/metadata/converters/convert_16.py +2 -1
  74. pixeltable/metadata/converters/convert_17.py +2 -1
  75. pixeltable/metadata/converters/convert_22.py +17 -0
  76. pixeltable/metadata/converters/convert_23.py +35 -0
  77. pixeltable/metadata/converters/convert_24.py +56 -0
  78. pixeltable/metadata/converters/convert_25.py +19 -0
  79. pixeltable/metadata/converters/util.py +4 -2
  80. pixeltable/metadata/notes.py +4 -0
  81. pixeltable/metadata/schema.py +1 -0
  82. pixeltable/plan.py +129 -51
  83. pixeltable/store.py +1 -1
  84. pixeltable/type_system.py +196 -54
  85. pixeltable/utils/arrow.py +8 -3
  86. pixeltable/utils/description_helper.py +89 -0
  87. pixeltable/utils/documents.py +14 -0
  88. {pixeltable-0.2.24.dist-info → pixeltable-0.3.0.dist-info}/METADATA +32 -22
  89. pixeltable-0.3.0.dist-info/RECORD +155 -0
  90. {pixeltable-0.2.24.dist-info → pixeltable-0.3.0.dist-info}/WHEEL +1 -1
  91. pixeltable-0.3.0.dist-info/entry_points.txt +3 -0
  92. pixeltable/tool/create_test_db_dump.py +0 -308
  93. pixeltable/tool/create_test_video.py +0 -81
  94. pixeltable/tool/doc_plugins/griffe.py +0 -50
  95. pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
  96. pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
  97. pixeltable/tool/embed_udf.py +0 -9
  98. pixeltable/tool/mypy_plugin.py +0 -55
  99. pixeltable-0.2.24.dist-info/RECORD +0 -153
  100. pixeltable-0.2.24.dist-info/entry_points.txt +0 -3
  101. {pixeltable-0.2.24.dist-info → pixeltable-0.3.0.dist-info}/LICENSE +0 -0
pixeltable/type_system.py CHANGED
@@ -5,7 +5,6 @@ import datetime
 import enum
 import io
 import json
-import types
 import typing
 import urllib.parse
 import urllib.request
@@ -14,7 +13,11 @@ from typing import Any, Iterable, Mapping, Optional, Sequence, Union
 
 import PIL.Image
 import av # type: ignore
+import jsonschema
+import jsonschema.protocols
+import jsonschema.validators
 import numpy as np
+import pydantic
 import sqlalchemy as sql
 from typing import _GenericAlias # type: ignore[attr-defined]
 from typing_extensions import _AnnotatedAlias
@@ -166,7 +169,7 @@ class ColumnType:
         if t == cls.Type.DOCUMENT:
             return DocumentType()
 
-    def __str__(self) -> str:
+    def __repr__(self) -> str:
         return self._to_str(as_schema=False)
 
     def _to_str(self, as_schema: bool) -> str:
@@ -244,7 +247,7 @@ class ColumnType:
         if col_type is not None:
             return col_type
         # this could still be json-serializable
-        if isinstance(val, dict) or isinstance(val, list) or isinstance(val, np.ndarray):
+        if isinstance(val, dict) or isinstance(val, list) or isinstance(val, np.ndarray) or isinstance(val, pydantic.BaseModel):
             try:
                 JsonType().validate_literal(val)
                 return JsonType(nullable=nullable)
@@ -337,7 +340,7 @@ class ColumnType:
             return TimestampType(nullable=nullable_default)
         if t is PIL.Image.Image:
             return ImageType(nullable=nullable_default)
-        if issubclass(t, Sequence) or issubclass(t, Mapping):
+        if issubclass(t, Sequence) or issubclass(t, Mapping) or issubclass(t, pydantic.BaseModel):
            return JsonType(nullable=nullable_default)
        return None
 
@@ -479,6 +482,20 @@ class ColumnType:
         """
         pass
 
+    def to_json_schema(self) -> dict[str, Any]:
+        if self.nullable:
+            return {
+                'anyOf': [
+                    self._to_json_schema(),
+                    {'type': 'null'},
+                ]
+            }
+        else:
+            return self._to_json_schema()
+
+    def _to_json_schema(self) -> dict[str, Any]:
+        raise excs.Error(f'Pixeltable type {self} is not a valid JSON type')
+
 
 
 class InvalidType(ColumnType):
@@ -501,6 +518,9 @@ class StringType(ColumnType):
     def to_sa_type(self) -> sql.types.TypeEngine:
         return sql.String()
 
+    def _to_json_schema(self) -> dict[str, Any]:
+        return {'type': 'string'}
+
     def print_value(self, val: Any) -> str:
         return f"'{val}'"
 
@@ -524,8 +544,13 @@ class IntType(ColumnType):
     def to_sa_type(self) -> sql.types.TypeEngine:
         return sql.BigInteger()
 
+    def _to_json_schema(self) -> dict[str, Any]:
+        return {'type': 'integer'}
+
     def _validate_literal(self, val: Any) -> None:
-        if not isinstance(val, int):
+        # bool is a subclass of int, so we need to check for it
+        # explicitly first
+        if isinstance(val, bool) or not isinstance(val, int):
             raise TypeError(f'Expected int, got {val.__class__.__name__}')
 
 
@@ -536,6 +561,9 @@ class FloatType(ColumnType):
     def to_sa_type(self) -> sql.types.TypeEngine:
         return sql.Float()
 
+    def _to_json_schema(self) -> dict[str, Any]:
+        return {'type': 'number'}
+
     def _validate_literal(self, val: Any) -> None:
         if not isinstance(val, float):
             raise TypeError(f'Expected float, got {val.__class__.__name__}')
@@ -553,6 +581,9 @@ class BoolType(ColumnType):
     def to_sa_type(self) -> sql.types.TypeEngine:
         return sql.Boolean()
 
+    def _to_json_schema(self) -> dict[str, Any]:
+        return {'type': 'boolean'}
+
     def _validate_literal(self, val: Any) -> None:
         if not isinstance(val, bool):
             raise TypeError(f'Expected bool, got {val.__class__.__name__}')
@@ -581,61 +612,44 @@ class TimestampType(ColumnType):
 
 
 class JsonType(ColumnType):
-    # TODO: type_spec also needs to be able to express lists
-    def __init__(self, type_spec: Optional[dict[str, ColumnType]] = None, nullable: bool = False):
+
+    json_schema: Optional[dict[str, Any]]
+    __validator: Optional[jsonschema.protocols.Validator]
+
+    def __init__(self, json_schema: Optional[dict[str, Any]] = None, nullable: bool = False):
         super().__init__(self.Type.JSON, nullable=nullable)
-        self.type_spec = type_spec
+        self.json_schema = json_schema
+        if json_schema is None:
+            self.__validator = None
+        else:
+            validator_cls = jsonschema.validators.validator_for(json_schema)
+            validator_cls.check_schema(json_schema)
+            self.__validator = validator_cls(json_schema)
 
     def copy(self, nullable: bool) -> ColumnType:
-        return JsonType(self.type_spec, nullable=nullable)
+        return JsonType(json_schema=self.json_schema, nullable=nullable)
 
     def matches(self, other: ColumnType) -> bool:
-        return isinstance(other, JsonType) and self.type_spec == other.type_spec
-
-    def supertype(self, other: ColumnType) -> Optional[JsonType]:
-        if not isinstance(other, JsonType):
-            return None
-        if self.type_spec is None:
-            # we don't have a type spec and can accept anything accepted by other
-            return JsonType(nullable=(self.nullable or other.nullable))
-        if other.type_spec is None:
-            # we have a type spec but other doesn't
-            return JsonType(nullable=(self.nullable or other.nullable))
-
-        # we both have type specs; the supertype's type spec is the union of the two
-        type_spec: dict[str, ColumnType] = {}
-        type_spec.update(self.type_spec)
-        for other_field_name, other_field_type in other.type_spec.items():
-            if other_field_name not in type_spec:
-                type_spec[other_field_name] = other_field_type
-            else:
-                # both type specs have this field
-                field_type = type_spec[other_field_name].supertype(other_field_type)
-                if field_type is None:
-                    # conflicting types
-                    return JsonType(nullable=(self.nullable or other.nullable))
-                type_spec[other_field_name] = field_type
-        return JsonType(type_spec, nullable=(self.nullable or other.nullable))
+        return isinstance(other, JsonType) and self.json_schema == other.json_schema
 
     def _as_dict(self) -> dict:
         result = super()._as_dict()
-        if self.type_spec is not None:
-            type_spec_dict = {field_name: field_type.serialize() for field_name, field_type in self.type_spec.items()}
-            result.update({'type_spec': type_spec_dict})
+        if self.json_schema is not None:
+            result.update({'json_schema': self.json_schema})
         return result
 
     @classmethod
     def _from_dict(cls, d: dict) -> ColumnType:
-        type_spec = None
-        if 'type_spec' in d:
-            type_spec = {
-                field_name: cls.deserialize(field_type_dict) for field_name, field_type_dict in d['type_spec'].items()
-            }
-        return cls(type_spec, nullable=d['nullable'])
+        return cls(json_schema=d.get('json_schema'), nullable=d['nullable'])
 
     def to_sa_type(self) -> sql.types.TypeEngine:
         return sql.dialects.postgresql.JSONB()
 
+    def _to_json_schema(self) -> dict[str, Any]:
+        if self.json_schema is None:
+            return {}
+        return self.json_schema
+
     def print_value(self, val: Any) -> str:
         val_type = self.infer_literal_type(val)
         if val_type is None:
@@ -645,27 +659,138 @@ class JsonType(ColumnType):
         return val_type.print_value(val)
 
     def _validate_literal(self, val: Any) -> None:
-        if not isinstance(val, dict) and not isinstance(val, list):
-            # TODO In the future we should accept scalars too, which would enable us to remove this top-level check
-            raise TypeError(f'Expected dict or list, got {val.__class__.__name__}')
-        if not self.__is_valid_literal(val):
+        if not self.__is_valid_json(val):
             raise TypeError(f'That literal is not a valid Pixeltable JSON object: {val}')
+        if self.__validator is not None:
+            self.__validator.validate(val)
 
     @classmethod
-    def __is_valid_literal(cls, val: Any) -> bool:
+    def __is_valid_json(cls, val: Any) -> bool:
         if val is None or isinstance(val, (str, int, float, bool)):
             return True
         if isinstance(val, (list, tuple)):
-            return all(cls.__is_valid_literal(v) for v in val)
+            return all(cls.__is_valid_json(v) for v in val)
         if isinstance(val, dict):
-            return all(isinstance(k, str) and cls.__is_valid_literal(v) for k, v in val.items())
+            return all(isinstance(k, str) and cls.__is_valid_json(v) for k, v in val.items())
         return False
 
     def _create_literal(self, val: Any) -> Any:
         if isinstance(val, tuple):
             val = list(val)
+        if isinstance(val, pydantic.BaseModel):
+            return val.model_dump()
         return val
 
+    def supertype(self, other: ColumnType) -> Optional[JsonType]:
+        # Try using the (much faster) supertype logic in ColumnType first. That will work if, for example, the types
+        # are identical except for nullability. If that doesn't work and both types are JsonType, then we will need to
+        # merge their schemas.
+        basic_supertype = super().supertype(other)
+        if basic_supertype is not None:
+            assert isinstance(basic_supertype, JsonType)
+            return basic_supertype
+
+        if not isinstance(other, JsonType):
+            return None
+
+        if self.json_schema is None or other.json_schema is None:
+            return JsonType(nullable=(self.nullable or other.nullable))
+
+        superschema = self.__superschema(self.json_schema, other.json_schema)
+
+        return JsonType(
+            json_schema=(None if len(superschema) == 0 else superschema),
+            nullable=(self.nullable or other.nullable)
+        )
+
+    @classmethod
+    def __superschema(cls, a: dict[str, Any], b: dict[str, Any]) -> Optional[dict[str, Any]]:
+        # Defining a general type hierarchy over all JSON schemas would be a challenging problem. In order to keep
+        # things manageable, we only define a hierarchy among "conforming" schemas, which provides enough generality
+        # for the most important use cases (unions for type inference, validation of inline exprs). A schema is
+        # considered to be conforming if either:
+        # (i) it is a scalar (string, integer, number, boolean) or dictionary (object) type; or
+        # (ii) it is an "anyOf" schema of one of the above types and the exact schema {'type': 'null'}.
+        # Conforming schemas are organized into a type hierarchy in an internally consistent way. Nonconforming
+        # schemas are allowed, but they are isolates in the type hierarchy: a nonconforming schema has no proper
+        # subtypes, and its only proper supertype is an unconstrained JsonType().
+        #
+        # There is some subtlety in the handling of nullable fields. Nullable fields are represented in JSON
+        # schemas as (for example) {'anyOf': [{'type': 'string'}, {'type': 'null'}]}. When finding the supertype
+        # of schemas that might be nullable, we first unpack the 'anyOf's, find the supertype of the underlyings,
+        # then reapply the 'anyOf' if appropriate. The top-level schema (i.e., JsonType.json_schema) is presumed
+        # to NOT be in this form (since nullability is indicated by the `nullable` field of the JsonType object),
+        # so this subtlety is applicable only to types that occur in subfields.
+        #
+        # There is currently no special handling of lists; distinct schemas with type 'array' will union to the
+        # generic {'type': 'array'} schema. This could be a TODO item if there is a need for it in the future.
+
+        if a == b:
+            return a
+
+        if 'properties' in a and 'properties' in b:
+            a_props = a['properties']
+            b_props = b['properties']
+            a_req = a.get('required', [])
+            b_req = b.get('required', [])
+            super_props = {}
+            super_req = []
+            for key, a_prop_schema in a_props.items():
+                if key in b_props: # in both a and b
+                    prop_schema = cls.__superschema_with_nulls(a_prop_schema, b_props[key])
+                    super_props[key] = prop_schema
+                    if key in a_req and key in b_req:
+                        super_req.append(key)
+                else: # in a but not b
+                    # Add it to the supertype schema as optional (regardless of its status in a)
+                    super_props[key] = a_prop_schema
+            for key, b_prop_schema in b_props.items():
+                if key not in a_props: # in b but not a
+                    super_props[key] = b_prop_schema
+            schema = {'type': 'object', 'properties': super_props}
+            if len(super_req) > 0:
+                schema['required'] = super_req
+            return schema
+
+        a_type = a.get('type')
+        b_type = b.get('type')
+
+        if (a_type in ('string', 'integer', 'number', 'boolean', 'object', 'array') and a_type == b_type):
+            # a and b both have the same type designation, but are not identical. This can happen if
+            # (for example) they have validators or other attributes that differ. In this case, we
+            # generalize to {'type': t}, where t is their shared type, with no other qualifications.
+            return {'type': a_type}
+
+        return {} # Unresolvable type conflict; the supertype is an unrestricted JsonType.
+
+    @classmethod
+    def __superschema_with_nulls(cls, a: dict[str, Any], b: dict[str, Any]) -> Optional[dict[str, Any]]:
+        a, a_nullable = cls.__unpack_null_from_schema(a)
+        b, b_nullable = cls.__unpack_null_from_schema(b)
+
+        result = cls.__superschema(a, b)
+        if len(result) > 0 and (a_nullable or b_nullable):
+            # if len(result) == 0, then null is implicitly accepted; otherwise, we need to explicitly allow it
+            return {'anyOf': [result, {'type': 'null'}]}
+        return result
+
+    @classmethod
+    def __unpack_null_from_schema(cls, s: dict[str, Any]) -> tuple[dict[str, Any], bool]:
+        if 'anyOf' in s and len(s['anyOf']) == 2 and {'type': 'null'} in s['anyOf']:
+            try:
+                return next(s for s in s['anyOf'] if s != {'type': 'null'}), True
+            except StopIteration:
+                pass
        return s, False
+
+    def _to_base_str(self) -> str:
+        if self.json_schema is None:
+            return 'Json'
+        elif 'title' in self.json_schema:
+            return f'Json[{self.json_schema["title"]}]'
+        else:
+            return f'Json[{self.json_schema}]'
+
 
 
 class ArrayType(ColumnType):
     def __init__(self, shape: tuple[Union[int, None], ...], dtype: ColumnType, nullable: bool = False):
@@ -743,6 +868,12 @@ class ArrayType(ColumnType):
             return False
         return val.dtype == self.numpy_dtype()
 
+    def _to_json_schema(self) -> dict[str, Any]:
+        return {
+            'type': 'array',
+            'items': self.pxt_dtype._to_json_schema(),
+        }
+
     def _validate_literal(self, val: Any) -> None:
         if not isinstance(val, np.ndarray):
             raise TypeError(f'Expected numpy.ndarray, got {val.__class__.__name__}')
@@ -752,7 +883,7 @@ class ArrayType(ColumnType):
                 f'got ndarray({val.shape}, dtype={val.dtype})'))
 
     def _create_literal(self, val: Any) -> Any:
-        if isinstance(val, (list,tuple)):
+        if isinstance(val, (list, tuple)):
             # map python float to whichever numpy float is
             # declared for this type, rather than assume float64
             return np.array(val, dtype=self.numpy_dtype())
@@ -902,7 +1033,7 @@ class VideoType(ColumnType):
                 if num_decoded < 2:
                     # this is most likely an image file
                     raise excs.Error(f'Not a valid video: {val}')
-        except av.AVError:
+        except av.FFmpegError:
            raise excs.Error(f'Not a valid video: {val}') from None
 
 
@@ -929,7 +1060,7 @@ class AudioType(ColumnType):
                 for packet in container.demux(audio_stream):
                     for _ in packet.decode():
                         pass
-        except av.AVError as e:
+        except av.FFmpegError as e:
            raise excs.Error(f'Not a valid audio file: {val}\n{e}') from None
 
 
@@ -940,6 +1071,7 @@ class DocumentType(ColumnType):
         MD = 1
         PDF = 2
         XML = 3
+        TXT = 4
 
     def __init__(self, nullable: bool = False, doc_formats: Optional[str] = None):
         super().__init__(self.Type.DOCUMENT, nullable=nullable)
@@ -1016,6 +1148,16 @@ class _PxtType:
 
 
 class Json(_PxtType):
+    def __class_getitem__(cls, item: Any) -> _AnnotatedAlias:
+        """
+        `item` (the type subscript) must be a `dict` representing a valid JSON Schema.
+        """
+        if not isinstance(item, dict):
+            raise TypeError('Json type parameter must be a dict')
+
+        # The JsonType initializer will validate the JSON Schema.
+        return typing.Annotated[Any, JsonType(json_schema=item, nullable=False)]
+
     @classmethod
     def as_col_type(cls, nullable: bool) -> ColumnType:
         return JsonType(nullable=nullable)
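
The changes above replace JsonType's old type_spec with a full JSON Schema: Json[...] now takes a schema dict, literals are validated with jsonschema, scalar and array types gain _to_json_schema(), and supertype() merges two object schemas field by field. A minimal sketch of how this could be exercised, based only on the definitions shown in the diff (the exact public import surface may differ):

    from pixeltable.type_system import IntType, Json, JsonType

    address_schema = {
        'type': 'object',
        'properties': {'city': {'type': 'string'}, 'zip': {'type': 'string'}},
        'required': ['city'],
    }

    # Json[...] takes a JSON Schema dict and resolves to Annotated[Any, JsonType(json_schema=...)]
    AddressJson = Json[address_schema]

    # Literals are checked against the stored schema via jsonschema
    t = JsonType(json_schema=address_schema)
    t.validate_literal({'city': 'Berlin', 'zip': '10115'})  # passes
    # t.validate_literal({'zip': '10115'})                  # would raise: 'city' is required

    # Nullable types wrap their schema in an 'anyOf' with {'type': 'null'}
    assert IntType(nullable=True).to_json_schema() == {'anyOf': [{'type': 'integer'}, {'type': 'null'}]}

    # supertype() merges two object schemas; a field stays required only if both schemas require it
    other = JsonType(json_schema={
        'type': 'object',
        'properties': {'city': {'type': 'string'}, 'country': {'type': 'string'}},
        'required': ['city', 'country'],
    })
    merged = t.supertype(other)
    # Per the merge logic above, merged.json_schema should come out as:
    # {'type': 'object',
    #  'properties': {'city': {'type': 'string'}, 'zip': {'type': 'string'}, 'country': {'type': 'string'}},
    #  'required': ['city']}
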
pixeltable/utils/arrow.py CHANGED
@@ -3,14 +3,17 @@ from typing import Any, Iterator, Optional, Union
 
 import numpy as np
 import pyarrow as pa
+import datetime
 
 import pixeltable.type_system as ts
+from pixeltable.env import Env
+
+_tz_def = Env().get().default_time_zone
 
 _logger = logging.getLogger(__name__)
 
 _pa_to_pt: dict[pa.DataType, ts.ColumnType] = {
     pa.string(): ts.StringType(nullable=True),
-    pa.timestamp('us'): ts.TimestampType(nullable=True),
     pa.bool_(): ts.BoolType(nullable=True),
     pa.uint8(): ts.IntType(nullable=True),
     pa.int8(): ts.IntType(nullable=True),
@@ -23,7 +26,7 @@ _pa_to_pt: dict[pa.DataType, ts.ColumnType] = {
 
 _pt_to_pa: dict[type[ts.ColumnType], pa.DataType] = {
     ts.StringType: pa.string(),
-    ts.TimestampType: pa.timestamp('us'), # postgres timestamp is microseconds
+    ts.TimestampType: pa.timestamp('us', tz=datetime.timezone.utc), # postgres timestamp is microseconds
     ts.BoolType: pa.bool_(),
     ts.IntType: pa.int64(),
     ts.FloatType: pa.float32(),
@@ -39,7 +42,9 @@ def to_pixeltable_type(arrow_type: pa.DataType) -> Optional[ts.ColumnType]:
     """Convert a pyarrow DataType to a pixeltable ColumnType if one is defined.
     Returns None if no conversion is currently implemented.
     """
-    if arrow_type in _pa_to_pt:
+    if isinstance(arrow_type, pa.TimestampType):
+        return ts.TimestampType(nullable=True)
+    elif arrow_type in _pa_to_pt:
         return _pa_to_pt[arrow_type]
     elif isinstance(arrow_type, pa.FixedShapeTensorType):
         dtype = to_pixeltable_type(arrow_type.value_type)
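
With these hunks, any Arrow timestamp type (regardless of unit or time zone) now converts to a Pixeltable TimestampType, and Pixeltable timestamps are exported to Arrow as timezone-aware UTC microsecond timestamps. A small illustrative sketch, assuming the function shown in the diff is importable as written (note that importing the module now reads the default time zone via Env, so this presumes a working Pixeltable environment):

    import pyarrow as pa
    from pixeltable.utils.arrow import to_pixeltable_type

    to_pixeltable_type(pa.timestamp('ms', tz='UTC'))  # TimestampType (previously only pa.timestamp('us') mapped)
    to_pixeltable_type(pa.timestamp('us'))            # TimestampType
    to_pixeltable_type(pa.int64())                    # IntType
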
pixeltable/utils/description_helper.py ADDED
@@ -0,0 +1,89 @@
+import dataclasses
+from typing import Optional, Union
+
+import pandas as pd
+from pandas.io.formats.style import Styler
+
+
+@dataclasses.dataclass
+class _Descriptor:
+    body: Union[str, pd.DataFrame]
+    # The remaining fields only affect the behavior if `body` is a pd.DataFrame.
+    show_index: bool
+    show_header: bool
+    styler: Optional[Styler] = None
+
+
+class DescriptionHelper:
+    """
+    Helper class for rendering long-form descriptions of Pixeltable objects.
+
+    The output is specified as a list of "descriptors", each of which can be either a string or a Pandas DataFrame,
+    in any combination. The descriptors will be rendered in sequence. This is useful for long-form descriptions that
+    include tables with differing schemas or formatting, and/or a combination of tables and text.
+
+    DescriptionHelper can convert a list of descriptors into either HTML or plaintext and do something reasonable
+    in each case.
+    """
+    __descriptors: list[_Descriptor]
+
+    def __init__(self) -> None:
+        self.__descriptors = []
+
+    def append(
+        self,
+        descriptor: Union[str, pd.DataFrame],
+        show_index: bool = False,
+        show_header: bool = True,
+        styler: Optional[Styler] = None,
+    ) -> None:
+        self.__descriptors.append(_Descriptor(descriptor, show_index, show_header, styler))
+
+    def to_string(self) -> str:
+        blocks = [self.__render_text(descriptor) for descriptor in self.__descriptors]
+        return '\n\n'.join(blocks)
+
+    def to_html(self) -> str:
+        html_blocks = [self.__apply_styles(descriptor).to_html() for descriptor in self.__descriptors]
+        return '\n'.join(html_blocks)
+
+    @classmethod
+    def __render_text(cls, descriptor: _Descriptor) -> str:
+        if isinstance(descriptor.body, str):
+            return descriptor.body
+        else:
+            # If `show_index=False`, we get cleaner output (better intercolumn spacing) by setting the index to a
+            # list of empty strings than by setting `index=False` in the call to `df.to_string()`. It's pretty silly
+            # that `index=False` has side effects in Pandas that go beyond simply not displaying the index, but it
+            # is what it is.
+            df = descriptor.body
+            if not descriptor.show_index:
+                df = df.copy()
+                df.index = [''] * len(df) # type: ignore[assignment]
+            # max_colwidth=50 is the identical default that Pandas uses for a DataFrame's __repr__() output.
+            return df.to_string(header=descriptor.show_header, max_colwidth=50)
+
+    @classmethod
+    def __apply_styles(cls, descriptor: _Descriptor) -> Styler:
+        if isinstance(descriptor.body, str):
+            return (
+                # Render the string as a single-cell DataFrame. This will ensure a consistent style of output in
+                # cases where strings appear alongside DataFrames in the same DescriptionHelper.
+                pd.DataFrame([descriptor.body]).style
+                .set_properties(None, **{'white-space': 'pre-wrap', 'text-align': 'left', 'font-weight': 'bold'})
+                .hide(axis='index').hide(axis='columns')
+            )
+        else:
+            styler = descriptor.styler
+            if styler is None:
+                styler = descriptor.body.style
+            styler = (
+                styler
+                .set_properties(None, **{'white-space': 'pre-wrap', 'text-align': 'left'})
+                .set_table_styles([dict(selector='th', props=[('text-align', 'left')])])
+            )
+            if not descriptor.show_header:
+                styler = styler.hide(axis='columns')
+            if not descriptor.show_index:
+                styler = styler.hide(axis='index')
+            return styler
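
DescriptionHelper collects a sequence of strings and DataFrames and renders them as either plaintext or HTML. A brief usage sketch based only on the class as shown above (the module path is taken from the file list; the example data is illustrative):

    import pandas as pd
    from pixeltable.utils.description_helper import DescriptionHelper

    helper = DescriptionHelper()
    helper.append('my_table (snapshot)')  # rendered as a bold, left-aligned text block in HTML output
    helper.append(pd.DataFrame({'Column': ['id', 'img'], 'Type': ['Int', 'Image']}))
    print(helper.to_string())  # plaintext: blocks separated by blank lines
    html = helper.to_html()    # HTML: one styled table per block
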
pixeltable/utils/documents.py CHANGED
@@ -15,6 +15,7 @@ class DocumentHandle:
     bs_doc: Optional[bs4.BeautifulSoup] = None
     md_ast: Optional[dict] = None
     pdf_doc: Optional[fitz.Document] = None
+    txt_doc: Optional[str] = None
 
 
 def get_document_handle(path: str) -> Optional[DocumentHandle]:
@@ -40,6 +41,11 @@ def get_document_handle(path: str) -> Optional[DocumentHandle]:
         if bs_doc is not None:
             return DocumentHandle(format=ts.DocumentType.DocumentFormat.XML, bs_doc=bs_doc)
 
+    if doc_format == '.txt':
+        txt_doc = get_txt(path)
+        if txt_doc is not None:
+            return DocumentHandle(format=ts.DocumentType.DocumentFormat.TXT, txt_doc=txt_doc)
+
     return None
 
 
@@ -84,3 +90,11 @@ def get_markdown_handle(path: str) -> Optional[dict]:
         return md_ast(text)
     except Exception:
         return None
+
+def get_txt(path: str) -> Optional[str]:
+    try:
+        with open(path, "r") as f:
+            doc = f.read()
+            return doc if doc != '' else None
+    except Exception:
+        return None
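
Together with the new DocumentFormat.TXT member in type_system.py, these hunks let plain-text files be treated as documents. A hedged sketch of the resulting behavior, using only the helpers shown above (the file path is illustrative):

    import pixeltable.type_system as ts
    from pixeltable.utils.documents import get_document_handle

    handle = get_document_handle('/tmp/notes.txt')  # hypothetical .txt file
    if handle is not None:
        assert handle.format == ts.DocumentType.DocumentFormat.TXT
        print(handle.txt_doc[:80])  # raw file contents; empty or unreadable files yield no handle
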