pixeltable 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic.
- pixeltable/__init__.py +64 -11
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +1 -1
- pixeltable/catalog/catalog.py +50 -27
- pixeltable/catalog/column.py +27 -11
- pixeltable/catalog/dir.py +6 -4
- pixeltable/catalog/globals.py +8 -1
- pixeltable/catalog/insertable_table.py +22 -12
- pixeltable/catalog/named_function.py +10 -6
- pixeltable/catalog/path.py +3 -2
- pixeltable/catalog/path_dict.py +8 -6
- pixeltable/catalog/schema_object.py +2 -1
- pixeltable/catalog/table.py +121 -101
- pixeltable/catalog/table_version.py +291 -142
- pixeltable/catalog/table_version_path.py +8 -5
- pixeltable/catalog/view.py +67 -26
- pixeltable/dataframe.py +106 -81
- pixeltable/env.py +28 -24
- pixeltable/exec/__init__.py +2 -2
- pixeltable/exec/aggregation_node.py +10 -4
- pixeltable/exec/cache_prefetch_node.py +5 -3
- pixeltable/exec/component_iteration_node.py +9 -9
- pixeltable/exec/data_row_batch.py +21 -10
- pixeltable/exec/exec_context.py +10 -3
- pixeltable/exec/exec_node.py +23 -12
- pixeltable/exec/expr_eval/evaluators.py +13 -7
- pixeltable/exec/expr_eval/expr_eval_node.py +24 -15
- pixeltable/exec/expr_eval/globals.py +30 -7
- pixeltable/exec/expr_eval/row_buffer.py +5 -6
- pixeltable/exec/expr_eval/schedulers.py +151 -31
- pixeltable/exec/in_memory_data_node.py +8 -7
- pixeltable/exec/row_update_node.py +15 -5
- pixeltable/exec/sql_node.py +56 -27
- pixeltable/exprs/__init__.py +2 -2
- pixeltable/exprs/arithmetic_expr.py +57 -26
- pixeltable/exprs/array_slice.py +1 -1
- pixeltable/exprs/column_property_ref.py +2 -1
- pixeltable/exprs/column_ref.py +20 -15
- pixeltable/exprs/comparison.py +6 -2
- pixeltable/exprs/compound_predicate.py +1 -3
- pixeltable/exprs/data_row.py +2 -2
- pixeltable/exprs/expr.py +108 -72
- pixeltable/exprs/expr_dict.py +2 -1
- pixeltable/exprs/expr_set.py +3 -1
- pixeltable/exprs/function_call.py +39 -41
- pixeltable/exprs/globals.py +1 -0
- pixeltable/exprs/in_predicate.py +2 -2
- pixeltable/exprs/inline_expr.py +20 -17
- pixeltable/exprs/json_mapper.py +4 -2
- pixeltable/exprs/json_path.py +12 -18
- pixeltable/exprs/literal.py +5 -9
- pixeltable/exprs/method_ref.py +1 -0
- pixeltable/exprs/object_ref.py +1 -1
- pixeltable/exprs/row_builder.py +32 -17
- pixeltable/exprs/rowid_ref.py +14 -5
- pixeltable/exprs/similarity_expr.py +11 -6
- pixeltable/exprs/sql_element_cache.py +1 -1
- pixeltable/exprs/type_cast.py +24 -9
- pixeltable/ext/__init__.py +1 -0
- pixeltable/ext/functions/__init__.py +1 -0
- pixeltable/ext/functions/whisperx.py +2 -2
- pixeltable/ext/functions/yolox.py +11 -11
- pixeltable/func/aggregate_function.py +17 -13
- pixeltable/func/callable_function.py +6 -6
- pixeltable/func/expr_template_function.py +15 -14
- pixeltable/func/function.py +16 -16
- pixeltable/func/function_registry.py +11 -8
- pixeltable/func/globals.py +4 -2
- pixeltable/func/query_template_function.py +12 -13
- pixeltable/func/signature.py +18 -9
- pixeltable/func/tools.py +10 -17
- pixeltable/func/udf.py +106 -11
- pixeltable/functions/__init__.py +21 -2
- pixeltable/functions/anthropic.py +16 -12
- pixeltable/functions/fireworks.py +63 -5
- pixeltable/functions/gemini.py +13 -3
- pixeltable/functions/globals.py +18 -6
- pixeltable/functions/huggingface.py +20 -38
- pixeltable/functions/image.py +7 -3
- pixeltable/functions/json.py +1 -0
- pixeltable/functions/llama_cpp.py +1 -4
- pixeltable/functions/mistralai.py +31 -20
- pixeltable/functions/ollama.py +4 -18
- pixeltable/functions/openai.py +231 -113
- pixeltable/functions/replicate.py +11 -10
- pixeltable/functions/string.py +70 -7
- pixeltable/functions/timestamp.py +21 -8
- pixeltable/functions/together.py +66 -52
- pixeltable/functions/video.py +1 -0
- pixeltable/functions/vision.py +14 -11
- pixeltable/functions/whisper.py +2 -1
- pixeltable/globals.py +60 -26
- pixeltable/index/__init__.py +1 -1
- pixeltable/index/btree.py +5 -3
- pixeltable/index/embedding_index.py +15 -14
- pixeltable/io/__init__.py +1 -1
- pixeltable/io/external_store.py +30 -25
- pixeltable/io/fiftyone.py +6 -14
- pixeltable/io/globals.py +33 -27
- pixeltable/io/hf_datasets.py +2 -1
- pixeltable/io/label_studio.py +77 -68
- pixeltable/io/pandas.py +36 -23
- pixeltable/io/parquet.py +9 -12
- pixeltable/iterators/__init__.py +1 -0
- pixeltable/iterators/audio.py +205 -0
- pixeltable/iterators/document.py +19 -8
- pixeltable/iterators/image.py +6 -24
- pixeltable/iterators/string.py +3 -6
- pixeltable/iterators/video.py +1 -7
- pixeltable/metadata/__init__.py +7 -1
- pixeltable/metadata/converters/convert_10.py +2 -2
- pixeltable/metadata/converters/convert_15.py +1 -5
- pixeltable/metadata/converters/convert_16.py +2 -4
- pixeltable/metadata/converters/convert_17.py +2 -4
- pixeltable/metadata/converters/convert_18.py +2 -4
- pixeltable/metadata/converters/convert_19.py +2 -5
- pixeltable/metadata/converters/convert_20.py +1 -4
- pixeltable/metadata/converters/convert_21.py +4 -6
- pixeltable/metadata/converters/convert_22.py +1 -0
- pixeltable/metadata/converters/convert_23.py +5 -5
- pixeltable/metadata/converters/convert_24.py +12 -13
- pixeltable/metadata/converters/convert_26.py +23 -0
- pixeltable/metadata/converters/util.py +3 -4
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +13 -2
- pixeltable/plan.py +173 -98
- pixeltable/share/__init__.py +0 -0
- pixeltable/share/packager.py +218 -0
- pixeltable/store.py +42 -26
- pixeltable/type_system.py +102 -75
- pixeltable/utils/arrow.py +7 -8
- pixeltable/utils/coco.py +16 -17
- pixeltable/utils/code.py +1 -1
- pixeltable/utils/console_output.py +6 -3
- pixeltable/utils/description_helper.py +7 -7
- pixeltable/utils/documents.py +3 -1
- pixeltable/utils/filecache.py +12 -7
- pixeltable/utils/http_server.py +9 -8
- pixeltable/utils/iceberg.py +14 -0
- pixeltable/utils/media_store.py +3 -2
- pixeltable/utils/pytorch.py +11 -14
- pixeltable/utils/s3.py +1 -0
- pixeltable/utils/sql.py +1 -0
- pixeltable/utils/transactional_directory.py +2 -2
- {pixeltable-0.3.2.dist-info → pixeltable-0.3.4.dist-info}/METADATA +9 -9
- pixeltable-0.3.4.dist-info/RECORD +166 -0
- pixeltable-0.3.2.dist-info/RECORD +0 -161
- {pixeltable-0.3.2.dist-info → pixeltable-0.3.4.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.2.dist-info → pixeltable-0.3.4.dist-info}/WHEEL +0 -0
- {pixeltable-0.3.2.dist-info → pixeltable-0.3.4.dist-info}/entry_points.txt +0 -0
pixeltable/type_system.py
CHANGED
@@ -9,21 +9,22 @@ import typing
 import urllib.parse
 import urllib.request
 from pathlib import Path
-from typing import Any, Iterable, Mapping, Optional, Sequence, Union
+from typing import Any, Iterable, Literal, Mapping, Optional, Sequence, Union

-import PIL.Image
 import av  # type: ignore
 import jsonschema
 import jsonschema.protocols
 import jsonschema.validators
 import numpy as np
+import PIL.Image
 import pydantic
 import sqlalchemy as sql
-from typing import _GenericAlias  # type: ignore[attr-defined]
 from typing_extensions import _AnnotatedAlias

 import pixeltable.exceptions as excs

+from typing import _GenericAlias  # type: ignore[attr-defined]  # isort: skip
+

 class ColumnType:
     @enum.unique
@@ -45,9 +46,11 @@ class ColumnType:

         @classmethod
         def supertype(
-
-
-
+            cls,
+            type1: 'ColumnType.Type',
+            type2: 'ColumnType.Type',
+            # we need to pass this in because we can't easily append it as a class member
+            common_supertypes: dict[tuple['ColumnType.Type', 'ColumnType.Type'], 'ColumnType.Type'],
         ) -> Optional['ColumnType.Type']:
             if type1 == type2:
                 return type1
@@ -59,23 +62,23 @@ class ColumnType:
                 return t
             return None

-
     @enum.unique
     class DType(enum.Enum):
         """
         Base type used in images and arrays
         """
-
-
-
-
-
-
-
-
-
-
-
+
+        BOOL = (0,)
+        INT8 = (1,)
+        INT16 = (2,)
+        INT32 = (3,)
+        INT64 = (4,)
+        UINT8 = (5,)
+        UINT16 = (6,)
+        UINT32 = (7,)
+        UINT64 = (8,)
+        FLOAT16 = (9,)
+        FLOAT32 = (10,)
         FLOAT64 = 11

     scalar_types = {Type.STRING, Type.INT, Type.FLOAT, Type.BOOL, Type.TIMESTAMP}
@@ -113,10 +116,7 @@ class ColumnType:
         return json.dumps([t.as_dict() for t in type_list])

     def as_dict(self) -> dict:
-        return {
-            '_classname': self.__class__.__name__,
-            **self._as_dict(),
-        }
+        return {'_classname': self.__class__.__name__, **self._as_dict()}

     def _as_dict(self) -> dict:
         return {'nullable': self.nullable}
@@ -213,9 +213,9 @@
             return self.copy(nullable=(self.nullable or other.nullable))

         if self.is_invalid_type():
-            return other
+            return other.copy(nullable=(self.nullable or other.nullable))
         if other.is_invalid_type():
-            return self
+            return self.copy(nullable=(self.nullable or other.nullable))

         if self.is_scalar_type() and other.is_scalar_type():
             t = self.Type.supertype(self._type, other._type, self.common_supertypes)
@@ -277,10 +277,7 @@

     @classmethod
     def from_python_type(
-        cls,
-        t: Union[type, _GenericAlias],
-        nullable_default: bool = False,
-        allow_builtin_types: bool = True
+        cls, t: Union[type, _GenericAlias], nullable_default: bool = False, allow_builtin_types: bool = True
     ) -> Optional[ColumnType]:
         """
         Convert a Python type into a Pixeltable `ColumnType` instance.
@@ -295,28 +292,24 @@
         designations will be allowed regardless.
         """
         origin = typing.get_origin(t)
+        type_args = typing.get_args(t)
         if origin is typing.Union:
             # Check if `t` has the form Optional[T].
-
-            if len(union_args) == 2 and type(None) in union_args:
+            if len(type_args) == 2 and type(None) in type_args:
                 # `t` is a type of the form Optional[T] (equivalently, Union[T, None] or Union[None, T]).
                 # We treat it as the underlying type but with nullable=True.
-                underlying_py_type =
+                underlying_py_type = type_args[0] if type_args[1] is type(None) else type_args[1]
                 underlying = cls.from_python_type(underlying_py_type, allow_builtin_types=allow_builtin_types)
                 if underlying is not None:
                     return underlying.copy(nullable=True)
         elif origin is Required:
-
-            assert len(required_args) == 1
+            assert len(type_args) == 1
             return cls.from_python_type(
-
-
-                allow_builtin_types=allow_builtin_types
-            )
+                type_args[0], nullable_default=False, allow_builtin_types=allow_builtin_types
+            ).copy(nullable=False)
         elif origin is typing.Annotated:
-
-
-            parameters = annotated_args[1]
+            origin = type_args[0]
+            parameters = type_args[1]
             if isinstance(parameters, ColumnType):
                 return parameters.copy(nullable=nullable_default)
             else:
@@ -328,6 +321,11 @@
         if isinstance(t, type) and issubclass(t, _PxtType):
             return t.as_col_type(nullable=nullable_default)
         elif allow_builtin_types:
+            if t is Literal and len(type_args) > 0:
+                literal_type = cls.infer_common_literal_type(type_args)
+                if literal_type is None:
+                    return None
+                return literal_type.copy(nullable=(literal_type.nullable or nullable_default))
             if t is str:
                 return StringType(nullable=nullable_default)
             if t is int:
@@ -340,7 +338,7 @@
                 return TimestampType(nullable=nullable_default)
             if t is PIL.Image.Image:
                 return ImageType(nullable=nullable_default)
-            if
+            if isinstance(t, type) and issubclass(t, (Sequence, Mapping, pydantic.BaseModel)):
                 return JsonType(nullable=nullable_default)
         return None

@@ -349,7 +347,7 @@
         cls,
         t: Union[ColumnType, type, _AnnotatedAlias],
         nullable_default: bool = False,
-        allow_builtin_types: bool = True
+        allow_builtin_types: bool = True,
     ) -> ColumnType:
         """
         Convert any type recognizable by Pixeltable to its corresponding ColumnType.
@@ -415,7 +413,7 @@

     def _create_literal(self, val: Any) -> Any:
         """Create a literal of this type from val, including any needed conversions.
-
+        val is guaranteed to be non-None"""
         return val

     def create_literal(self, val: Any) -> Any:
@@ -484,12 +482,7 @@

     def to_json_schema(self) -> dict[str, Any]:
         if self.nullable:
-            return {
-                'anyOf': [
-                    self._to_json_schema(),
-                    {'type': 'null'},
-                ]
-            }
+            return {'anyOf': [self._to_json_schema(), {'type': 'null'}]}
         else:
             return self._to_json_schema()

@@ -612,7 +605,6 @@ class TimestampType(ColumnType):


 class JsonType(ColumnType):
-
     json_schema: Optional[dict[str, Any]]
     __validator: Optional[jsonschema.protocols.Validator]

@@ -699,8 +691,7 @@ class JsonType(ColumnType):
         superschema = self.__superschema(self.json_schema, other.json_schema)

         return JsonType(
-            json_schema=(None if len(superschema) == 0 else superschema),
-            nullable=(self.nullable or other.nullable)
+            json_schema=(None if len(superschema) == 0 else superschema), nullable=(self.nullable or other.nullable)
         )

     @classmethod
@@ -755,7 +746,7 @@ class JsonType(ColumnType):
         a_type = a.get('type')
         b_type = b.get('type')

-        if
+        if a_type in ('string', 'integer', 'number', 'boolean', 'object', 'array') and a_type == b_type:
             # a and b both have the same type designation, but are not identical. This can happen if
             # (for example) they have validators or other attributes that differ. In this case, we
             # generalize to {'type': t}, where t is their shared type, with no other qualifications.
@@ -793,15 +784,25 @@ class JsonType(ColumnType):


 class ArrayType(ColumnType):
-
     shape: Optional[tuple[Optional[int], ...]]
     pxt_dtype: Optional[ColumnType]
     dtype: Optional[ColumnType.Type]

-    def __init__(
+    def __init__(
+        self,
+        shape: Optional[tuple[Optional[int], ...]] = None,
+        dtype: Optional[ColumnType] = None,
+        nullable: bool = False,
+    ):
         super().__init__(self.Type.ARRAY, nullable=nullable)
         assert shape is None or dtype is not None, (shape, dtype)  # cannot specify a shape without a dtype
-        assert
+        assert (
+            dtype is None
+            or dtype.is_int_type()
+            or dtype.is_float_type()
+            or dtype.is_bool_type()
+            or dtype.is_string_type()
+        )

         self.shape = shape
         self.pxt_dtype = dtype  # we need this for copy() and __str__()
@@ -853,21 +854,39 @@ class ArrayType(ColumnType):
         dtype = None if d['dtype'] is None else cls.make_type(cls.Type(d['dtype']))
         return cls(shape, dtype, nullable=d['nullable'])

+    @classmethod
+    def from_np_dtype(cls, dtype: np.dtype, nullable: bool) -> Optional[ColumnType]:
+        """
+        Return pixeltable type corresponding to a given simple numpy dtype
+        """
+        if np.issubdtype(dtype, np.integer):
+            return IntType(nullable=nullable)
+
+        if np.issubdtype(dtype, np.floating):
+            return FloatType(nullable=nullable)
+
+        if dtype == np.bool_:
+            return BoolType(nullable=nullable)
+
+        if np.issubdtype(dtype, np.str_):
+            return StringType(nullable=nullable)
+
+        if np.issubdtype(dtype, np.character):
+            return StringType(nullable=nullable)
+
+        if np.issubdtype(dtype, np.datetime64):
+            return TimestampType(nullable=nullable)
+
+        return None
+
     @classmethod
     def from_literal(cls, val: np.ndarray, nullable: bool = False) -> Optional[ArrayType]:
         # determine our dtype
         assert isinstance(val, np.ndarray)
-
-
-        elif np.issubdtype(val.dtype, np.floating):
-            dtype = FloatType()
-        elif val.dtype == np.bool_:
-            dtype = BoolType()
-        elif val.dtype == np.str_:
-            dtype = StringType()
-        else:
+        pxttype: Optional[ColumnType] = cls.from_np_dtype(val.dtype, nullable)
+        if pxttype == None:
             return None
-        return cls(val.shape, dtype=
+        return cls(val.shape, dtype=pxttype, nullable=nullable)

     def is_valid_literal(self, val: np.ndarray) -> bool:
         if not isinstance(val, np.ndarray):
@@ -898,10 +917,7 @@ class ArrayType(ColumnType):
             return True

     def _to_json_schema(self) -> dict[str, Any]:
-        return {
-            'type': 'array',
-            'items': self.pxt_dtype._to_json_schema(),
-        }
+        return {'type': 'array', 'items': self.pxt_dtype._to_json_schema()}

     def _validate_literal(self, val: Any) -> None:
         if not isinstance(val, np.ndarray):
@@ -945,15 +961,19 @@ class ArrayType(ColumnType):

 class ImageType(ColumnType):
     def __init__(
-
-
+        self,
+        width: Optional[int] = None,
+        height: Optional[int] = None,
+        size: Optional[tuple[int, int]] = None,
+        mode: Optional[str] = None,
+        nullable: bool = False,
     ):
         """
         TODO: does it make sense to specify only width or height?
         """
         super().__init__(self.Type.IMAGE, nullable=nullable)
-        assert not(width is not None and size is not None)
-        assert not(height is not None and size is not None)
+        assert not (width is not None and size is not None)
+        assert not (height is not None and size is not None)
         if size is not None:
             self.width = size[0]
             self.height = size[1]
@@ -1143,6 +1163,7 @@ class DocumentType(ColumnType):
     def validate_media(self, val: Any) -> None:
         assert isinstance(val, str)
         from pixeltable.utils.documents import get_document_handle
+
        dh = get_document_handle(val)
        if dh is None:
            raise excs.Error(f'Not a recognized document format: {val}')
@@ -1156,6 +1177,7 @@ class Required(typing.Generic[T]):
     Marker class to indicate that a column is non-nullable in a schema definition. This has no meaning as a type hint,
     and is intended only for schema declarations.
     """
+
     pass


@@ -1178,6 +1200,7 @@ class _PxtType:
     `Image[(300, 300), 'RGB']`. The specialized forms resolve to `typing.Annotated` instances whose annotation is a
     `ColumnType`.
     """
+
     def __init__(self):
         raise TypeError(f'Type `{type(self)}` cannot be instantiated.')

@@ -1256,7 +1279,11 @@ class Image(PIL.Image.Image, _PxtType):
         mode: Optional[str] = None
         for param in params:
             if isinstance(param, tuple):
-                if
+                if (
+                    len(param) != 2
+                    or not isinstance(param[0], (int, type(None)))
+                    or not isinstance(param[1], (int, type(None)))
+                ):
                     raise TypeError(f'Invalid Image type parameter: {param}')
                 if size is not None:
                     raise TypeError(f'Duplicate Image type parameter: {param}')
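Most of the type_system.py hunks are formatter-driven reshuffling, but two are behavioral: the supertype computation now propagates nullability even when one operand is the invalid type, and ArrayType gains a from_np_dtype() helper that from_literal() delegates to. A minimal sketch of the new helper's behavior, inferred only from the hunks above (the expected results in the comments are assumptions, not captured output):

# Sketch only: exercises ArrayType.from_np_dtype / from_literal as added in the diff above.
import numpy as np

import pixeltable.type_system as ts

arr = np.zeros((2, 3), dtype=np.float32)
print(ts.ArrayType.from_np_dtype(arr.dtype, nullable=False))               # float dtypes map to FloatType
print(ts.ArrayType.from_np_dtype(np.dtype('complex64'), nullable=False))   # unsupported dtypes return None
print(ts.ArrayType.from_literal(arr))                                      # ArrayType with shape (2, 3) and a float dtype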
pixeltable/utils/arrow.py
CHANGED
@@ -1,13 +1,12 @@
+import datetime
 from typing import Any, Iterator, Optional, Union

 import numpy as np
 import pyarrow as pa
-import datetime

 import pixeltable.type_system as ts

-
-_pa_to_pt: dict[pa.DataType, ts.ColumnType] = {
+PA_TO_PXT_TYPES: dict[pa.DataType, ts.ColumnType] = {
     pa.string(): ts.StringType(nullable=True),
     pa.bool_(): ts.BoolType(nullable=True),
     pa.uint8(): ts.IntType(nullable=True),
@@ -19,7 +18,7 @@ _pa_to_pt: dict[pa.DataType, ts.ColumnType] = {
     pa.float32(): ts.FloatType(nullable=True),
 }

-
+PXT_TO_PA_TYPES: dict[type[ts.ColumnType], pa.DataType] = {
     ts.StringType: pa.string(),
     ts.TimestampType: pa.timestamp('us', tz=datetime.timezone.utc),  # postgres timestamp is microseconds
     ts.BoolType: pa.bool_(),
@@ -39,8 +38,8 @@ def to_pixeltable_type(arrow_type: pa.DataType) -> Optional[ts.ColumnType]:
     """
     if isinstance(arrow_type, pa.TimestampType):
         return ts.TimestampType(nullable=True)
-    elif arrow_type in
-        return
+    elif arrow_type in PA_TO_PXT_TYPES:
+        return PA_TO_PXT_TYPES[arrow_type]
     elif isinstance(arrow_type, pa.FixedShapeTensorType):
         dtype = to_pixeltable_type(arrow_type.value_type)
         if dtype is None:
@@ -54,8 +53,8 @@ def to_arrow_type(pixeltable_type: ts.ColumnType) -> Optional[pa.DataType]:
     """Convert a pixeltable DataType to a pyarrow datatype if one is defined.
     Returns None if no conversion is currently implemented.
     """
-    if pixeltable_type.__class__ in
-        return
+    if pixeltable_type.__class__ in PXT_TO_PA_TYPES:
+        return PXT_TO_PA_TYPES[pixeltable_type.__class__]
     elif isinstance(pixeltable_type, ts.ArrayType):
         return pa.fixed_shape_tensor(pa.from_numpy_dtype(pixeltable_type.numpy_dtype()), pixeltable_type.shape)
     else:
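Beyond moving the datetime import to the top, the arrow.py hunks rename the module-level pyarrow/Pixeltable type tables (now PA_TO_PXT_TYPES and PXT_TO_PA_TYPES). A minimal sketch of the two helpers that read them, based only on the signatures visible above:

# Sketch only: round-trip a couple of types through the helpers touched in this diff.
import pyarrow as pa

import pixeltable.type_system as ts
from pixeltable.utils.arrow import to_arrow_type, to_pixeltable_type

print(to_pixeltable_type(pa.string()))   # ts.StringType(nullable=True), via PA_TO_PXT_TYPES
print(to_arrow_type(ts.StringType()))    # pa.string(), via PXT_TO_PA_TYPES
print(to_arrow_type(ts.ArrayType(shape=(2, 3), dtype=ts.FloatType())))  # a pa.fixed_shape_tensor type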
pixeltable/utils/coco.py
CHANGED
@@ -22,6 +22,7 @@ Required format:
 }
 """

+
 def _verify_input_dict(input_dict: dict[str, Any]) -> None:
     """Verify that input_dict is a valid input dict for write_coco_dataset()"""
     if not isinstance(input_dict, dict):
@@ -30,7 +31,7 @@
         raise excs.Error(f'Missing key "image" in input dict: {input_dict}{format_msg}')
     if not isinstance(input_dict['image'], PIL.Image.Image):
         raise excs.Error(f'Value for "image" is not a PIL.Image.Image: {input_dict}{format_msg}')
-    if
+    if 'annotations' not in input_dict:
         raise excs.Error(f'Missing key "annotations" in input dict: {input_dict}{format_msg}')
     if not isinstance(input_dict['annotations'], list):
         raise excs.Error(f'Value for "annotations" is not a list: {input_dict}{format_msg}')
@@ -48,6 +49,7 @@
         if not isinstance(annotation['category'], (str, int)):
             raise excs.Error(f'Value for "category" is not a str or int: {annotation}{format_msg}')

+
 def write_coco_dataset(df: pxt.DataFrame, dest_path: Path) -> Path:
     """Export a DataFrame result set as a COCO dataset in dest_path and return the path of the data.json file."""
     # TODO: validate schema
@@ -96,12 +98,7 @@
         img_path = images_dir / f'{img_id}.jpg'
         img.save(img_path)

-        images.append({
-            'id': img_id,
-            'file_name': str(img_path),
-            'width': img.width,
-            'height': img.height,
-        })
+        images.append({'id': img_id, 'file_name': str(img_path), 'width': img.width, 'height': img.height})

         # create annotation records for this image
         for annotation in input_dict['annotations']:
@@ -109,15 +106,17 @@
             x, y, w, h = annotation['bbox']
             category = annotation['category']
             categories.add(category)
-            annotations.append(
-
-
-
-
-
-
-
-
+            annotations.append(
+                {
+                    'id': ann_id,
+                    'image_id': img_id,
+                    # we use the category name here and fix it up at the end, when we have assigned category ids
+                    'category_id': category,
+                    'bbox': annotation['bbox'],
+                    'area': w * h,
+                    'iscrowd': 0,
+                }
+            )

     # replace category names with ids
     category_ids = {category: id for id, category in enumerate(sorted(list(categories)))}
@@ -226,5 +225,5 @@ COCO_2017_CATEGORIES = {
     87: 'scissors',
     88: 'teddy bear',
     89: 'hair drier',
-    90: 'toothbrush'
+    90: 'toothbrush',
 }
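The coco.py hunks are reformatting plus the completed membership check on 'annotations'. For reference, a sketch of the per-row dict that _verify_input_dict() accepts, inferred from the keys validated in the hunks above (the sample values are made up):

# Sketch only: the dict layout each DataFrame row must produce for write_coco_dataset().
import PIL.Image

input_dict = {
    'image': PIL.Image.new('RGB', (640, 480)),  # must be a PIL.Image.Image
    'annotations': [                            # must be a list of dicts
        {'bbox': [10, 20, 100, 50], 'category': 'person'},  # bbox is [x, y, w, h]; category is str or int
    ],
}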
pixeltable/utils/code.py
CHANGED
@@ -3,9 +3,9 @@ from typing import Optional


 from pixeltable.func import Function
-
 # Utilities related to the organization of the Pixeltable codebase.

+
 def local_public_names(mod_name: str, exclude: Optional[list[str]] = None) -> list[str]:
     """
     Returns a list of all functions and submodules that are local to the specified module and are

pixeltable/utils/console_output.py
CHANGED

@@ -1,5 +1,6 @@
 import logging

+
 def map_level(verbosity: int) -> int:
     """
     Map verbosity level to logging level.
@@ -19,6 +20,7 @@ def map_level(verbosity: int) -> int:
         return logging.DEBUG
     return logging.INFO

+
 class ConsoleOutputHandler(logging.StreamHandler):
     def __init__(self, stream):
         super().__init__(stream)
@@ -29,13 +31,14 @@ class ConsoleOutputHandler(logging.StreamHandler):
         else:
             self.stream.write(record.msg + '\n')

+
 class ConsoleMessageFilter(logging.Filter):
     def filter(self, record: logging.LogRecord) -> bool:
         if hasattr(record, 'user_visible') and record.user_visible:
             return True
         return False

-class ConsoleLogger(logging.LoggerAdapter):
-    def __init__(self, logger:logging.Logger):
-        super().__init__(logger, extra={'user_visible' : True})

+class ConsoleLogger(logging.LoggerAdapter):
+    def __init__(self, logger: logging.Logger):
+        super().__init__(logger, extra={'user_visible': True})

pixeltable/utils/description_helper.py
CHANGED

@@ -25,6 +25,7 @@ class DescriptionHelper:
     DescriptionHelper can convert a list of descriptors into either HTML or plaintext and do something reasonable
     in each case.
     """
+
     __descriptors: list[_Descriptor]

     def __init__(self) -> None:
@@ -69,18 +70,17 @@
             return (
                 # Render the string as a single-cell DataFrame. This will ensure a consistent style of output in
                 # cases where strings appear alongside DataFrames in the same DescriptionHelper.
-                pd.DataFrame([descriptor.body])
-                .set_properties(None, **{'white-space': 'pre-wrap', 'text-align': 'left', 'font-weight': 'bold'})
-                .hide(axis='index')
+                pd.DataFrame([descriptor.body])
+                .style.set_properties(None, **{'white-space': 'pre-wrap', 'text-align': 'left', 'font-weight': 'bold'})
+                .hide(axis='index')
+                .hide(axis='columns')
             )
         else:
             styler = descriptor.styler
             if styler is None:
                 styler = descriptor.body.style
-            styler = (
-
-                .set_properties(None, **{'white-space': 'pre-wrap', 'text-align': 'left'})
-                .set_table_styles([dict(selector='th', props=[('text-align', 'left')])])
+            styler = styler.set_properties(None, **{'white-space': 'pre-wrap', 'text-align': 'left'}).set_table_styles(
+                [dict(selector='th', props=[('text-align', 'left')])]
             )
             if not descriptor.show_header:
                 styler = styler.hide(axis='columns')
pixeltable/utils/documents.py
CHANGED
@@ -83,6 +83,7 @@ def get_xml_handle(path: str) -> Optional[bs4.BeautifulSoup]:
 def get_markdown_handle(path: str) -> Optional[dict]:
     Env.get().require_package('mistune', [3, 0])
     import mistune
+
     try:
         with open(path, encoding='utf8') as file:
             text = file.read()
@@ -91,9 +92,10 @@ def get_markdown_handle(path: str) -> Optional[dict]:
     except Exception:
         return None

+
 def get_txt(path: str) -> Optional[str]:
     try:
-        with open(path,
+        with open(path, 'r') as f:
             doc = f.read()
             return doc if doc != '' else None
     except Exception:
pixeltable/utils/filecache.py
CHANGED
@@ -17,9 +17,9 @@ from pixeltable.env import Env

 _logger = logging.getLogger('pixeltable')

+
 @dataclass
 class CacheEntry:
-
     key: str
     tbl_id: UUID
     col_id: int
@@ -56,6 +56,7 @@ class FileCache:
     TODO:
     - implement MRU eviction for queries that exceed the capacity
     """
+
     __instance: Optional[FileCache] = None

     cache: OrderedDict[str, CacheEntry]
@@ -79,8 +80,7 @@ class FileCache:

     FileCacheColumnStats = namedtuple('FileCacheColumnStats', ('tbl_id', 'col_id', 'num_files', 'total_size'))
     FileCacheStats = namedtuple(
-        'FileCacheStats',
-        ('total_size', 'num_requests', 'num_hits', 'num_evictions', 'column_stats')
+        'FileCacheStats', ('total_size', 'num_requests', 'num_hits', 'num_evictions', 'column_stats')
     )

     @classmethod
@@ -154,7 +154,7 @@ class FileCache:
                 f'Consider increasing the cache size to at least {round(suggested_cache_size / (1 << 30), 1)} GiB '
                 f'(it is currently {round(self.capacity_bytes / (1 << 30), 1)} GiB).\n'
                 f'You can do this by setting the value of `file_cache_size_g` in: {str(Env.get()._config_file)}',
-                excs.PixeltableWarning
+                excs.PixeltableWarning,
             )
             self.new_redownload_witnessed = False

@@ -195,7 +195,9 @@ class FileCache:
             self.evicted_working_set_keys.add(key)
             self.new_redownload_witnessed = True
         self.keys_retrieved.add(key)
-        entry = CacheEntry(
+        entry = CacheEntry(
+            key, tbl_id, col_id, file_info.st_size, datetime.fromtimestamp(file_info.st_mtime), path.suffix
+        )
         self.cache[key] = entry
         self.total_size += entry.size
         new_path = entry.path
@@ -217,7 +219,9 @@ class FileCache:
         # Make a record of the eviction, so that we can generate a warning later if the key is retrieved again.
         self.keys_evicted_after_retrieval.add(lru_entry.key)
         os.remove(str(lru_entry.path))
-        _logger.debug(
+        _logger.debug(
+            f'evicted entry for cell {lru_entry.key} from file cache (of size {lru_entry.size // (1 << 20)} MiB)'
+        )

     def set_capacity(self, capacity_bytes: int) -> None:
         self.capacity_bytes = capacity_bytes
@@ -232,7 +236,8 @@ class FileCache:
             t[0] += 1
             t[1] += entry.size
         col_stats = [
-            self.FileCacheColumnStats(tbl_id, col_id, num_files, size)
+            self.FileCacheColumnStats(tbl_id, col_id, num_files, size)
+            for (tbl_id, col_id), (num_files, size) in d.items()
         ]
         col_stats.sort(key=lambda e: e[3], reverse=True)
         return self.FileCacheStats(self.total_size, self.num_requests, self.num_hits, self.num_evictions, col_stats)