pixeltable-0.3.3-py3-none-any.whl → pixeltable-0.3.5-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- pixeltable/__init__.py +1 -0
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/catalog.py +9 -2
- pixeltable/catalog/column.py +1 -1
- pixeltable/catalog/dir.py +1 -1
- pixeltable/catalog/table.py +1 -1
- pixeltable/catalog/table_version.py +12 -2
- pixeltable/catalog/table_version_path.py +2 -2
- pixeltable/catalog/view.py +64 -20
- pixeltable/dataframe.py +14 -14
- pixeltable/env.py +20 -3
- pixeltable/exec/component_iteration_node.py +1 -2
- pixeltable/exec/expr_eval/evaluators.py +4 -2
- pixeltable/exec/expr_eval/expr_eval_node.py +4 -1
- pixeltable/exprs/comparison.py +8 -4
- pixeltable/exprs/data_row.py +5 -3
- pixeltable/exprs/expr.py +9 -2
- pixeltable/exprs/function_call.py +155 -313
- pixeltable/func/aggregate_function.py +29 -15
- pixeltable/func/callable_function.py +11 -8
- pixeltable/func/expr_template_function.py +3 -9
- pixeltable/func/function.py +148 -74
- pixeltable/func/signature.py +65 -30
- pixeltable/func/udf.py +1 -1
- pixeltable/functions/__init__.py +1 -0
- pixeltable/functions/deepseek.py +121 -0
- pixeltable/functions/image.py +7 -7
- pixeltable/functions/openai.py +49 -10
- pixeltable/functions/video.py +14 -7
- pixeltable/globals.py +14 -3
- pixeltable/index/embedding_index.py +4 -13
- pixeltable/io/globals.py +88 -77
- pixeltable/io/hf_datasets.py +34 -34
- pixeltable/io/pandas.py +75 -87
- pixeltable/io/parquet.py +19 -27
- pixeltable/io/utils.py +115 -0
- pixeltable/iterators/audio.py +2 -1
- pixeltable/iterators/video.py +1 -1
- pixeltable/metadata/__init__.py +2 -1
- pixeltable/metadata/converters/convert_15.py +18 -8
- pixeltable/metadata/converters/convert_27.py +31 -0
- pixeltable/metadata/converters/convert_28.py +15 -0
- pixeltable/metadata/converters/convert_29.py +111 -0
- pixeltable/metadata/converters/util.py +12 -1
- pixeltable/metadata/notes.py +3 -0
- pixeltable/metadata/schema.py +8 -0
- pixeltable/share/__init__.py +1 -0
- pixeltable/share/packager.py +246 -0
- pixeltable/share/publish.py +97 -0
- pixeltable/type_system.py +87 -42
- pixeltable/utils/__init__.py +41 -0
- pixeltable/utils/arrow.py +45 -12
- pixeltable/utils/formatter.py +1 -1
- pixeltable/utils/iceberg.py +14 -0
- pixeltable/utils/media_store.py +1 -1
- {pixeltable-0.3.3.dist-info → pixeltable-0.3.5.dist-info}/METADATA +37 -50
- {pixeltable-0.3.3.dist-info → pixeltable-0.3.5.dist-info}/RECORD +60 -51
- {pixeltable-0.3.3.dist-info → pixeltable-0.3.5.dist-info}/WHEEL +1 -1
- {pixeltable-0.3.3.dist-info → pixeltable-0.3.5.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.3.dist-info → pixeltable-0.3.5.dist-info}/entry_points.txt +0 -0
pixeltable/type_system.py
CHANGED
@@ -8,12 +8,9 @@ import json
 import typing
 import urllib.parse
 import urllib.request
-from …
+from typing import Any, Iterable, Literal, Mapping, Optional, Sequence, Union
 
-
-from typing import Any, Iterable, Mapping, Optional, Sequence, Union
-
-import av  # type: ignore
+import av
 import jsonschema
 import jsonschema.protocols
 import jsonschema.validators

@@ -24,6 +21,9 @@ import sqlalchemy as sql
 from typing_extensions import _AnnotatedAlias
 
 import pixeltable.exceptions as excs
+from pixeltable.utils import parse_local_file_path
+
+from typing import _GenericAlias  # type: ignore[attr-defined]  # isort: skip
 
 
 class ColumnType:

@@ -47,8 +47,8 @@ class ColumnType:
     @classmethod
     def supertype(
         cls,
-        type1: 'ColumnType.Type',
-        type2: 'ColumnType.Type',
+        type1: Optional['ColumnType.Type'],
+        type2: Optional['ColumnType.Type'],
         # we need to pass this in because we can't easily append it as a class member
         common_supertypes: dict[tuple['ColumnType.Type', 'ColumnType.Type'], 'ColumnType.Type'],
     ) -> Optional['ColumnType.Type']:

@@ -93,6 +93,9 @@ class ColumnType:
         self._type = t
         self._nullable = nullable
 
+    def has_supertype(self) -> bool:
+        return True
+
     @property
     def nullable(self) -> bool:
         return self._nullable

@@ -213,9 +216,9 @@ class ColumnType:
             return self.copy(nullable=(self.nullable or other.nullable))
 
         if self.is_invalid_type():
-            return other
+            return other.copy(nullable=(self.nullable or other.nullable))
         if other.is_invalid_type():
-            return self
+            return self.copy(nullable=(self.nullable or other.nullable))
 
         if self.is_scalar_type() and other.is_scalar_type():
             t = self.Type.supertype(self._type, other._type, self.common_supertypes)

@@ -271,8 +274,10 @@ class ColumnType:
                 inferred_type = val_type
             else:
                 inferred_type = inferred_type.supertype(val_type)
-
-
+        if inferred_type is None:
+            return None
+        if not inferred_type.has_supertype():
+            return inferred_type
         return inferred_type
 
     @classmethod

@@ -292,26 +297,24 @@ class ColumnType:
         designations will be allowed regardless.
         """
         origin = typing.get_origin(t)
+        type_args = typing.get_args(t)
         if origin is typing.Union:
             # Check if `t` has the form Optional[T].
-            …
-            if len(union_args) == 2 and type(None) in union_args:
+            if len(type_args) == 2 and type(None) in type_args:
                 # `t` is a type of the form Optional[T] (equivalently, Union[T, None] or Union[None, T]).
                 # We treat it as the underlying type but with nullable=True.
-                underlying_py_type = …
+                underlying_py_type = type_args[0] if type_args[1] is type(None) else type_args[1]
                 underlying = cls.from_python_type(underlying_py_type, allow_builtin_types=allow_builtin_types)
                 if underlying is not None:
                     return underlying.copy(nullable=True)
         elif origin is Required:
-            …
-            assert len(required_args) == 1
+            assert len(type_args) == 1
             return cls.from_python_type(
-                …
-            )
+                type_args[0], nullable_default=False, allow_builtin_types=allow_builtin_types
+            ).copy(nullable=False)
         elif origin is typing.Annotated:
-            …
-            …
-            parameters = annotated_args[1]
+            origin = type_args[0]
+            parameters = type_args[1]
             if isinstance(parameters, ColumnType):
                 return parameters.copy(nullable=nullable_default)
             else:

@@ -323,6 +326,11 @@ class ColumnType:
         if isinstance(t, type) and issubclass(t, _PxtType):
             return t.as_col_type(nullable=nullable_default)
         elif allow_builtin_types:
+            if t is Literal and len(type_args) > 0:
+                literal_type = cls.infer_common_literal_type(type_args)
+                if literal_type is None:
+                    return None
+                return literal_type.copy(nullable=(literal_type.nullable or nullable_default))
             if t is str:
                 return StringType(nullable=nullable_default)
             if t is int:

@@ -335,7 +343,7 @@ class ColumnType:
             return TimestampType(nullable=nullable_default)
         if t is PIL.Image.Image:
             return ImageType(nullable=nullable_default)
-        if …
+        if isinstance(t, type) and issubclass(t, (Sequence, Mapping, pydantic.BaseModel)):
             return JsonType(nullable=nullable_default)
         return None
 

@@ -394,12 +402,9 @@ class ColumnType:
     def _validate_file_path(self, val: Any) -> None:
         """Raises TypeError if not a valid local file path or not a path/byte sequence"""
         if isinstance(val, str):
-            …
-            if …
-            …
-            path = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed.path)))
-            if not path.is_file():
-                raise TypeError(f'File not found: {str(path)}')
+            path = parse_local_file_path(val)
+            if path is not None and not path.is_file():
+                raise TypeError(f'File not found: {path}')
         else:
             if not isinstance(val, bytes):
                 raise TypeError(f'expected file path or bytes, got {type(val)}')

@@ -492,7 +497,7 @@ class InvalidType(ColumnType):
         super().__init__(self.Type.INVALID, nullable=nullable)
 
     def to_sa_type(self) -> sql.types.TypeEngine:
-        …
+        return sql.types.NullType()
 
     def print_value(self, val: Any) -> str:
         return str(val)

@@ -505,6 +510,9 @@ class StringType(ColumnType):
     def __init__(self, nullable: bool = False):
         super().__init__(self.Type.STRING, nullable=nullable)
 
+    def has_supertype(self):
+        return not self.nullable
+
     def to_sa_type(self) -> sql.types.TypeEngine:
         return sql.String()
 

@@ -588,6 +596,9 @@ class TimestampType(ColumnType):
     def __init__(self, nullable: bool = False):
         super().__init__(self.Type.TIMESTAMP, nullable=nullable)
 
+    def has_supertype(self):
+        return not self.nullable
+
     def to_sa_type(self) -> sql.types.TypeEngine:
         return sql.TIMESTAMP(timezone=True)
 

@@ -598,6 +609,8 @@ class TimestampType(ColumnType):
     def _create_literal(self, val: Any) -> Any:
         if isinstance(val, str):
             return datetime.datetime.fromisoformat(val)
+        if isinstance(val, datetime.datetime):
+            return val
         return val
 
 

@@ -648,6 +661,10 @@ class JsonType(ColumnType):
         return val_type.print_value(val)
 
     def _validate_literal(self, val: Any) -> None:
+        if isinstance(val, tuple):
+            val = list(val)
+        if isinstance(val, pydantic.BaseModel):
+            val = val.model_dump()
         if not self.__is_valid_json(val):
             raise TypeError(f'That literal is not a valid Pixeltable JSON object: {val}')
         if self.__validator is not None:

@@ -815,14 +832,20 @@ class ArrayType(ColumnType):
         return hash((self._type, self.nullable, self.shape, self.dtype))
 
     def supertype(self, other: ColumnType) -> Optional[ArrayType]:
+        basic_supertype = super().supertype(other)
+        if basic_supertype is not None:
+            assert isinstance(basic_supertype, ArrayType)
+            return basic_supertype
+
         if not isinstance(other, ArrayType):
             return None
+
         super_dtype = self.Type.supertype(self.dtype, other.dtype, self.common_supertypes)
         if super_dtype is None:
             # if the dtypes are incompatible, then the supertype is a fully general array
             return ArrayType(nullable=(self.nullable or other.nullable))
         super_shape: Optional[tuple[Optional[int], ...]]
-        if len(self.shape) != len(other.shape):
+        if self.shape is None or other.shape is None or len(self.shape) != len(other.shape):
             super_shape = None
         else:
             super_shape = tuple(n1 if n1 == n2 else None for n1, n2 in zip(self.shape, other.shape))

@@ -851,23 +874,39 @@ class ArrayType(ColumnType):
         dtype = None if d['dtype'] is None else cls.make_type(cls.Type(d['dtype']))
         return cls(shape, dtype, nullable=d['nullable'])
 
+    @classmethod
+    def from_np_dtype(cls, dtype: np.dtype, nullable: bool) -> Optional[ColumnType]:
+        """
+        Return pixeltable type corresponding to a given simple numpy dtype
+        """
+        if np.issubdtype(dtype, np.integer):
+            return IntType(nullable=nullable)
+
+        if np.issubdtype(dtype, np.floating):
+            return FloatType(nullable=nullable)
+
+        if dtype == np.bool_:
+            return BoolType(nullable=nullable)
+
+        if np.issubdtype(dtype, np.str_):
+            return StringType(nullable=nullable)
+
+        if np.issubdtype(dtype, np.character):
+            return StringType(nullable=nullable)
+
+        if np.issubdtype(dtype, np.datetime64):
+            return TimestampType(nullable=nullable)
+
+        return None
+
     @classmethod
     def from_literal(cls, val: np.ndarray, nullable: bool = False) -> Optional[ArrayType]:
         # determine our dtype
         assert isinstance(val, np.ndarray)
-
-        if …
-            dtype = IntType()
-        elif np.issubdtype(val.dtype, np.floating):
-            dtype = FloatType()
-        elif val.dtype == np.bool_:
-            dtype = BoolType()
-        elif np.issubdtype(val.dtype, np.str_):
-            # Note that this includes NumPy types like '<U1' -- arrays of single Unicode characters
-            dtype = StringType()
-        else:
+        pxttype: Optional[ColumnType] = cls.from_np_dtype(val.dtype, nullable)
+        if pxttype == None:
             return None
-        return cls(val.shape, dtype=…
+        return cls(val.shape, dtype=pxttype, nullable=nullable)
 
     def is_valid_literal(self, val: np.ndarray) -> bool:
         if not isinstance(val, np.ndarray):

@@ -990,8 +1029,14 @@ class ImageType(ColumnType):
         return hash((self._type, self.nullable, self.size, self.mode))
 
     def supertype(self, other: ColumnType) -> Optional[ImageType]:
+        basic_supertype = super().supertype(other)
+        if basic_supertype is not None:
+            assert isinstance(basic_supertype, ImageType)
+            return basic_supertype
+
         if not isinstance(other, ImageType):
             return None
+
         width = self.width if self.width == other.width else None
         height = self.height if self.height == other.height else None
         mode = self.mode if self.mode == other.mode else None
pixeltable/utils/__init__.py
CHANGED
@@ -1,3 +1,10 @@
+import hashlib
+import urllib.parse
+import urllib.request
+from pathlib import Path
+from typing import Optional, Union
+
+
 def print_perf_counter_delta(delta: float) -> str:
     """Prints a performance counter delta in a human-readable format.
 

@@ -15,3 +22,37 @@ def print_perf_counter_delta(delta: float) -> str:
         return f'{delta * 1e3:.2f} ms'
     else:
         return f'{delta:.2f} s'
+
+
+def sha256sum(path: Union[Path, str]) -> str:
+    """
+    Compute the SHA256 hash of a file.
+    """
+    if isinstance(path, str):
+        path = Path(path)
+
+    h = hashlib.sha256()
+    with open(path, 'rb') as file:
+        while chunk := file.read(h.block_size):
+            h.update(chunk)
+
+    return h.hexdigest()
+
+
+def parse_local_file_path(file_or_url: str) -> Optional[Path]:
+    """
+    Parses a string that may be either a URL or a local file path.
+
+    If the string is a local file path or a file-scheme URL (file://), then a Path object will be returned.
+    Otherwise, None will be returned.
+    """
+    parsed = urllib.parse.urlparse(file_or_url)
+    if len(parsed.scheme) <= 1:
+        # We're using `urlparse` to help distinguish file paths from URLs. If there is no scheme, then it's a file path.
+        # If there's a single-character scheme, we also interpret this as a file path; this insures that drive letters
+        # on Windows pathnames are correctly handled.
+        return Path(file_or_url).absolute()
+    elif parsed.scheme == 'file':
+        return Path(urllib.parse.unquote(urllib.request.url2pathname(parsed.path)))
+    else:
+        return None
pixeltable/utils/arrow.py
CHANGED
@@ -6,8 +6,10 @@ import pyarrow as pa
 
 import pixeltable.type_system as ts
 
-_pa_to_pt: dict[pa.DataType, ts.ColumnType] = {
+PA_TO_PXT_TYPES: dict[pa.DataType, ts.ColumnType] = {
     pa.string(): ts.StringType(nullable=True),
+    pa.large_string(): ts.StringType(nullable=True),
+    pa.timestamp('us', tz=datetime.timezone.utc): ts.TimestampType(nullable=True),
     pa.bool_(): ts.BoolType(nullable=True),
     pa.uint8(): ts.IntType(nullable=True),
     pa.int8(): ts.IntType(nullable=True),

@@ -16,9 +18,10 @@ _pa_to_pt: dict[pa.DataType, ts.ColumnType] = {
     pa.int32(): ts.IntType(nullable=True),
     pa.int64(): ts.IntType(nullable=True),
     pa.float32(): ts.FloatType(nullable=True),
+    pa.float64(): ts.FloatType(nullable=True),
 }
 
-_pt_to_pa: dict[type[ts.ColumnType], pa.DataType] = {
+PXT_TO_PA_TYPES: dict[type[ts.ColumnType], pa.DataType] = {
     ts.StringType: pa.string(),
     ts.TimestampType: pa.timestamp('us', tz=datetime.timezone.utc),  # postgres timestamp is microseconds
     ts.BoolType: pa.bool_(),

@@ -32,19 +35,20 @@ _pt_to_pa: dict[type[ts.ColumnType], pa.DataType] = {
 }
 
 
-def to_pixeltable_type(arrow_type: pa.DataType) -> Optional[ts.ColumnType]:
+def to_pixeltable_type(arrow_type: pa.DataType, nullable: bool) -> Optional[ts.ColumnType]:
     """Convert a pyarrow DataType to a pixeltable ColumnType if one is defined.
     Returns None if no conversion is currently implemented.
     """
     if isinstance(arrow_type, pa.TimestampType):
-        return ts.TimestampType(nullable= …
-    elif arrow_type in …
-        …
+        return ts.TimestampType(nullable=nullable)
+    elif arrow_type in PA_TO_PXT_TYPES:
+        pt = PA_TO_PXT_TYPES[arrow_type]
+        return pt.copy(nullable=nullable)
     elif isinstance(arrow_type, pa.FixedShapeTensorType):
-        dtype = to_pixeltable_type(arrow_type.value_type)
+        dtype = to_pixeltable_type(arrow_type.value_type, nullable)
         if dtype is None:
             return None
-        return ts.ArrayType(shape=arrow_type.shape, dtype=dtype)
+        return ts.ArrayType(shape=arrow_type.shape, dtype=dtype, nullable=nullable)
     else:
         return None
 

@@ -53,16 +57,25 @@ def to_arrow_type(pixeltable_type: ts.ColumnType) -> Optional[pa.DataType]:
     """Convert a pixeltable DataType to a pyarrow datatype if one is defined.
     Returns None if no conversion is currently implemented.
     """
-    if pixeltable_type.__class__ in …
-        return …
+    if pixeltable_type.__class__ in PXT_TO_PA_TYPES:
+        return PXT_TO_PA_TYPES[pixeltable_type.__class__]
     elif isinstance(pixeltable_type, ts.ArrayType):
         return pa.fixed_shape_tensor(pa.from_numpy_dtype(pixeltable_type.numpy_dtype()), pixeltable_type.shape)
     else:
         return None
 
 
-def …
-…
+def ar_infer_schema(
+    arrow_schema: pa.Schema, schema_overrides: dict[str, Any], primary_key: list[str]
+) -> dict[str, ts.ColumnType]:
+    """Convert a pyarrow Schema to a schema using pyarrow names and pixeltable types."""
+    ar_schema = {
+        field.name: to_pixeltable_type(field.type, field.name not in primary_key)
+        if field.name not in schema_overrides
+        else schema_overrides[field.name]
+        for field in arrow_schema
+    }
+    return ar_schema
 
 
 def to_arrow_schema(pixeltable_schema: dict[str, Any]) -> pa.Schema:

@@ -96,3 +109,23 @@ def iter_tuples(batch: Union[pa.Table, pa.RecordBatch]) -> Iterator[dict[str, Any]]:
 
     for i in range(batch_size):
         yield {col_name: values[i] for col_name, values in pydict.items()}
+
+
+def iter_tuples2(
+    batch: Union[pa.Table, pa.RecordBatch], col_mapping: Optional[dict[str, str]], schema: dict[str, ts.ColumnType]
+) -> Iterator[dict[str, Any]]:
+    """Convert a RecordBatch to an iterator of dictionaries. also works with pa.Table and pa.RowGroup"""
+    pydict = to_pydict(batch)
+    assert len(pydict) > 0, 'empty record batch'
+    for _, v in pydict.items():
+        batch_size = len(v)
+        break
+
+    for i in range(batch_size):
+        # Convert a row to insertable format
+        yield {
+            (pxt_name := col_name if col_mapping is None else col_mapping[col_name]): schema[pxt_name].create_literal(
+                values[i]
+            )
+            for col_name, values in pydict.items()
+        }
pixeltable/utils/iceberg.py
ADDED
@@ -0,0 +1,14 @@
+from pathlib import Path
+from typing import Union
+
+from pyiceberg.catalog.sql import SqlCatalog
+
+
+def sqlite_catalog(warehouse_path: Union[str, Path], name: str = 'pixeltable') -> SqlCatalog:
+    """
+    Instantiate a sqlite Iceberg catalog at the specified path. If no catalog exists, one will be created.
+    """
+    if isinstance(warehouse_path, str):
+        warehouse_path = Path(warehouse_path)
+    warehouse_path.mkdir(exist_ok=True)
+    return SqlCatalog(name, uri=f'sqlite:///{warehouse_path}/catalog.db', warehouse=f'file://{warehouse_path}')
pixeltable/utils/media_store.py
CHANGED
@@ -30,7 +30,7 @@ class MediaStore:
         the environment's media_dir.
         """
         id_hex = uuid.uuid4().hex
-        parent = Env.get().media_dir / tbl_id.hex / id_hex[ …
+        parent = Env.get().media_dir / tbl_id.hex / id_hex[:2] / id_hex[:4]
         parent.mkdir(parents=True, exist_ok=True)
         return parent / f'{tbl_id.hex}_{col_id}_{version}_{id_hex}{ext or ""}'
 
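The media-store change above shards stored files under two prefix directories derived from the random hex id (note that the 4-character second level repeats the 2-character first level). A sketch of the resulting layout; every concrete directory and id here is invented:

import uuid
from pathlib import Path

media_dir = Path('/home/user/.pixeltable/media')   # hypothetical media_dir
tbl_id, col_id, version = uuid.uuid4(), 3, 7       # hypothetical table id, column id, version
id_hex = uuid.uuid4().hex

parent = media_dir / tbl_id.hex / id_hex[:2] / id_hex[:4]
print(parent / f'{tbl_id.hex}_{col_id}_{version}_{id_hex}.jpg')
# -> .../media/<tbl_id>/<first 2 hex chars>/<first 4 hex chars>/<tbl_id>_3_7_<id_hex>.jpg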