pixeltable-0.2.4-py3-none-any.whl → pixeltable-0.2.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/catalog/column.py +25 -48
- pixeltable/catalog/insertable_table.py +7 -4
- pixeltable/catalog/table.py +163 -57
- pixeltable/catalog/table_version.py +416 -140
- pixeltable/catalog/table_version_path.py +2 -2
- pixeltable/client.py +0 -4
- pixeltable/dataframe.py +65 -21
- pixeltable/env.py +16 -1
- pixeltable/exec/cache_prefetch_node.py +1 -1
- pixeltable/exec/in_memory_data_node.py +11 -7
- pixeltable/exprs/comparison.py +3 -3
- pixeltable/exprs/data_row.py +5 -1
- pixeltable/exprs/literal.py +16 -4
- pixeltable/exprs/row_builder.py +8 -40
- pixeltable/ext/__init__.py +5 -0
- pixeltable/ext/functions/yolox.py +92 -0
- pixeltable/func/aggregate_function.py +15 -15
- pixeltable/func/expr_template_function.py +9 -1
- pixeltable/func/globals.py +24 -14
- pixeltable/func/signature.py +18 -12
- pixeltable/func/udf.py +7 -2
- pixeltable/functions/__init__.py +8 -8
- pixeltable/functions/eval.py +7 -8
- pixeltable/functions/huggingface.py +47 -19
- pixeltable/functions/openai.py +2 -2
- pixeltable/functions/util.py +11 -0
- pixeltable/index/__init__.py +2 -0
- pixeltable/index/base.py +49 -0
- pixeltable/index/embedding_index.py +95 -0
- pixeltable/metadata/schema.py +45 -22
- pixeltable/plan.py +15 -34
- pixeltable/store.py +38 -41
- pixeltable/tests/conftest.py +5 -11
- pixeltable/tests/ext/test_yolox.py +21 -0
- pixeltable/tests/functions/test_fireworks.py +1 -0
- pixeltable/tests/functions/test_huggingface.py +2 -2
- pixeltable/tests/functions/test_openai.py +15 -5
- pixeltable/tests/functions/test_together.py +1 -0
- pixeltable/tests/test_component_view.py +14 -5
- pixeltable/tests/test_dataframe.py +19 -18
- pixeltable/tests/test_exprs.py +99 -102
- pixeltable/tests/test_function.py +51 -43
- pixeltable/tests/test_index.py +138 -0
- pixeltable/tests/test_migration.py +2 -1
- pixeltable/tests/test_snapshot.py +24 -1
- pixeltable/tests/test_table.py +101 -25
- pixeltable/tests/test_types.py +30 -0
- pixeltable/tests/test_video.py +16 -16
- pixeltable/tests/test_view.py +5 -0
- pixeltable/tests/utils.py +43 -9
- pixeltable/tool/create_test_db_dump.py +16 -0
- pixeltable/type_system.py +37 -45
- {pixeltable-0.2.4.dist-info → pixeltable-0.2.5.dist-info}/METADATA +5 -4
- {pixeltable-0.2.4.dist-info → pixeltable-0.2.5.dist-info}/RECORD +56 -49
- {pixeltable-0.2.4.dist-info → pixeltable-0.2.5.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.4.dist-info → pixeltable-0.2.5.dist-info}/WHEEL +0 -0
pixeltable/tests/test_table.py
CHANGED
|
@@ -18,7 +18,7 @@ from pixeltable import exceptions as excs
|
|
|
18
18
|
from pixeltable.iterators import FrameIterator
|
|
19
19
|
from pixeltable.tests.utils import \
|
|
20
20
|
make_tbl, create_table_data, read_data_file, get_video_files, get_audio_files, get_image_files, get_documents, \
|
|
21
|
-
assert_resultset_eq, assert_hf_dataset_equal, make_test_arrow_table
|
|
21
|
+
assert_resultset_eq, assert_hf_dataset_equal, make_test_arrow_table, validate_update_status
|
|
22
22
|
from pixeltable.tests.utils import skip_test_if_not_installed
|
|
23
23
|
from pixeltable.type_system import \
|
|
24
24
|
StringType, IntType, FloatType, TimestampType, ImageType, VideoType, JsonType, BoolType, ArrayType, AudioType, \
|
|
@@ -41,6 +41,21 @@ class TestTable:
|
|
|
41
41
|
def add1(a: int) -> int:
|
|
42
42
|
return a + 1
|
|
43
43
|
|
|
44
|
+
@pxt.uda(
|
|
45
|
+
update_types=[IntType()], value_type=IntType(), requires_order_by=True,
|
|
46
|
+
allows_window=True)
|
|
47
|
+
class window_fn:
|
|
48
|
+
def __init__(self):
|
|
49
|
+
pass
|
|
50
|
+
def update(self, i: int) -> None:
|
|
51
|
+
pass
|
|
52
|
+
def value(self) -> int:
|
|
53
|
+
return 1
|
|
54
|
+
|
|
55
|
+
@pxt.expr_udf(param_types=[IntType(nullable=False)])
|
|
56
|
+
def add1(a: int) -> int:
|
|
57
|
+
return a + 1
|
|
58
|
+
|
|
44
59
|
def test_create(self, test_client: pxt.Client) -> None:
|
|
45
60
|
cl = test_client
|
|
46
61
|
cl.create_dir('dir1')
|
|
@@ -56,7 +71,7 @@ class TestTable:
|
|
|
56
71
|
with pytest.raises(excs.Error):
|
|
57
72
|
_ = cl.create_table('1test', schema)
|
|
58
73
|
with pytest.raises(excs.Error):
|
|
59
|
-
_ =
|
|
74
|
+
_ = cl.create_table('bad name', schema={'c1': StringType()})
|
|
60
75
|
with pytest.raises(excs.Error):
|
|
61
76
|
_ = cl.create_table('test', schema)
|
|
62
77
|
with pytest.raises(excs.Error):
|
|
@@ -299,10 +314,6 @@ class TestTable:
|
|
|
299
314
|
cl.create_table('test', {'c1': {'type': StringType(), 'stored': 'true'}})
|
|
300
315
|
assert '"stored" must be a bool' in str(exc_info.value)
|
|
301
316
|
|
|
302
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
303
|
-
cl.create_table('test', {'c1': {'type': StringType(), 'indexed': 'true'}})
|
|
304
|
-
assert '"indexed" must be a bool' in str(exc_info.value)
|
|
305
|
-
|
|
306
317
|
with pytest.raises(excs.Error) as exc_info:
|
|
307
318
|
cl.create_table('test', {'c1': StringType()}, primary_key='c2')
|
|
308
319
|
assert 'primary key column c2 not found' in str(exc_info.value).lower()
|
|
@@ -501,18 +512,8 @@ class TestTable:
|
|
|
501
512
|
# a non-materialized column that refers to another non-materialized column
|
|
502
513
|
view.add_column(c4=view.c2.rotate(60), stored=False)
|
|
503
514
|
|
|
504
|
-
@pxt.uda(
|
|
505
|
-
name='window_fn', update_types=[IntType()], value_type=IntType(), requires_order_by = True,
|
|
506
|
-
allows_window = True)
|
|
507
|
-
class WindowFnAggregator:
|
|
508
|
-
def __init__(self):
|
|
509
|
-
pass
|
|
510
|
-
def update(self, i: int) -> None:
|
|
511
|
-
pass
|
|
512
|
-
def value(self) -> int:
|
|
513
|
-
return 1
|
|
514
515
|
# cols computed with window functions are stored by default
|
|
515
|
-
view.add_column(c5=window_fn(view.frame_idx, 1, group_by=view.video))
|
|
516
|
+
view.add_column(c5=self.window_fn(view.frame_idx, 1, group_by=view.video))
|
|
516
517
|
|
|
517
518
|
# reload to make sure that metadata gets restored correctly
|
|
518
519
|
cl = pxt.Client(reload=True)
|
|
@@ -553,6 +554,23 @@ class TestTable:
|
|
|
553
554
|
cl.drop_table('test_tbl')
|
|
554
555
|
assert MediaStore.count(view.get_id()) == 0
|
|
555
556
|
|
|
557
|
+
def test_insert_nulls(self, test_client: pxt.Client) -> None:
|
|
558
|
+
cl = test_client
|
|
559
|
+
schema = {
|
|
560
|
+
'c1': StringType(nullable=True),
|
|
561
|
+
'c2': IntType(nullable=True),
|
|
562
|
+
'c3': FloatType(nullable=True),
|
|
563
|
+
'c4': BoolType(nullable=True),
|
|
564
|
+
'c5': ArrayType((2, 3), dtype=IntType(), nullable=True),
|
|
565
|
+
'c6': JsonType(nullable=True),
|
|
566
|
+
'c7': ImageType(nullable=True),
|
|
567
|
+
'c8': VideoType(nullable=True),
|
|
568
|
+
}
|
|
569
|
+
t = cl.create_table('test1', schema)
|
|
570
|
+
status = t.insert(c1='abc')
|
|
571
|
+
assert status.num_rows == 1
|
|
572
|
+
assert status.num_excs == 0
|
|
573
|
+
|
|
556
574
|
def test_insert(self, test_client: pxt.Client) -> None:
|
|
557
575
|
cl = test_client
|
|
558
576
|
schema = {
|
|
@@ -650,7 +668,63 @@ class TestTable:
|
|
|
650
668
|
t2 = cl.get_table('test')
|
|
651
669
|
_ = t2.show(n=0)
|
|
652
670
|
|
|
653
|
-
def
|
|
671
|
+
def test_batch_update(self, test_tbl: pxt.Table) -> None:
|
|
672
|
+
t = test_tbl
|
|
673
|
+
validate_update_status(
|
|
674
|
+
t.batch_update([{'c1': '1', 'c2': 1}, {'c1': '2', 'c2': 2}]),
|
|
675
|
+
expected_rows=2)
|
|
676
|
+
assert t.where(t.c2 == 1).collect()[0]['c1'] == '1'
|
|
677
|
+
assert t.where(t.c2 == 2).collect()[0]['c1'] == '2'
|
|
678
|
+
validate_update_status(
|
|
679
|
+
t.batch_update([{'c1': 'one', '_rowid': (1,)}, {'c1': 'two', '_rowid': (2,)}]),
|
|
680
|
+
expected_rows=2)
|
|
681
|
+
assert t.where(t.c2 == 1).collect()[0]['c1'] == 'one'
|
|
682
|
+
assert t.where(t.c2 == 2).collect()[0]['c1'] == 'two'
|
|
683
|
+
|
|
684
|
+
cl = pxt.Client()
|
|
685
|
+
# test composite primary key
|
|
686
|
+
schema = {'c1': StringType(), 'c2': IntType(), 'c3': FloatType()}
|
|
687
|
+
t = cl.create_table('composite', schema=schema, primary_key=['c1', 'c2'])
|
|
688
|
+
rows = [{'c1': str(i), 'c2': i, 'c3': float(i)} for i in range(10)]
|
|
689
|
+
validate_update_status(t.insert(rows), expected_rows=10)
|
|
690
|
+
|
|
691
|
+
validate_update_status(
|
|
692
|
+
t.batch_update([{'c1': '1', 'c2': 1, 'c3': 2.0}, {'c1': '2', 'c2': 2, 'c3': 3.0}]),
|
|
693
|
+
expected_rows=2)
|
|
694
|
+
|
|
695
|
+
with pytest.raises(excs.Error) as exc_info:
|
|
696
|
+
# can't mix _rowid with primary key
|
|
697
|
+
_ = t.batch_update([{'c1': '1', 'c2': 1, 'c3': 2.0, '_rowid': (1,)}])
|
|
698
|
+
assert 'c1 is a primary key column' in str(exc_info.value).lower()
|
|
699
|
+
|
|
700
|
+
with pytest.raises(excs.Error) as exc_info:
|
|
701
|
+
# bad literal
|
|
702
|
+
_ = t.batch_update([{'c2': 1, 'c3': 'a'}])
|
|
703
|
+
assert "'a' is not a valid literal" in str(exc_info.value).lower()
|
|
704
|
+
|
|
705
|
+
with pytest.raises(excs.Error) as exc_info:
|
|
706
|
+
# missing primary key column
|
|
707
|
+
t.batch_update([{'c1': '1', 'c3': 2.0}])
|
|
708
|
+
assert 'primary key columns (c2) missing' in str(exc_info.value).lower()
|
|
709
|
+
|
|
710
|
+
# table without primary key
|
|
711
|
+
t2 = cl.create_table('no_pk', schema=schema)
|
|
712
|
+
validate_update_status(t2.insert(rows), expected_rows=10)
|
|
713
|
+
with pytest.raises(excs.Error) as exc_info:
|
|
714
|
+
_ = t2.batch_update([{'c1': '1', 'c2': 1, 'c3': 2.0}])
|
|
715
|
+
assert 'must have primary key for batch update' in str(exc_info.value).lower()
|
|
716
|
+
|
|
717
|
+
# updating with _rowid still works
|
|
718
|
+
validate_update_status(
|
|
719
|
+
t2.batch_update([{'c1': 'one', '_rowid': (1,)}, {'c1': 'two', '_rowid': (2,)}]),
|
|
720
|
+
expected_rows=2)
|
|
721
|
+
assert t2.where(t2.c2 == 1).collect()[0]['c1'] == 'one'
|
|
722
|
+
assert t2.where(t2.c2 == 2).collect()[0]['c1'] == 'two'
|
|
723
|
+
with pytest.raises(AssertionError):
|
|
724
|
+
# some rows are missing rowids
|
|
725
|
+
_ = t2.batch_update([{'c1': 'one', '_rowid': (1,)}, {'c1': 'two'}])
|
|
726
|
+
|
|
727
|
+
def test_update(self, test_tbl: pxt.Table, small_img_tbl) -> None:
|
|
654
728
|
t = test_tbl
|
|
655
729
|
# update every type with a literal
|
|
656
730
|
test_cases = [
|
|
@@ -755,7 +829,7 @@ class TestTable:
|
|
|
755
829
|
t.update({'c3': 1.0}, where=lambda c2: c2 == 10)
|
|
756
830
|
assert 'Predicate' in str(excinfo.value)
|
|
757
831
|
|
|
758
|
-
img_t =
|
|
832
|
+
img_t = small_img_tbl
|
|
759
833
|
|
|
760
834
|
# can't update image col
|
|
761
835
|
with pytest.raises(excs.Error) as excinfo:
|
|
@@ -785,7 +859,7 @@ class TestTable:
|
|
|
785
859
|
r2 = t.where(t.c2 < 5).select(t.c3, t.c10, t.d1, t.d2).order_by(t.c2).show(0)
|
|
786
860
|
assert_resultset_eq(r1, r2)
|
|
787
861
|
|
|
788
|
-
def test_delete(self, test_tbl: pxt.Table,
|
|
862
|
+
def test_delete(self, test_tbl: pxt.Table, small_img_tbl) -> None:
|
|
789
863
|
t = test_tbl
|
|
790
864
|
|
|
791
865
|
cnt = t.where(t.c3 < 10.0).count()
|
|
@@ -813,7 +887,7 @@ class TestTable:
|
|
|
813
887
|
t.delete(where=lambda c2: c2 == 10)
|
|
814
888
|
assert 'Predicate' in str(excinfo.value)
|
|
815
889
|
|
|
816
|
-
img_t =
|
|
890
|
+
img_t = small_img_tbl
|
|
817
891
|
# similarity search is not supported
|
|
818
892
|
with pytest.raises(excs.Error) as excinfo:
|
|
819
893
|
img_t.delete(where=img_t.img.nearest('car'))
|
|
@@ -979,6 +1053,10 @@ class TestTable:
|
|
|
979
1053
|
t2.revert()
|
|
980
1054
|
assert MediaStore.count(t2.get_id()) == t2.count() * stores_img_col
|
|
981
1055
|
|
|
1056
|
+
@pxt.udf(return_type=ImageType(), param_types=[ImageType()])
|
|
1057
|
+
def img_fn_with_exc(img: PIL.Image.Image) -> PIL.Image.Image:
|
|
1058
|
+
raise RuntimeError
|
|
1059
|
+
|
|
982
1060
|
def test_computed_img_cols(self, test_client: pxt.Client) -> None:
|
|
983
1061
|
cl = test_client
|
|
984
1062
|
schema = {'img': ImageType(nullable=False)}
|
|
@@ -996,10 +1074,7 @@ class TestTable:
|
|
|
996
1074
|
|
|
997
1075
|
# computed img col with exceptions
|
|
998
1076
|
t = cl.create_table('test3', schema)
|
|
999
|
-
|
|
1000
|
-
def f(img: PIL.Image.Image) -> PIL.Image.Image:
|
|
1001
|
-
raise RuntimeError
|
|
1002
|
-
t.add_column(c3=f(t.img), stored=True)
|
|
1077
|
+
t.add_column(c3=self.img_fn_with_exc(t.img), stored=True)
|
|
1003
1078
|
rows = read_data_file('imagenette2-160', 'manifest.csv', ['img'])
|
|
1004
1079
|
rows = [{'img': r['img']} for r in rows[:20]]
|
|
1005
1080
|
t.insert(rows, fail_on_exception=False)
|
|
@@ -1211,6 +1286,7 @@ class TestTable:
|
|
|
1211
1286
|
check_rename(t, 'c1_renamed', 'c1')
|
|
1212
1287
|
|
|
1213
1288
|
# revert() works
|
|
1289
|
+
_ = t.select(t.c1_renamed).collect()
|
|
1214
1290
|
t.revert()
|
|
1215
1291
|
_ = t.select(t.c1).collect()
|
|
1216
1292
|
#check_rename(t, 'c1', 'c1_renamed')
|
pixeltable/tests/test_types.py
CHANGED
|
@@ -1,3 +1,7 @@
|
|
|
1
|
+
import datetime
|
|
2
|
+
from copy import copy
|
|
3
|
+
from typing import List, Dict, Optional
|
|
4
|
+
|
|
1
5
|
from pixeltable.type_system import \
|
|
2
6
|
ColumnType, StringType, IntType, BoolType, ImageType, InvalidType, FloatType, TimestampType, JsonType, ArrayType
|
|
3
7
|
|
|
@@ -20,3 +24,29 @@ class TestTypes:
|
|
|
20
24
|
t_serialized = t.serialize()
|
|
21
25
|
t_deserialized = ColumnType.deserialize(t_serialized)
|
|
22
26
|
assert t == t_deserialized
|
|
27
|
+
|
|
28
|
+
def test_from_python_type(self) -> None:
|
|
29
|
+
test_cases = {
|
|
30
|
+
str: StringType(),
|
|
31
|
+
int: IntType(),
|
|
32
|
+
float: FloatType(),
|
|
33
|
+
bool: BoolType(),
|
|
34
|
+
datetime.date: TimestampType(),
|
|
35
|
+
datetime.datetime: TimestampType(),
|
|
36
|
+
list: JsonType(),
|
|
37
|
+
dict: JsonType(),
|
|
38
|
+
list[int]: JsonType(),
|
|
39
|
+
list[dict[str, int]]: JsonType(),
|
|
40
|
+
dict[int, str]: JsonType(),
|
|
41
|
+
dict[dict[str, int], list[int]]: JsonType(),
|
|
42
|
+
List: JsonType(),
|
|
43
|
+
Dict: JsonType(),
|
|
44
|
+
List[int]: JsonType(),
|
|
45
|
+
List[Dict[str, int]]: JsonType(),
|
|
46
|
+
Dict[int, str]: JsonType()
|
|
47
|
+
}
|
|
48
|
+
for py_type, pxt_type in test_cases.items():
|
|
49
|
+
assert ColumnType.from_python_type(py_type) == pxt_type
|
|
50
|
+
opt_pxt_type = copy(pxt_type)
|
|
51
|
+
opt_pxt_type.nullable = True
|
|
52
|
+
assert ColumnType.from_python_type(Optional[py_type]) == opt_pxt_type
|
pixeltable/tests/test_video.py
CHANGED
|
@@ -107,6 +107,18 @@ class TestVideo:
|
|
|
107
107
|
base_t.insert({'video': p} for p in video_filepaths)
|
|
108
108
|
_ = view_t[view_t.c1, view_t.c2, view_t.c3, view_t.c4].show(0)
|
|
109
109
|
|
|
110
|
+
# window function that simply passes through the frame
|
|
111
|
+
@pxt.uda(
|
|
112
|
+
update_types=[ImageType()], value_type=ImageType(),
|
|
113
|
+
requires_order_by=True, allows_std_agg=False, allows_window=True)
|
|
114
|
+
class agg_fn:
|
|
115
|
+
def __init__(self):
|
|
116
|
+
self.img = None
|
|
117
|
+
def update(self, frame: PIL.Image.Image) -> None:
|
|
118
|
+
self.img = frame
|
|
119
|
+
def value(self) -> PIL.Image.Image:
|
|
120
|
+
return self.img
|
|
121
|
+
|
|
110
122
|
def test_make_video(self, test_client: pxt.Client) -> None:
|
|
111
123
|
video_filepaths = get_video_files()
|
|
112
124
|
cl = test_client
|
|
@@ -131,29 +143,17 @@ class TestVideo:
|
|
|
131
143
|
make_video(view_t.pos, view_t.frame),
|
|
132
144
|
make_video(view_t.pos - 1, view_t.transformed)).group_by(base_t).show()
|
|
133
145
|
|
|
134
|
-
# window function that simply passes through the frame
|
|
135
|
-
@pxt.uda(
|
|
136
|
-
update_types=[ImageType()], value_type=ImageType(), name='agg_fn',
|
|
137
|
-
requires_order_by=True, allows_std_agg=False, allows_window=True)
|
|
138
|
-
class WindowAgg:
|
|
139
|
-
def __init__(self):
|
|
140
|
-
self.img = None
|
|
141
|
-
def update(self, frame: PIL.Image.Image) -> None:
|
|
142
|
-
self.img = frame
|
|
143
|
-
def value(self) -> PIL.Image.Image:
|
|
144
|
-
return self.img
|
|
145
|
-
|
|
146
146
|
# make sure it works
|
|
147
|
-
_ = view_t.select(agg_fn(view_t.pos, view_t.frame, group_by=base_t)).show()
|
|
148
|
-
status = view_t.add_column(agg=agg_fn(view_t.pos, view_t.frame, group_by=base_t))
|
|
147
|
+
_ = view_t.select(self.agg_fn(view_t.pos, view_t.frame, group_by=base_t)).show()
|
|
148
|
+
status = view_t.add_column(agg=self.agg_fn(view_t.pos, view_t.frame, group_by=base_t))
|
|
149
149
|
assert status.num_excs == 0
|
|
150
150
|
_ = view_t.select(make_video(view_t.pos, view_t.agg)).group_by(base_t).show()
|
|
151
151
|
|
|
152
152
|
# image cols computed with a window function currently need to be stored
|
|
153
153
|
with pytest.raises(excs.Error):
|
|
154
|
-
view_t.add_column(agg2=agg_fn(view_t.pos, view_t.frame, group_by=base_t), stored=False)
|
|
154
|
+
view_t.add_column(agg2=self.agg_fn(view_t.pos, view_t.frame, group_by=base_t), stored=False)
|
|
155
155
|
|
|
156
156
|
# reload from store
|
|
157
157
|
cl = pxt.Client(reload=True)
|
|
158
158
|
base_t, view_t = cl.get_table(base_t.get_name()), cl.get_table(view_t.get_name())
|
|
159
|
-
_ = view_t.select(agg_fn(view_t.pos, view_t.frame, group_by=base_t)).show()
|
|
159
|
+
_ = view_t.select(self.agg_fn(view_t.pos, view_t.frame, group_by=base_t)).show()
|
pixeltable/tests/test_view.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import datetime
|
|
1
2
|
import logging
|
|
2
3
|
|
|
3
4
|
import PIL
|
|
@@ -414,6 +415,10 @@ class TestView:
|
|
|
414
415
|
v.order_by(v.c2).show(0),
|
|
415
416
|
t.where(t.c2 < 10).order_by(t.c2).show(0))
|
|
416
417
|
|
|
418
|
+
# create views with filters containing date and datetime
|
|
419
|
+
_ = cl.create_view('test_view_2', t, filter=t.c5 >= datetime.date.today())
|
|
420
|
+
_ = cl.create_view('test_view_3', t, filter=t.c5 < datetime.datetime.now())
|
|
421
|
+
|
|
417
422
|
def test_view_of_snapshot(self, test_client: pxt.Client) -> None:
|
|
418
423
|
"""Test view over a snapshot"""
|
|
419
424
|
cl = test_client
|
pixeltable/tests/utils.py
CHANGED
|
@@ -6,6 +6,7 @@ from collections import namedtuple
|
|
|
6
6
|
from pathlib import Path
|
|
7
7
|
from typing import Any, Dict, List, Optional, Set
|
|
8
8
|
|
|
9
|
+
import PIL.Image
|
|
9
10
|
import numpy as np
|
|
10
11
|
import pandas as pd
|
|
11
12
|
import pytest
|
|
@@ -16,6 +17,7 @@ from pixeltable import catalog
|
|
|
16
17
|
from pixeltable.catalog.globals import UpdateStatus
|
|
17
18
|
from pixeltable.dataframe import DataFrameResultSet
|
|
18
19
|
from pixeltable.env import Env
|
|
20
|
+
from pixeltable.functions.huggingface import clip_image, clip_text
|
|
19
21
|
from pixeltable.type_system import (
|
|
20
22
|
ArrayType,
|
|
21
23
|
BoolType,
|
|
@@ -43,6 +45,7 @@ def make_default_type(t: ColumnType.Type) -> ColumnType:
|
|
|
43
45
|
return TimestampType()
|
|
44
46
|
assert False
|
|
45
47
|
|
|
48
|
+
|
|
46
49
|
def make_tbl(cl: pxt.Client, name: str = 'test', col_names: Optional[List[str]] = None) -> catalog.InsertableTable:
|
|
47
50
|
if col_names is None:
|
|
48
51
|
col_names = ['c1']
|
|
@@ -51,7 +54,9 @@ def make_tbl(cl: pxt.Client, name: str = 'test', col_names: Optional[List[str]]
|
|
|
51
54
|
schema[f'{col_name}'] = make_default_type(ColumnType.Type(i % 5))
|
|
52
55
|
return cl.create_table(name, schema)
|
|
53
56
|
|
|
54
|
-
|
|
57
|
+
|
|
58
|
+
def create_table_data(t: catalog.Table, col_names: Optional[List[str]] = None, num_rows: int = 10) -> List[
|
|
59
|
+
Dict[str, Any]]:
|
|
55
60
|
if col_names is None:
|
|
56
61
|
col_names = []
|
|
57
62
|
data: Dict[str, Any] = {}
|
|
@@ -124,6 +129,7 @@ def create_table_data(t: catalog.Table, col_names: Optional[List[str]] = None, n
|
|
|
124
129
|
rows = [{col_name: data[col_name][i] for col_name in col_names} for i in range(num_rows)]
|
|
125
130
|
return rows
|
|
126
131
|
|
|
132
|
+
|
|
127
133
|
def create_test_tbl(client: pxt.Client, name: str = 'test_tbl') -> catalog.Table:
|
|
128
134
|
schema = {
|
|
129
135
|
'c1': StringType(nullable=False),
|
|
@@ -189,12 +195,25 @@ def create_test_tbl(client: pxt.Client, name: str = 'test_tbl') -> catalog.Table
|
|
|
189
195
|
t.insert(rows)
|
|
190
196
|
return t
|
|
191
197
|
|
|
198
|
+
|
|
199
|
+
def create_img_tbl(cl: pxt.Client, name: str = 'test_img_tbl') -> catalog.Table:
|
|
200
|
+
schema = {
|
|
201
|
+
'img': ImageType(nullable=False),
|
|
202
|
+
'category': StringType(nullable=False),
|
|
203
|
+
'split': StringType(nullable=False),
|
|
204
|
+
}
|
|
205
|
+
tbl = cl.create_table(name, schema)
|
|
206
|
+
rows = read_data_file('imagenette2-160', 'manifest.csv', ['img'])
|
|
207
|
+
tbl.insert(rows)
|
|
208
|
+
return tbl
|
|
209
|
+
|
|
210
|
+
|
|
192
211
|
def create_all_datatypes_tbl(test_client: pxt.Client) -> catalog.Table:
|
|
193
212
|
""" Creates a table with all supported datatypes.
|
|
194
213
|
"""
|
|
195
214
|
schema = {
|
|
196
|
-
'row_id': IntType(nullable=False),
|
|
197
|
-
'c_array': ArrayType(shape=(10,),
|
|
215
|
+
'row_id': IntType(nullable=False), # used for row selection
|
|
216
|
+
'c_array': ArrayType(shape=(10,), dtype=FloatType(), nullable=True),
|
|
198
217
|
'c_bool': BoolType(nullable=True),
|
|
199
218
|
'c_float': FloatType(nullable=True),
|
|
200
219
|
'c_image': ImageType(nullable=True),
|
|
@@ -207,12 +226,13 @@ def create_all_datatypes_tbl(test_client: pxt.Client) -> catalog.Table:
|
|
|
207
226
|
tbl = test_client.create_table('all_datatype_tbl', schema)
|
|
208
227
|
example_rows = create_table_data(tbl, num_rows=11)
|
|
209
228
|
|
|
210
|
-
for i,r in enumerate(example_rows):
|
|
211
|
-
r['row_id'] = i
|
|
229
|
+
for i, r in enumerate(example_rows):
|
|
230
|
+
r['row_id'] = i # row_id
|
|
212
231
|
|
|
213
232
|
tbl.insert(example_rows)
|
|
214
233
|
return tbl
|
|
215
234
|
|
|
235
|
+
|
|
216
236
|
def read_data_file(dir_name: str, file_name: str, path_col_names: Optional[List[str]] = None) -> List[Dict[str, Any]]:
|
|
217
237
|
"""
|
|
218
238
|
Locate dir_name, create df out of file_name.
|
|
@@ -223,7 +243,7 @@ def read_data_file(dir_name: str, file_name: str, path_col_names: Optional[List[
|
|
|
223
243
|
"""
|
|
224
244
|
if path_col_names is None:
|
|
225
245
|
path_col_names = []
|
|
226
|
-
tests_dir = os.path.dirname(__file__)
|
|
246
|
+
tests_dir = os.path.dirname(__file__) # search with respect to tests/ dir
|
|
227
247
|
glob_result = glob.glob(f'{tests_dir}/**/{dir_name}', recursive=True)
|
|
228
248
|
assert len(glob_result) == 1, f'Could not find {dir_name}'
|
|
229
249
|
abs_path = Path(glob_result[0])
|
|
@@ -235,8 +255,9 @@ def read_data_file(dir_name: str, file_name: str, path_col_names: Optional[List[
|
|
|
235
255
|
df[col_name] = df.apply(lambda r: str(abs_path / r[col_name]), axis=1)
|
|
236
256
|
return df.to_dict(orient='records')
|
|
237
257
|
|
|
258
|
+
|
|
238
259
|
def get_video_files(include_bad_video: bool = False) -> List[str]:
|
|
239
|
-
tests_dir = os.path.dirname(__file__)
|
|
260
|
+
tests_dir = os.path.dirname(__file__) # search with respect to tests/ dir
|
|
240
261
|
glob_result = glob.glob(f'{tests_dir}/**/videos/*', recursive=True)
|
|
241
262
|
if not include_bad_video:
|
|
242
263
|
glob_result = [f for f in glob_result if 'bad_video' not in f]
|
|
@@ -244,18 +265,21 @@ def get_video_files(include_bad_video: bool = False) -> List[str]:
|
|
|
244
265
|
half_res = [f for f in glob_result if 'half_res' in f or 'bad_video' in f]
|
|
245
266
|
return half_res
|
|
246
267
|
|
|
268
|
+
|
|
247
269
|
def get_test_video_files() -> List[str]:
|
|
248
|
-
tests_dir = os.path.dirname(__file__)
|
|
270
|
+
tests_dir = os.path.dirname(__file__) # search with respect to tests/ dir
|
|
249
271
|
glob_result = glob.glob(f'{tests_dir}/**/test_videos/*', recursive=True)
|
|
250
272
|
return glob_result
|
|
251
273
|
|
|
274
|
+
|
|
252
275
|
def get_image_files(include_bad_image: bool = False) -> List[str]:
|
|
253
|
-
tests_dir = os.path.dirname(__file__)
|
|
276
|
+
tests_dir = os.path.dirname(__file__) # search with respect to tests/ dir
|
|
254
277
|
glob_result = glob.glob(f'{tests_dir}/**/imagenette2-160/*', recursive=True)
|
|
255
278
|
if not include_bad_image:
|
|
256
279
|
glob_result = [f for f in glob_result if 'bad_image' not in f]
|
|
257
280
|
return glob_result
|
|
258
281
|
|
|
282
|
+
|
|
259
283
|
def get_audio_files(include_bad_audio: bool = False) -> List[str]:
|
|
260
284
|
tests_dir = os.path.dirname(__file__)
|
|
261
285
|
glob_result = glob.glob(f'{tests_dir}/**/audio/*', recursive=True)
|
|
@@ -263,11 +287,13 @@ def get_audio_files(include_bad_audio: bool = False) -> List[str]:
|
|
|
263
287
|
glob_result = [f for f in glob_result if 'bad_audio' not in f]
|
|
264
288
|
return glob_result
|
|
265
289
|
|
|
290
|
+
|
|
266
291
|
def get_documents() -> List[str]:
|
|
267
292
|
tests_dir = os.path.dirname(__file__)
|
|
268
293
|
# for now, we can only handle .html and .md
|
|
269
294
|
return [p for p in glob.glob(f'{tests_dir}/**/documents/*', recursive=True) if not p.endswith('.pdf')]
|
|
270
295
|
|
|
296
|
+
|
|
271
297
|
def get_sentences(n: int = 100) -> List[str]:
|
|
272
298
|
tests_dir = os.path.dirname(__file__)
|
|
273
299
|
path = glob.glob(f'{tests_dir}/**/jeopardy.json', recursive=True)[0]
|
|
@@ -403,6 +429,14 @@ def assert_hf_dataset_equal(hf_dataset: 'datasets.Dataset', df: pxt.DataFrame, s
|
|
|
403
429
|
check_tup = DatasetTuple(**encoded_tup)
|
|
404
430
|
assert check_tup in acc_dataset
|
|
405
431
|
|
|
432
|
+
@pxt.expr_udf
|
|
433
|
+
def img_embed(img: PIL.Image.Image) -> np.ndarray:
|
|
434
|
+
return clip_image(img, model_id='openai/clip-vit-base-patch32')
|
|
435
|
+
|
|
436
|
+
@pxt.expr_udf
|
|
437
|
+
def text_embed(txt: str) -> np.ndarray:
|
|
438
|
+
return clip_text(txt, model_id='openai/clip-vit-base-patch32')
|
|
406
439
|
|
|
407
440
|
SAMPLE_IMAGE_URL = \
|
|
408
441
|
'https://raw.githubusercontent.com/pixeltable/pixeltable/master/docs/source/data/images/000000000009.jpg'
|
|
442
|
+
|
|
pixeltable/tool/create_test_db_dump.py
CHANGED
|
@@ -136,6 +136,22 @@ class Dumper:
|
|
|
136
136
|
for i in range(num_rows)
|
|
137
137
|
]
|
|
138
138
|
t.insert(rows)
|
|
139
|
+
self.cl.create_dir('views')
|
|
140
|
+
v = self.cl.create_view('views.sample_view', t, filter=(t.c2 < 50))
|
|
141
|
+
_ = self.cl.create_view('views.sample_snapshot', t, filter=(t.c2 >= 75), is_snapshot=True)
|
|
142
|
+
# Computed column using a library function
|
|
143
|
+
v['str_format'] = pxt.functions.string.str_format('{0} {key}', t.c1, key=t.c1)
|
|
144
|
+
# Computed column using a bespoke udf
|
|
145
|
+
v['test_udf'] = test_udf(t.c2)
|
|
146
|
+
# astype
|
|
147
|
+
v['astype'] = t.c1.astype(pxt.FloatType())
|
|
148
|
+
# computed column using a stored function
|
|
149
|
+
v['stored'] = t.c1.apply(lambda x: f'Hello, {x}', col_type=pxt.StringType())
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
@pxt.udf
|
|
153
|
+
def test_udf(n: int) -> int:
|
|
154
|
+
return n + 1
|
|
139
155
|
|
|
140
156
|
|
|
141
157
|
def main() -> None:
|