pixeltable 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +20 -9
- pixeltable/__version__.py +3 -0
- pixeltable/catalog/column.py +23 -7
- pixeltable/catalog/insertable_table.py +32 -19
- pixeltable/catalog/table.py +210 -20
- pixeltable/catalog/table_version.py +272 -111
- pixeltable/catalog/table_version_path.py +6 -1
- pixeltable/dataframe.py +184 -110
- pixeltable/datatransfer/__init__.py +1 -0
- pixeltable/datatransfer/label_studio.py +526 -0
- pixeltable/datatransfer/remote.py +113 -0
- pixeltable/env.py +213 -79
- pixeltable/exec/__init__.py +2 -1
- pixeltable/exec/data_row_batch.py +6 -7
- pixeltable/exec/expr_eval_node.py +28 -28
- pixeltable/exec/sql_scan_node.py +7 -6
- pixeltable/exprs/__init__.py +4 -3
- pixeltable/exprs/column_ref.py +11 -2
- pixeltable/exprs/comparison.py +39 -1
- pixeltable/exprs/data_row.py +7 -0
- pixeltable/exprs/expr.py +26 -19
- pixeltable/exprs/function_call.py +17 -18
- pixeltable/exprs/globals.py +14 -2
- pixeltable/exprs/image_member_access.py +9 -28
- pixeltable/exprs/in_predicate.py +96 -0
- pixeltable/exprs/inline_array.py +13 -11
- pixeltable/exprs/inline_dict.py +15 -13
- pixeltable/exprs/row_builder.py +7 -1
- pixeltable/exprs/similarity_expr.py +67 -0
- pixeltable/ext/functions/whisperx.py +30 -0
- pixeltable/ext/functions/yolox.py +16 -0
- pixeltable/func/__init__.py +0 -2
- pixeltable/func/aggregate_function.py +5 -2
- pixeltable/func/callable_function.py +57 -13
- pixeltable/func/expr_template_function.py +14 -3
- pixeltable/func/function.py +35 -4
- pixeltable/func/signature.py +5 -15
- pixeltable/func/udf.py +8 -12
- pixeltable/functions/fireworks.py +9 -4
- pixeltable/functions/huggingface.py +48 -5
- pixeltable/functions/openai.py +49 -11
- pixeltable/functions/pil/image.py +61 -64
- pixeltable/functions/together.py +32 -6
- pixeltable/functions/util.py +0 -43
- pixeltable/functions/video.py +46 -8
- pixeltable/globals.py +443 -0
- pixeltable/index/__init__.py +1 -0
- pixeltable/index/base.py +9 -2
- pixeltable/index/btree.py +54 -0
- pixeltable/index/embedding_index.py +91 -15
- pixeltable/io/__init__.py +4 -0
- pixeltable/io/globals.py +59 -0
- pixeltable/{utils → io}/hf_datasets.py +48 -17
- pixeltable/io/pandas.py +148 -0
- pixeltable/{utils → io}/parquet.py +58 -33
- pixeltable/iterators/__init__.py +1 -1
- pixeltable/iterators/base.py +8 -4
- pixeltable/iterators/document.py +225 -93
- pixeltable/iterators/video.py +16 -9
- pixeltable/metadata/__init__.py +8 -4
- pixeltable/metadata/converters/convert_12.py +3 -0
- pixeltable/metadata/converters/convert_13.py +41 -0
- pixeltable/metadata/converters/convert_14.py +13 -0
- pixeltable/metadata/converters/convert_15.py +29 -0
- pixeltable/metadata/converters/util.py +63 -0
- pixeltable/metadata/schema.py +12 -6
- pixeltable/plan.py +11 -24
- pixeltable/store.py +16 -23
- pixeltable/tool/create_test_db_dump.py +49 -14
- pixeltable/type_system.py +27 -58
- pixeltable/utils/coco.py +94 -0
- pixeltable/utils/documents.py +42 -12
- pixeltable/utils/http_server.py +70 -0
- pixeltable-0.2.7.dist-info/METADATA +137 -0
- pixeltable-0.2.7.dist-info/RECORD +126 -0
- {pixeltable-0.2.5.dist-info → pixeltable-0.2.7.dist-info}/WHEEL +1 -1
- pixeltable/client.py +0 -600
- pixeltable/exprs/image_similarity_predicate.py +0 -58
- pixeltable/func/batched_function.py +0 -53
- pixeltable/func/nos_function.py +0 -202
- pixeltable/tests/conftest.py +0 -171
- pixeltable/tests/ext/test_yolox.py +0 -21
- pixeltable/tests/functions/test_fireworks.py +0 -43
- pixeltable/tests/functions/test_functions.py +0 -60
- pixeltable/tests/functions/test_huggingface.py +0 -158
- pixeltable/tests/functions/test_openai.py +0 -162
- pixeltable/tests/functions/test_together.py +0 -112
- pixeltable/tests/test_audio.py +0 -65
- pixeltable/tests/test_catalog.py +0 -27
- pixeltable/tests/test_client.py +0 -21
- pixeltable/tests/test_component_view.py +0 -379
- pixeltable/tests/test_dataframe.py +0 -440
- pixeltable/tests/test_dirs.py +0 -107
- pixeltable/tests/test_document.py +0 -120
- pixeltable/tests/test_exprs.py +0 -802
- pixeltable/tests/test_function.py +0 -332
- pixeltable/tests/test_index.py +0 -138
- pixeltable/tests/test_migration.py +0 -44
- pixeltable/tests/test_nos.py +0 -54
- pixeltable/tests/test_snapshot.py +0 -231
- pixeltable/tests/test_table.py +0 -1343
- pixeltable/tests/test_transactional_directory.py +0 -42
- pixeltable/tests/test_types.py +0 -52
- pixeltable/tests/test_video.py +0 -159
- pixeltable/tests/test_view.py +0 -535
- pixeltable/tests/utils.py +0 -442
- pixeltable/utils/clip.py +0 -18
- pixeltable-0.2.5.dist-info/METADATA +0 -128
- pixeltable-0.2.5.dist-info/RECORD +0 -139
- {pixeltable-0.2.5.dist-info → pixeltable-0.2.7.dist-info}/LICENSE +0 -0
|
@@ -1,440 +0,0 @@
|
|
|
1
|
-
import datetime
|
|
2
|
-
import pickle
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
from typing import Any, Dict
|
|
5
|
-
|
|
6
|
-
import bs4
|
|
7
|
-
import numpy as np
|
|
8
|
-
import pytest
|
|
9
|
-
import requests
|
|
10
|
-
|
|
11
|
-
import pixeltable as pxt
|
|
12
|
-
from pixeltable import catalog
|
|
13
|
-
from pixeltable import exceptions as excs
|
|
14
|
-
from pixeltable.iterators import FrameIterator
|
|
15
|
-
from pixeltable.tests.utils import get_video_files, get_audio_files, skip_test_if_not_installed
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
class TestDataFrame:
|
|
19
|
-
|
|
20
|
-
@pxt.udf(return_type=pxt.JsonType(nullable=False), param_types=[pxt.JsonType(nullable=False)])
|
|
21
|
-
def yolo_to_coco(detections):
|
|
22
|
-
bboxes, labels = detections['bboxes'], detections['labels']
|
|
23
|
-
num_annotations = len(detections['bboxes'])
|
|
24
|
-
assert num_annotations == len(detections['labels'])
|
|
25
|
-
result = []
|
|
26
|
-
for i in range(num_annotations):
|
|
27
|
-
bbox = bboxes[i]
|
|
28
|
-
ann = {
|
|
29
|
-
'bbox': [round(bbox[0]), round(bbox[1]), round(bbox[2] - bbox[0]), round(bbox[3] - bbox[1])],
|
|
30
|
-
'category': labels[i],
|
|
31
|
-
}
|
|
32
|
-
result.append(ann)
|
|
33
|
-
return result
|
|
34
|
-
|
|
35
|
-
def test_select_where(self, test_tbl: catalog.Table) -> None:
|
|
36
|
-
t = test_tbl
|
|
37
|
-
res1 = t[t.c1, t.c2, t.c3].show(0)
|
|
38
|
-
res2 = t.select(t.c1, t.c2, t.c3).show(0)
|
|
39
|
-
assert res1 == res2
|
|
40
|
-
|
|
41
|
-
res1 = t[t.c2 < 10][t.c1, t.c2, t.c3].show(0)
|
|
42
|
-
res2 = t.where(t.c2 < 10).select(t.c1, t.c2, t.c3).show(0)
|
|
43
|
-
assert res1 == res2
|
|
44
|
-
|
|
45
|
-
res3 = t.where(t.c2 < 10).select(c1=t.c1, c2=t.c2, c3=t.c3).show(0)
|
|
46
|
-
assert res1 == res3
|
|
47
|
-
|
|
48
|
-
res4 = t.where(t.c2 < 10).select(t.c1, c2=t.c2, c3=t.c3).show(0)
|
|
49
|
-
assert res1 == res4
|
|
50
|
-
|
|
51
|
-
_ = t.where(t.c2 < 10).select(t.c2, t.c2).show(0) # repeated name no error
|
|
52
|
-
|
|
53
|
-
# duplicate select list
|
|
54
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
55
|
-
_ = t.select(t.c1).select(t.c2).show(0)
|
|
56
|
-
assert 'already specified' in str(exc_info.value)
|
|
57
|
-
|
|
58
|
-
# invalid expr in select list: Callable is not a valid literal
|
|
59
|
-
with pytest.raises(TypeError) as exc_info:
|
|
60
|
-
_ = t.select(datetime.datetime.now).show(0)
|
|
61
|
-
assert 'Not a valid literal' in str(exc_info.value)
|
|
62
|
-
|
|
63
|
-
# catch invalid name in select list from user input
|
|
64
|
-
# only check stuff that's not caught by python kwargs checker
|
|
65
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
66
|
-
_ = t.select(t.c1, **{'c2-1': t.c2}).show(0)
|
|
67
|
-
assert 'Invalid name' in str(exc_info.value)
|
|
68
|
-
|
|
69
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
70
|
-
_ = t.select(t.c1, **{'': t.c2}).show(0)
|
|
71
|
-
assert 'Invalid name' in str(exc_info.value)
|
|
72
|
-
|
|
73
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
74
|
-
_ = t.select(t.c1, **{'foo.bar': t.c2}).show(0)
|
|
75
|
-
assert 'Invalid name' in str(exc_info.value)
|
|
76
|
-
|
|
77
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
78
|
-
_ = t.select(t.c1, _c3=t.c2).show(0)
|
|
79
|
-
assert 'Invalid name' in str(exc_info.value)
|
|
80
|
-
|
|
81
|
-
# catch repeated name from user input
|
|
82
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
83
|
-
_ = t.select(t.c2, c2=t.c1).show(0)
|
|
84
|
-
assert 'Repeated column name' in str(exc_info.value)
|
|
85
|
-
|
|
86
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
87
|
-
_ = t.select(t.c2+1, col_0=t.c2).show(0)
|
|
88
|
-
assert 'Repeated column name' in str(exc_info.value)
|
|
89
|
-
|
|
90
|
-
def test_result_set_iterator(self, test_tbl: catalog.Table) -> None:
|
|
91
|
-
t = test_tbl
|
|
92
|
-
res = t.select(t.c1, t.c2, t.c3).collect()
|
|
93
|
-
pd_df = res.to_pandas()
|
|
94
|
-
|
|
95
|
-
def check_row(row: Dict[str, Any], idx: int) -> None:
|
|
96
|
-
assert len(row) == 3
|
|
97
|
-
assert 'c1' in row
|
|
98
|
-
assert row['c1'] == pd_df['c1'][idx]
|
|
99
|
-
assert 'c2' in row
|
|
100
|
-
assert row['c2'] == pd_df['c2'][idx]
|
|
101
|
-
assert 'c3' in row
|
|
102
|
-
assert row['c3'] == pd_df['c3'][idx]
|
|
103
|
-
|
|
104
|
-
# row iteration
|
|
105
|
-
for idx, row in enumerate(res):
|
|
106
|
-
check_row(row, idx)
|
|
107
|
-
|
|
108
|
-
# row access
|
|
109
|
-
row = res[0]
|
|
110
|
-
check_row(row, 0)
|
|
111
|
-
|
|
112
|
-
# column access
|
|
113
|
-
col_values = res['c2']
|
|
114
|
-
assert col_values == pd_df['c2'].values.tolist()
|
|
115
|
-
|
|
116
|
-
# cell access
|
|
117
|
-
assert res[0, 'c2'] == pd_df['c2'][0]
|
|
118
|
-
assert res[0, 'c2'] == res[0, 1]
|
|
119
|
-
|
|
120
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
121
|
-
_ = res['does_not_exist']
|
|
122
|
-
assert 'Invalid column name' in str(exc_info.value)
|
|
123
|
-
|
|
124
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
125
|
-
_ = res[0, 'does_not_exist']
|
|
126
|
-
assert 'Invalid column name' in str(exc_info.value)
|
|
127
|
-
|
|
128
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
129
|
-
_ = res[0, 0, 0]
|
|
130
|
-
assert 'Bad index' in str(exc_info.value)
|
|
131
|
-
|
|
132
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
133
|
-
_ = res['c2', 0]
|
|
134
|
-
assert 'Bad index' in str(exc_info.value)
|
|
135
|
-
|
|
136
|
-
def test_order_by(self, test_tbl: catalog.Table) -> None:
|
|
137
|
-
t = test_tbl
|
|
138
|
-
res = t.select(t.c4, t.c2).order_by(t.c4).order_by(t.c2, asc=False).show(0)
|
|
139
|
-
|
|
140
|
-
# invalid expr in order_by()
|
|
141
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
142
|
-
_ = t.order_by(datetime.datetime.now()).show(0)
|
|
143
|
-
assert 'Invalid expression' in str(exc_info.value)
|
|
144
|
-
|
|
145
|
-
def test_head_tail(self, test_tbl: catalog.Table) -> None:
|
|
146
|
-
t = test_tbl
|
|
147
|
-
res = t.head(10).to_pandas()
|
|
148
|
-
assert np.all(res.c2 == list(range(10)))
|
|
149
|
-
# Where is applied
|
|
150
|
-
res = t.where(t.c2 > 9).head(10).to_pandas()
|
|
151
|
-
assert np.all(res.c2 == list(range(10, 20)))
|
|
152
|
-
# order_by() is an error
|
|
153
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
154
|
-
_ = t.order_by(t.c2).head(10)
|
|
155
|
-
assert 'cannot be used with order_by' in str(exc_info.value)
|
|
156
|
-
|
|
157
|
-
res = t.tail().to_pandas()
|
|
158
|
-
assert np.all(res.c2 == list(range(90, 100)))
|
|
159
|
-
res = t.where(t.c2 < 90).tail().to_pandas()
|
|
160
|
-
assert np.all(res.c2 == list(range(80, 90)))
|
|
161
|
-
# order_by() is an error
|
|
162
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
163
|
-
_ = t.order_by(t.c2).tail(10)
|
|
164
|
-
assert 'cannot be used with order_by' in str(exc_info.value)
|
|
165
|
-
|
|
166
|
-
def test_describe(self, test_tbl: catalog.Table) -> None:
|
|
167
|
-
t = test_tbl
|
|
168
|
-
df = t.select(t.c1).where(t.c2 < 10).limit(10)
|
|
169
|
-
df.describe()
|
|
170
|
-
|
|
171
|
-
# TODO: how do you check the output of these?
|
|
172
|
-
_ = df.__repr__()
|
|
173
|
-
_ = df._repr_html_()
|
|
174
|
-
|
|
175
|
-
def test_count(self, test_tbl: catalog.Table, small_img_tbl) -> None:
|
|
176
|
-
skip_test_if_not_installed('nos')
|
|
177
|
-
t = test_tbl
|
|
178
|
-
cnt = t.count()
|
|
179
|
-
assert cnt == 100
|
|
180
|
-
|
|
181
|
-
cnt = t.where(t.c2 < 10).count()
|
|
182
|
-
assert cnt == 10
|
|
183
|
-
|
|
184
|
-
# count() doesn't work with similarity search
|
|
185
|
-
t = small_img_tbl
|
|
186
|
-
probe = t.select(t.img).show(1)
|
|
187
|
-
img = probe[0, 0]
|
|
188
|
-
with pytest.raises(excs.Error):
|
|
189
|
-
_ = t.where(t.img.nearest(img)).count()
|
|
190
|
-
with pytest.raises(excs.Error):
|
|
191
|
-
_ = t.where(t.img.nearest('car')).count()
|
|
192
|
-
|
|
193
|
-
# for now, count() doesn't work with non-SQL Where clauses
|
|
194
|
-
with pytest.raises(excs.Error):
|
|
195
|
-
_ = t.where(t.img.width > 100).count()
|
|
196
|
-
|
|
197
|
-
def test_select_literal(self, test_tbl: catalog.Table) -> None:
|
|
198
|
-
t = test_tbl
|
|
199
|
-
res = t.select(1.0).where(t.c2 < 10).collect()
|
|
200
|
-
assert res[res.column_names()[0]] == [1.0] * 10
|
|
201
|
-
|
|
202
|
-
# TODO This test doesn't work on Windows due to reliance on the structure of file URLs
|
|
203
|
-
@pytest.mark.skip('Test is not portable')
|
|
204
|
-
def test_html_media_url(self, test_client: pxt.Client) -> None:
|
|
205
|
-
tab = test_client.create_table('test_html_repr', {'video': pxt.VideoType(), 'audio': pxt.AudioType()})
|
|
206
|
-
status = tab.insert(video=get_video_files()[0], audio=get_audio_files()[0])
|
|
207
|
-
assert status.num_rows == 1
|
|
208
|
-
assert status.num_excs == 0
|
|
209
|
-
|
|
210
|
-
res = tab.select(tab.video, tab.audio).collect()
|
|
211
|
-
doc = bs4.BeautifulSoup(res._repr_html_(), features='html.parser')
|
|
212
|
-
video_tags = doc.find_all('video')
|
|
213
|
-
assert len(video_tags) == 1
|
|
214
|
-
audio_tags = doc.find_all('audio')
|
|
215
|
-
assert len(audio_tags) == 1
|
|
216
|
-
|
|
217
|
-
# get the source elements and test their src attributes
|
|
218
|
-
for tag in video_tags + audio_tags:
|
|
219
|
-
sources = tag.find_all('source')
|
|
220
|
-
assert len(sources) == 1
|
|
221
|
-
for src in sources:
|
|
222
|
-
response = requests.get(src['src'])
|
|
223
|
-
assert response.status_code == 200
|
|
224
|
-
|
|
225
|
-
def test_to_pytorch_dataset(self, all_datatypes_tbl: catalog.Table):
|
|
226
|
-
""" tests all types are handled correctly in this conversion
|
|
227
|
-
"""
|
|
228
|
-
skip_test_if_not_installed('torch')
|
|
229
|
-
import torch
|
|
230
|
-
|
|
231
|
-
t = all_datatypes_tbl
|
|
232
|
-
df = t.where(t.row_id < 1)
|
|
233
|
-
assert df.count() > 0
|
|
234
|
-
ds = df.to_pytorch_dataset()
|
|
235
|
-
type_dict = dict(zip(df.get_column_names(),df.get_column_types()))
|
|
236
|
-
for tup in ds:
|
|
237
|
-
for col in df.get_column_names():
|
|
238
|
-
assert col in tup
|
|
239
|
-
|
|
240
|
-
arrval = tup['c_array']
|
|
241
|
-
assert isinstance(arrval, np.ndarray)
|
|
242
|
-
col_type = type_dict['c_array']
|
|
243
|
-
assert arrval.dtype == col_type.numpy_dtype()
|
|
244
|
-
assert arrval.shape == col_type.shape
|
|
245
|
-
assert arrval.dtype == np.float32
|
|
246
|
-
assert arrval.flags["WRITEABLE"], 'required by pytorch collate function'
|
|
247
|
-
|
|
248
|
-
assert isinstance(tup['c_bool'], bool)
|
|
249
|
-
assert isinstance(tup['c_int'], int)
|
|
250
|
-
assert isinstance(tup['c_float'], float)
|
|
251
|
-
assert isinstance(tup['c_timestamp'], float)
|
|
252
|
-
assert torch.is_tensor(tup['c_image'])
|
|
253
|
-
assert isinstance(tup['c_video'], str)
|
|
254
|
-
assert isinstance(tup['c_json'], dict)
|
|
255
|
-
|
|
256
|
-
def test_to_pytorch_image_format(self, all_datatypes_tbl: catalog.Table) -> None:
|
|
257
|
-
""" tests the image_format parameter is honored
|
|
258
|
-
"""
|
|
259
|
-
skip_test_if_not_installed('torch')
|
|
260
|
-
import torch
|
|
261
|
-
import torchvision.transforms as T
|
|
262
|
-
|
|
263
|
-
W, H = 220, 224 # make different from each other
|
|
264
|
-
t = all_datatypes_tbl
|
|
265
|
-
df = t.select(
|
|
266
|
-
t.row_id,
|
|
267
|
-
t.c_image,
|
|
268
|
-
c_image_xformed=t.c_image.resize([W, H]).convert('RGB')
|
|
269
|
-
).where(t.row_id < 1)
|
|
270
|
-
|
|
271
|
-
pandas_df = df.show().to_pandas()
|
|
272
|
-
im_plain = pandas_df['c_image'].values[0]
|
|
273
|
-
im_xformed = pandas_df['c_image_xformed'].values[0]
|
|
274
|
-
assert pandas_df.shape[0] == 1
|
|
275
|
-
|
|
276
|
-
ds = df.to_pytorch_dataset(image_format='np')
|
|
277
|
-
ds_ptformat = df.to_pytorch_dataset(image_format='pt')
|
|
278
|
-
|
|
279
|
-
elt_count = 0
|
|
280
|
-
for elt, elt_pt in zip(ds, ds_ptformat):
|
|
281
|
-
arr_plain = elt['c_image']
|
|
282
|
-
assert isinstance(arr_plain, np.ndarray)
|
|
283
|
-
assert arr_plain.flags["WRITEABLE"], 'required by pytorch collate function'
|
|
284
|
-
|
|
285
|
-
# NB: compare numpy arrays because the PIL.Image object itself is not using the same file.
|
|
286
|
-
assert (arr_plain == np.array(im_plain)).all(), 'numpy image should be the same as the original'
|
|
287
|
-
arr_xformed = elt['c_image_xformed']
|
|
288
|
-
assert isinstance(arr_xformed, np.ndarray)
|
|
289
|
-
assert arr_xformed.flags["WRITEABLE"], 'required by pytorch collate function'
|
|
290
|
-
|
|
291
|
-
assert arr_xformed.shape == (H, W, 3)
|
|
292
|
-
assert arr_xformed.dtype == np.uint8
|
|
293
|
-
# same as above: compare numpy arrays because the PIL.Image object itself is not using the same file.
|
|
294
|
-
assert (arr_xformed == np.array(im_xformed)).all(),\
|
|
295
|
-
'numpy image array for xformed image should be the same as the original'
|
|
296
|
-
|
|
297
|
-
# now compare pytorch version
|
|
298
|
-
arr_pt = elt_pt['c_image']
|
|
299
|
-
assert torch.is_tensor(arr_pt)
|
|
300
|
-
arr_pt = elt_pt['c_image_xformed']
|
|
301
|
-
assert torch.is_tensor(arr_pt)
|
|
302
|
-
assert arr_pt.shape == (3, H, W)
|
|
303
|
-
assert arr_pt.dtype == torch.float32
|
|
304
|
-
assert (0.0 <= arr_pt).all()
|
|
305
|
-
assert (arr_pt <= 1.0).all()
|
|
306
|
-
assert torch.isclose(T.ToTensor()(arr_xformed), arr_pt).all(),\
|
|
307
|
-
'pytorch image should be consistent with numpy image'
|
|
308
|
-
elt_count += 1
|
|
309
|
-
assert elt_count == 1
|
|
310
|
-
|
|
311
|
-
@pytest.mark.skip('Flaky test (fails intermittently)')
|
|
312
|
-
def test_to_pytorch_dataloader(self, all_datatypes_tbl: catalog.Table) -> None:
|
|
313
|
-
""" Tests the dataset works well with pytorch dataloader:
|
|
314
|
-
1. compatibility with multiprocessing
|
|
315
|
-
2. compatibility of all types with default collate_fn
|
|
316
|
-
"""
|
|
317
|
-
skip_test_if_not_installed('torch')
|
|
318
|
-
import torch.utils.data
|
|
319
|
-
@pxt.udf(param_types=[pxt.JsonType()], return_type=pxt.JsonType())
|
|
320
|
-
def restrict_json_for_default_collate(obj):
|
|
321
|
-
keys = ['id', 'label', 'iscrowd', 'bounding_box']
|
|
322
|
-
return {k: obj[k] for k in keys}
|
|
323
|
-
|
|
324
|
-
t = all_datatypes_tbl
|
|
325
|
-
df = t.select(
|
|
326
|
-
t.row_id,
|
|
327
|
-
t.c_int,
|
|
328
|
-
t.c_float,
|
|
329
|
-
t.c_bool,
|
|
330
|
-
t.c_timestamp,
|
|
331
|
-
t.c_array,
|
|
332
|
-
t.c_video,
|
|
333
|
-
# default collate_fn doesn't support null values, nor lists of different lengths
|
|
334
|
-
# but does allow some dictionaries if they are uniform
|
|
335
|
-
c_json = restrict_json_for_default_collate(t.c_json.detections[0]),
|
|
336
|
-
# images must be uniform shape for pytorch collate_fn to not fail
|
|
337
|
-
c_image=t.c_image.resize([220, 224]).convert('RGB')
|
|
338
|
-
)
|
|
339
|
-
df_size = df.count()
|
|
340
|
-
ds = df.to_pytorch_dataset(image_format='pt')
|
|
341
|
-
# test serialization:
|
|
342
|
-
# - pickle.dumps() and pickle.loads() must work so that
|
|
343
|
-
# we can use num_workers > 0
|
|
344
|
-
x = pickle.dumps(ds)
|
|
345
|
-
_ = pickle.loads(x)
|
|
346
|
-
|
|
347
|
-
# test we get all rows
|
|
348
|
-
def check_recover_all_rows(ds, size : int, **kwargs):
|
|
349
|
-
dl = torch.utils.data.DataLoader(ds, **kwargs)
|
|
350
|
-
loaded_ids = set()
|
|
351
|
-
for batch in dl:
|
|
352
|
-
for row_id in batch['row_id']:
|
|
353
|
-
val = int(row_id) # np.int -> int or will fail set equality test below.
|
|
354
|
-
assert val not in loaded_ids, val
|
|
355
|
-
loaded_ids.add(val)
|
|
356
|
-
|
|
357
|
-
assert loaded_ids == set(range(size))
|
|
358
|
-
|
|
359
|
-
# check different number of workers
|
|
360
|
-
check_recover_all_rows(ds, size=df_size, batch_size=3, num_workers=0) # within this process
|
|
361
|
-
check_recover_all_rows(ds, size=df_size, batch_size=3, num_workers=2) # two separate processes
|
|
362
|
-
|
|
363
|
-
# check edge case where some workers get no rows
|
|
364
|
-
short_size = 1
|
|
365
|
-
df_short = df.where(t.row_id < short_size)
|
|
366
|
-
ds_short = df_short.to_pytorch_dataset(image_format='pt')
|
|
367
|
-
check_recover_all_rows(ds_short, size=short_size, batch_size=13, num_workers=short_size+1)
|
|
368
|
-
|
|
369
|
-
def test_pytorch_dataset_caching(self, all_datatypes_tbl: catalog.Table) -> None:
|
|
370
|
-
""" Tests that dataset caching works
|
|
371
|
-
1. using the same dataset twice in a row uses the cache
|
|
372
|
-
2. adding a row to the table invalidates the cached version
|
|
373
|
-
3. changing the select list invalidates the cached version
|
|
374
|
-
"""
|
|
375
|
-
skip_test_if_not_installed('torch')
|
|
376
|
-
t = all_datatypes_tbl
|
|
377
|
-
|
|
378
|
-
t.drop_column('c_video') # null value video column triggers internal assertions in DataRow
|
|
379
|
-
# see https://github.com/pixeltable/pixeltable/issues/38
|
|
380
|
-
|
|
381
|
-
t.drop_column('c_array') # no support yet for null array values in the pytorch dataset
|
|
382
|
-
|
|
383
|
-
def _get_mtimes(dir: Path):
|
|
384
|
-
return {p.name: p.stat().st_mtime for p in dir.iterdir()}
|
|
385
|
-
|
|
386
|
-
# check result cached
|
|
387
|
-
ds1 = t.to_pytorch_dataset(image_format='pt')
|
|
388
|
-
ds1_mtimes = _get_mtimes(ds1.path)
|
|
389
|
-
|
|
390
|
-
ds2 = t.to_pytorch_dataset(image_format='pt')
|
|
391
|
-
ds2_mtimes = _get_mtimes(ds2.path)
|
|
392
|
-
assert ds2.path == ds1.path, 'result should be cached'
|
|
393
|
-
assert ds2_mtimes == ds1_mtimes, 'no extra file system work should have occurred'
|
|
394
|
-
|
|
395
|
-
# check invalidation on insert
|
|
396
|
-
t_size = t.count()
|
|
397
|
-
t.insert(row_id=t_size)
|
|
398
|
-
ds3 = t.to_pytorch_dataset(image_format='pt')
|
|
399
|
-
assert ds3.path != ds1.path, 'different path should be used'
|
|
400
|
-
|
|
401
|
-
# check select list invalidation
|
|
402
|
-
ds4 = t.select(t.row_id).to_pytorch_dataset(image_format='pt')
|
|
403
|
-
assert ds4.path != ds3.path, 'different select list, hence different path should be used'
|
|
404
|
-
|
|
405
|
-
def test_to_coco(self, test_client: pxt.Client) -> None:
|
|
406
|
-
skip_test_if_not_installed('nos')
|
|
407
|
-
from pycocotools.coco import COCO
|
|
408
|
-
cl = test_client
|
|
409
|
-
base_t = cl.create_table('videos', {'video': pxt.VideoType()})
|
|
410
|
-
args = {'video': base_t.video, 'fps': 1}
|
|
411
|
-
view_t = cl.create_view('frames', base_t, iterator_class=FrameIterator, iterator_args=args)
|
|
412
|
-
from pixeltable.functions.nos.object_detection_2d import yolox_medium
|
|
413
|
-
view_t.add_column(detections=yolox_medium(view_t.frame))
|
|
414
|
-
base_t.insert(video=get_video_files()[0])
|
|
415
|
-
|
|
416
|
-
query = view_t.select({'image': view_t.frame, 'annotations': self.yolo_to_coco(view_t.detections)})
|
|
417
|
-
path = query.to_coco_dataset()
|
|
418
|
-
# we get a valid COCO dataset
|
|
419
|
-
coco_ds = COCO(path)
|
|
420
|
-
assert len(coco_ds.imgs) == view_t.count()
|
|
421
|
-
|
|
422
|
-
# we call to_coco_dataset() again and get the cached dataset
|
|
423
|
-
new_path = query.to_coco_dataset()
|
|
424
|
-
assert path == new_path
|
|
425
|
-
|
|
426
|
-
# the cache is invalidated when we add more data
|
|
427
|
-
base_t.insert(video=get_video_files()[1])
|
|
428
|
-
new_path = query.to_coco_dataset()
|
|
429
|
-
assert path != new_path
|
|
430
|
-
coco_ds = COCO(new_path)
|
|
431
|
-
assert len(coco_ds.imgs) == view_t.count()
|
|
432
|
-
|
|
433
|
-
# incorrect select list
|
|
434
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
435
|
-
_ = view_t.select({'image': view_t.frame, 'annotations': view_t.detections}).to_coco_dataset()
|
|
436
|
-
assert '"annotations" is not a list' in str(exc_info.value)
|
|
437
|
-
|
|
438
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
439
|
-
_ = view_t.select(view_t.detections).to_coco_dataset()
|
|
440
|
-
assert 'missing key "image"' in str(exc_info.value).lower()
|
pixeltable/tests/test_dirs.py
DELETED
|
@@ -1,107 +0,0 @@
|
|
|
1
|
-
import pytest
|
|
2
|
-
|
|
3
|
-
import pixeltable as pxt
|
|
4
|
-
from pixeltable import exceptions as excs
|
|
5
|
-
from pixeltable.tests.utils import make_tbl
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
class TestDirs:
|
|
9
|
-
def test_create(self, test_client: pxt.Client) -> None:
|
|
10
|
-
cl = test_client
|
|
11
|
-
dirs = ['dir1', 'dir1.sub1', 'dir1.sub1.subsub1']
|
|
12
|
-
for name in dirs:
|
|
13
|
-
cl.create_dir(name)
|
|
14
|
-
|
|
15
|
-
# invalid names
|
|
16
|
-
with pytest.raises(excs.Error):
|
|
17
|
-
cl.create_dir('1dir')
|
|
18
|
-
with pytest.raises(excs.Error):
|
|
19
|
-
cl.create_dir('_dir1')
|
|
20
|
-
with pytest.raises(excs.Error):
|
|
21
|
-
cl.create_dir('dir 1')
|
|
22
|
-
with pytest.raises(excs.Error):
|
|
23
|
-
cl.create_dir('dir1..sub2')
|
|
24
|
-
with pytest.raises(excs.Error):
|
|
25
|
-
cl.create_dir('dir1.sub2.')
|
|
26
|
-
with pytest.raises(excs.Error):
|
|
27
|
-
cl.create_dir('dir1:sub2.')
|
|
28
|
-
|
|
29
|
-
# existing dirs
|
|
30
|
-
with pytest.raises(excs.Error):
|
|
31
|
-
cl.create_dir('dir1')
|
|
32
|
-
cl.create_dir('dir1', ignore_errors=True)
|
|
33
|
-
with pytest.raises(excs.Error):
|
|
34
|
-
cl.create_dir('dir1.sub1')
|
|
35
|
-
with pytest.raises(excs.Error):
|
|
36
|
-
cl.create_dir('dir1.sub1.subsub1')
|
|
37
|
-
|
|
38
|
-
# existing table
|
|
39
|
-
make_tbl(cl, 'dir1.t1')
|
|
40
|
-
with pytest.raises(excs.Error):
|
|
41
|
-
cl.create_dir('dir1.t1')
|
|
42
|
-
|
|
43
|
-
with pytest.raises(excs.Error):
|
|
44
|
-
cl.create_dir('dir2.sub2')
|
|
45
|
-
make_tbl(cl, 't2')
|
|
46
|
-
with pytest.raises(excs.Error):
|
|
47
|
-
cl.create_dir('t2.sub2')
|
|
48
|
-
|
|
49
|
-
# new client: force loading from store
|
|
50
|
-
cl2 = pxt.Client(reload=True)
|
|
51
|
-
|
|
52
|
-
listing = cl2.list_dirs(recursive=True)
|
|
53
|
-
assert listing == dirs
|
|
54
|
-
listing = cl2.list_dirs(recursive=False)
|
|
55
|
-
assert listing == ['dir1']
|
|
56
|
-
listing = cl2.list_dirs('dir1', recursive=True)
|
|
57
|
-
assert listing == ['dir1.sub1', 'dir1.sub1.subsub1']
|
|
58
|
-
listing = cl2.list_dirs('dir1', recursive=False)
|
|
59
|
-
assert listing == ['dir1.sub1']
|
|
60
|
-
listing = cl2.list_dirs('dir1.sub1', recursive=True)
|
|
61
|
-
assert listing == ['dir1.sub1.subsub1']
|
|
62
|
-
listing = cl2.list_dirs('dir1.sub1', recursive=False)
|
|
63
|
-
assert listing == ['dir1.sub1.subsub1']
|
|
64
|
-
|
|
65
|
-
def test_rm(self, test_client: pxt.Client) -> None:
|
|
66
|
-
cl = test_client
|
|
67
|
-
dirs = ['dir1', 'dir1.sub1', 'dir1.sub1.subsub1']
|
|
68
|
-
for name in dirs:
|
|
69
|
-
cl.create_dir(name)
|
|
70
|
-
make_tbl(cl, 't1')
|
|
71
|
-
make_tbl(cl, 'dir1.t1')
|
|
72
|
-
|
|
73
|
-
# bad name
|
|
74
|
-
with pytest.raises(excs.Error):
|
|
75
|
-
cl.rm_dir('1dir')
|
|
76
|
-
# bad path
|
|
77
|
-
with pytest.raises(excs.Error):
|
|
78
|
-
cl.rm_dir('dir1..sub1')
|
|
79
|
-
# doesn't exist
|
|
80
|
-
with pytest.raises(excs.Error):
|
|
81
|
-
cl.rm_dir('dir2')
|
|
82
|
-
# not empty
|
|
83
|
-
with pytest.raises(excs.Error):
|
|
84
|
-
cl.rm_dir('dir1')
|
|
85
|
-
|
|
86
|
-
cl.rm_dir('dir1.sub1.subsub1')
|
|
87
|
-
assert cl.list_dirs('dir1.sub1') == []
|
|
88
|
-
|
|
89
|
-
# check after reloading
|
|
90
|
-
cl = pxt.Client(reload=True)
|
|
91
|
-
assert cl.list_dirs('dir1.sub1') == []
|
|
92
|
-
|
|
93
|
-
def test_move(self, test_client: pxt.Client) -> None:
|
|
94
|
-
cl = test_client
|
|
95
|
-
cl.create_dir('dir1')
|
|
96
|
-
cl.create_dir('dir1.sub1')
|
|
97
|
-
make_tbl(cl, 'dir1.sub1.t1')
|
|
98
|
-
assert cl.list_tables('dir1') == ['dir1.sub1.t1']
|
|
99
|
-
cl.move('dir1.sub1.t1', 'dir1.sub1.t2')
|
|
100
|
-
assert cl.list_tables('dir1') == ['dir1.sub1.t2']
|
|
101
|
-
cl.create_dir('dir2')
|
|
102
|
-
cl.move('dir1', 'dir2.dir1')
|
|
103
|
-
assert cl.list_tables('dir2') == ['dir2.dir1.sub1.t2']
|
|
104
|
-
|
|
105
|
-
# new client: force loading from store
|
|
106
|
-
cl2 = pxt.Client(reload=True)
|
|
107
|
-
assert cl2.list_tables('dir2') == ['dir2.dir1.sub1.t2']
|
|
@@ -1,120 +0,0 @@
|
|
|
1
|
-
import itertools
|
|
2
|
-
import json
|
|
3
|
-
import re
|
|
4
|
-
from typing import Optional, Set, List
|
|
5
|
-
|
|
6
|
-
import pytest
|
|
7
|
-
|
|
8
|
-
import pixeltable as pxt
|
|
9
|
-
from pixeltable.iterators.document import DocumentSplitter
|
|
10
|
-
from pixeltable.tests.utils import get_documents, get_video_files, get_audio_files, get_image_files
|
|
11
|
-
from pixeltable.tests.utils import skip_test_if_not_installed
|
|
12
|
-
from pixeltable.type_system import DocumentType
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
class TestDocument:
|
|
16
|
-
def valid_doc_paths(self) -> List[str]:
|
|
17
|
-
return get_documents()
|
|
18
|
-
|
|
19
|
-
def invalid_doc_paths(self) -> List[str]:
|
|
20
|
-
return [get_video_files()[0], get_audio_files()[0], get_image_files()[0]]
|
|
21
|
-
|
|
22
|
-
def test_insert(self, test_client: pxt.Client) -> None:
|
|
23
|
-
file_paths = self.valid_doc_paths()
|
|
24
|
-
cl = test_client
|
|
25
|
-
doc_t = cl.create_table('docs', {'doc': DocumentType()})
|
|
26
|
-
status = doc_t.insert({'doc': p} for p in file_paths)
|
|
27
|
-
assert status.num_rows == len(file_paths)
|
|
28
|
-
assert status.num_excs == 0
|
|
29
|
-
stored_paths = doc_t.select(output=doc_t.doc.localpath).collect()['output']
|
|
30
|
-
assert set(stored_paths) == set(file_paths)
|
|
31
|
-
|
|
32
|
-
file_paths = self.invalid_doc_paths()
|
|
33
|
-
status = doc_t.insert(({'doc': p} for p in file_paths), fail_on_exception=False)
|
|
34
|
-
assert status.num_rows == len(file_paths)
|
|
35
|
-
assert status.num_excs == len(file_paths)
|
|
36
|
-
|
|
37
|
-
def test_doc_splitter(self, test_client: pxt.Client) -> None:
|
|
38
|
-
skip_test_if_not_installed('tiktoken')
|
|
39
|
-
file_paths = self.valid_doc_paths()
|
|
40
|
-
cl = test_client
|
|
41
|
-
doc_t = cl.create_table('docs', {'doc': DocumentType()})
|
|
42
|
-
status = doc_t.insert({'doc': p} for p in file_paths)
|
|
43
|
-
assert status.num_excs == 0
|
|
44
|
-
|
|
45
|
-
def normalize(s: str) -> str:
|
|
46
|
-
# remove whitespace
|
|
47
|
-
res = re.sub(r'\s+', '', s)
|
|
48
|
-
# remove non-ascii
|
|
49
|
-
res = res.encode('ascii', 'ignore').decode()
|
|
50
|
-
return res
|
|
51
|
-
|
|
52
|
-
# run all combinations of (heading, paragraph, sentence) x (token_limit, char_limit, None)
|
|
53
|
-
# and make sure they extract the same text in aggregate
|
|
54
|
-
all_text_reference: Optional[str] = None # all text as a single string; normalized
|
|
55
|
-
headings_reference: Set[str] = {} # headings metadata as a json-serialized string
|
|
56
|
-
for sep1 in ['heading', 'paragraph', 'sentence']:
|
|
57
|
-
for sep2 in [None, 'token_limit', 'char_limit']:
|
|
58
|
-
chunk_limits = [10, 20, 100] if sep2 is not None else [None]
|
|
59
|
-
for limit in chunk_limits:
|
|
60
|
-
iterator_args = {
|
|
61
|
-
'document': doc_t.doc,
|
|
62
|
-
'separators': sep1 + (',' + sep2 if sep2 is not None else ''),
|
|
63
|
-
'metadata': 'title,headings,sourceline'
|
|
64
|
-
}
|
|
65
|
-
if sep2 is not None:
|
|
66
|
-
iterator_args['limit'] = limit
|
|
67
|
-
iterator_args['overlap'] = 0
|
|
68
|
-
chunks_t = cl.create_view(
|
|
69
|
-
f'chunks', doc_t, iterator_class=DocumentSplitter, iterator_args=iterator_args)
|
|
70
|
-
res = list(chunks_t.order_by(chunks_t.doc, chunks_t.pos).collect())
|
|
71
|
-
|
|
72
|
-
if all_text_reference is None:
|
|
73
|
-
all_text_reference = normalize(''.join([r['text'] for r in res]))
|
|
74
|
-
headings_reference = set(json.dumps(r['headings']) for r in res)
|
|
75
|
-
else:
|
|
76
|
-
all_text = normalize(''.join([r['text'] for r in res]))
|
|
77
|
-
headings = set(json.dumps(r['headings']) for r in res)
|
|
78
|
-
|
|
79
|
-
# for debugging
|
|
80
|
-
first_diff_index = next(
|
|
81
|
-
(i for i, (c1, c2) in enumerate(zip(all_text, all_text_reference)) if c1 != c2),
|
|
82
|
-
len(all_text) if len(all_text) != len(all_text_reference) else None)
|
|
83
|
-
if first_diff_index is not None:
|
|
84
|
-
a = all_text[max(0, first_diff_index - 10):first_diff_index + 10]
|
|
85
|
-
b = all_text_reference[max(0, first_diff_index - 10):first_diff_index + 10]
|
|
86
|
-
|
|
87
|
-
assert all_text == all_text_reference, f'{sep1}, {sep2}, {limit}'
|
|
88
|
-
assert headings == headings_reference, f'{sep1}, {sep2}, {limit}'
|
|
89
|
-
# TODO: verify chunk limit
|
|
90
|
-
cl.drop_table('chunks')
|
|
91
|
-
|
|
92
|
-
def test_doc_splitter_headings(self, test_client: pxt.Client) -> None:
|
|
93
|
-
skip_test_if_not_installed('spacy')
|
|
94
|
-
file_paths = self.valid_doc_paths()
|
|
95
|
-
cl = test_client
|
|
96
|
-
doc_t = cl.create_table('docs', {'doc': DocumentType()})
|
|
97
|
-
status = doc_t.insert({'doc': p} for p in file_paths)
|
|
98
|
-
assert status.num_excs == 0
|
|
99
|
-
|
|
100
|
-
# verify that only the requested metadata is present in the view
|
|
101
|
-
md_elements = ['title', 'headings', 'sourceline']
|
|
102
|
-
md_tuples = list(itertools.chain.from_iterable(itertools.combinations(md_elements, i) for i in range(len(md_elements) + 1)))
|
|
103
|
-
_ = [','.join(t) for t in md_tuples]
|
|
104
|
-
for md_str in [','.join(t) for t in md_tuples]:
|
|
105
|
-
iterator_args = {
|
|
106
|
-
'document': doc_t.doc,
|
|
107
|
-
'separators': 'sentence',
|
|
108
|
-
'metadata': md_str
|
|
109
|
-
}
|
|
110
|
-
chunks_t = cl.create_view(
|
|
111
|
-
f'chunks', doc_t, iterator_class=DocumentSplitter, iterator_args=iterator_args)
|
|
112
|
-
res = chunks_t.order_by(chunks_t.doc, chunks_t.pos).collect()
|
|
113
|
-
requested_md_elements = set(md_str.split(','))
|
|
114
|
-
for md_element in md_elements:
|
|
115
|
-
if md_element in requested_md_elements:
|
|
116
|
-
_ = res[md_element]
|
|
117
|
-
else:
|
|
118
|
-
with pytest.raises(pxt.Error):
|
|
119
|
-
_ = res[md_element]
|
|
120
|
-
cl.drop_table('chunks')
|