pixeltable 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (110)
  1. pixeltable/__init__.py +20 -9
  2. pixeltable/__version__.py +3 -0
  3. pixeltable/catalog/column.py +23 -7
  4. pixeltable/catalog/insertable_table.py +32 -19
  5. pixeltable/catalog/table.py +210 -20
  6. pixeltable/catalog/table_version.py +272 -111
  7. pixeltable/catalog/table_version_path.py +6 -1
  8. pixeltable/dataframe.py +184 -110
  9. pixeltable/datatransfer/__init__.py +1 -0
  10. pixeltable/datatransfer/label_studio.py +526 -0
  11. pixeltable/datatransfer/remote.py +113 -0
  12. pixeltable/env.py +213 -79
  13. pixeltable/exec/__init__.py +2 -1
  14. pixeltable/exec/data_row_batch.py +6 -7
  15. pixeltable/exec/expr_eval_node.py +28 -28
  16. pixeltable/exec/sql_scan_node.py +7 -6
  17. pixeltable/exprs/__init__.py +4 -3
  18. pixeltable/exprs/column_ref.py +11 -2
  19. pixeltable/exprs/comparison.py +39 -1
  20. pixeltable/exprs/data_row.py +7 -0
  21. pixeltable/exprs/expr.py +26 -19
  22. pixeltable/exprs/function_call.py +17 -18
  23. pixeltable/exprs/globals.py +14 -2
  24. pixeltable/exprs/image_member_access.py +9 -28
  25. pixeltable/exprs/in_predicate.py +96 -0
  26. pixeltable/exprs/inline_array.py +13 -11
  27. pixeltable/exprs/inline_dict.py +15 -13
  28. pixeltable/exprs/row_builder.py +7 -1
  29. pixeltable/exprs/similarity_expr.py +67 -0
  30. pixeltable/ext/functions/whisperx.py +30 -0
  31. pixeltable/ext/functions/yolox.py +16 -0
  32. pixeltable/func/__init__.py +0 -2
  33. pixeltable/func/aggregate_function.py +5 -2
  34. pixeltable/func/callable_function.py +57 -13
  35. pixeltable/func/expr_template_function.py +14 -3
  36. pixeltable/func/function.py +35 -4
  37. pixeltable/func/signature.py +5 -15
  38. pixeltable/func/udf.py +8 -12
  39. pixeltable/functions/fireworks.py +9 -4
  40. pixeltable/functions/huggingface.py +48 -5
  41. pixeltable/functions/openai.py +49 -11
  42. pixeltable/functions/pil/image.py +61 -64
  43. pixeltable/functions/together.py +32 -6
  44. pixeltable/functions/util.py +0 -43
  45. pixeltable/functions/video.py +46 -8
  46. pixeltable/globals.py +443 -0
  47. pixeltable/index/__init__.py +1 -0
  48. pixeltable/index/base.py +9 -2
  49. pixeltable/index/btree.py +54 -0
  50. pixeltable/index/embedding_index.py +91 -15
  51. pixeltable/io/__init__.py +4 -0
  52. pixeltable/io/globals.py +59 -0
  53. pixeltable/{utils → io}/hf_datasets.py +48 -17
  54. pixeltable/io/pandas.py +148 -0
  55. pixeltable/{utils → io}/parquet.py +58 -33
  56. pixeltable/iterators/__init__.py +1 -1
  57. pixeltable/iterators/base.py +8 -4
  58. pixeltable/iterators/document.py +225 -93
  59. pixeltable/iterators/video.py +16 -9
  60. pixeltable/metadata/__init__.py +8 -4
  61. pixeltable/metadata/converters/convert_12.py +3 -0
  62. pixeltable/metadata/converters/convert_13.py +41 -0
  63. pixeltable/metadata/converters/convert_14.py +13 -0
  64. pixeltable/metadata/converters/convert_15.py +29 -0
  65. pixeltable/metadata/converters/util.py +63 -0
  66. pixeltable/metadata/schema.py +12 -6
  67. pixeltable/plan.py +11 -24
  68. pixeltable/store.py +16 -23
  69. pixeltable/tool/create_test_db_dump.py +49 -14
  70. pixeltable/type_system.py +27 -58
  71. pixeltable/utils/coco.py +94 -0
  72. pixeltable/utils/documents.py +42 -12
  73. pixeltable/utils/http_server.py +70 -0
  74. pixeltable-0.2.7.dist-info/METADATA +137 -0
  75. pixeltable-0.2.7.dist-info/RECORD +126 -0
  76. {pixeltable-0.2.5.dist-info → pixeltable-0.2.7.dist-info}/WHEEL +1 -1
  77. pixeltable/client.py +0 -600
  78. pixeltable/exprs/image_similarity_predicate.py +0 -58
  79. pixeltable/func/batched_function.py +0 -53
  80. pixeltable/func/nos_function.py +0 -202
  81. pixeltable/tests/conftest.py +0 -171
  82. pixeltable/tests/ext/test_yolox.py +0 -21
  83. pixeltable/tests/functions/test_fireworks.py +0 -43
  84. pixeltable/tests/functions/test_functions.py +0 -60
  85. pixeltable/tests/functions/test_huggingface.py +0 -158
  86. pixeltable/tests/functions/test_openai.py +0 -162
  87. pixeltable/tests/functions/test_together.py +0 -112
  88. pixeltable/tests/test_audio.py +0 -65
  89. pixeltable/tests/test_catalog.py +0 -27
  90. pixeltable/tests/test_client.py +0 -21
  91. pixeltable/tests/test_component_view.py +0 -379
  92. pixeltable/tests/test_dataframe.py +0 -440
  93. pixeltable/tests/test_dirs.py +0 -107
  94. pixeltable/tests/test_document.py +0 -120
  95. pixeltable/tests/test_exprs.py +0 -802
  96. pixeltable/tests/test_function.py +0 -332
  97. pixeltable/tests/test_index.py +0 -138
  98. pixeltable/tests/test_migration.py +0 -44
  99. pixeltable/tests/test_nos.py +0 -54
  100. pixeltable/tests/test_snapshot.py +0 -231
  101. pixeltable/tests/test_table.py +0 -1343
  102. pixeltable/tests/test_transactional_directory.py +0 -42
  103. pixeltable/tests/test_types.py +0 -52
  104. pixeltable/tests/test_video.py +0 -159
  105. pixeltable/tests/test_view.py +0 -535
  106. pixeltable/tests/utils.py +0 -442
  107. pixeltable/utils/clip.py +0 -18
  108. pixeltable-0.2.5.dist-info/METADATA +0 -128
  109. pixeltable-0.2.5.dist-info/RECORD +0 -139
  110. {pixeltable-0.2.5.dist-info → pixeltable-0.2.7.dist-info}/LICENSE +0 -0
@@ -1,440 +0,0 @@
1
- import datetime
2
- import pickle
3
- from pathlib import Path
4
- from typing import Any, Dict
5
-
6
- import bs4
7
- import numpy as np
8
- import pytest
9
- import requests
10
-
11
- import pixeltable as pxt
12
- from pixeltable import catalog
13
- from pixeltable import exceptions as excs
14
- from pixeltable.iterators import FrameIterator
15
- from pixeltable.tests.utils import get_video_files, get_audio_files, skip_test_if_not_installed
16
-
17
-
18
class TestDataFrame:
    """Tests for DataFrame queries (select/where/order_by/head/tail/count) and dataset exports."""

    @pxt.udf(return_type=pxt.JsonType(nullable=False), param_types=[pxt.JsonType(nullable=False)])
    def yolo_to_coco(detections):
        """Turn a detections dict into a list of ``{'bbox': ..., 'category': ...}`` annotations.

        ``detections`` must provide equally long ``'bboxes'`` and ``'labels'`` lists. Each box
        ``b`` is emitted as ``[b[0], b[1], b[2] - b[0], b[3] - b[1]]`` with every entry rounded
        (presumably corner coordinates converted to x/y/width/height -- TODO confirm).
        """
        bboxes, labels = detections['bboxes'], detections['labels']
        assert len(bboxes) == len(labels)
        return [
            {
                'bbox': [round(bbox[0]), round(bbox[1]), round(bbox[2] - bbox[0]), round(bbox[3] - bbox[1])],
                'category': label,
            }
            for bbox, label in zip(bboxes, labels)
        ]

    def test_select_where(self, test_tbl: catalog.Table) -> None:
        """Check that indexing and select()/where() agree, and that bad select lists are rejected."""
        t = test_tbl
        res1 = t[t.c1, t.c2, t.c3].show(0)
        res2 = t.select(t.c1, t.c2, t.c3).show(0)
        assert res1 == res2

        res1 = t[t.c2 < 10][t.c1, t.c2, t.c3].show(0)
        res2 = t.where(t.c2 < 10).select(t.c1, t.c2, t.c3).show(0)
        assert res1 == res2

        res3 = t.where(t.c2 < 10).select(c1=t.c1, c2=t.c2, c3=t.c3).show(0)
        assert res1 == res3

        res4 = t.where(t.c2 < 10).select(t.c1, c2=t.c2, c3=t.c3).show(0)
        assert res1 == res4

        _ = t.where(t.c2 < 10).select(t.c2, t.c2).show(0)  # repeated name no error

        # duplicate select list
        with pytest.raises(excs.Error) as exc_info:
            _ = t.select(t.c1).select(t.c2).show(0)
        assert 'already specified' in str(exc_info.value)

        # invalid expr in select list: Callable is not a valid literal
        with pytest.raises(TypeError) as exc_info:
            _ = t.select(datetime.datetime.now).show(0)
        assert 'Not a valid literal' in str(exc_info.value)

        # catch invalid name in select list from user input
        # only check stuff that's not caught by python kwargs checker
        for bad_name in ('c2-1', '', 'foo.bar', '_c3'):
            with pytest.raises(excs.Error) as exc_info:
                _ = t.select(t.c1, **{bad_name: t.c2}).show(0)
            assert 'Invalid name' in str(exc_info.value), bad_name

        # catch repeated name from user input
        with pytest.raises(excs.Error) as exc_info:
            _ = t.select(t.c2, c2=t.c1).show(0)
        assert 'Repeated column name' in str(exc_info.value)

        with pytest.raises(excs.Error) as exc_info:
            _ = t.select(t.c2+1, col_0=t.c2).show(0)
        assert 'Repeated column name' in str(exc_info.value)

    def test_result_set_iterator(self, test_tbl: catalog.Table) -> None:
        """Check row, column and cell access on a result set against its pandas conversion."""
        t = test_tbl
        res = t.select(t.c1, t.c2, t.c3).collect()
        pd_df = res.to_pandas()

        def check_row(row: Dict[str, Any], idx: int) -> None:
            # every row must expose exactly the three selected columns
            assert len(row) == 3
            assert 'c1' in row
            assert row['c1'] == pd_df['c1'][idx]
            assert 'c2' in row
            assert row['c2'] == pd_df['c2'][idx]
            assert 'c3' in row
            assert row['c3'] == pd_df['c3'][idx]

        # row iteration
        for idx, row in enumerate(res):
            check_row(row, idx)

        # row access
        row = res[0]
        check_row(row, 0)

        # column access
        col_values = res['c2']
        assert col_values == pd_df['c2'].values.tolist()

        # cell access
        assert res[0, 'c2'] == pd_df['c2'][0]
        assert res[0, 'c2'] == res[0, 1]

        with pytest.raises(excs.Error) as exc_info:
            _ = res['does_not_exist']
        assert 'Invalid column name' in str(exc_info.value)

        with pytest.raises(excs.Error) as exc_info:
            _ = res[0, 'does_not_exist']
        assert 'Invalid column name' in str(exc_info.value)

        with pytest.raises(excs.Error) as exc_info:
            _ = res[0, 0, 0]
        assert 'Bad index' in str(exc_info.value)

        with pytest.raises(excs.Error) as exc_info:
            _ = res['c2', 0]
        assert 'Bad index' in str(exc_info.value)

    def test_order_by(self, test_tbl: catalog.Table) -> None:
        """Check that chained order_by() calls execute and that non-expression arguments are rejected."""
        t = test_tbl
        # only verifies that the query runs; the result itself is not inspected
        _ = t.select(t.c4, t.c2).order_by(t.c4).order_by(t.c2, asc=False).show(0)

        # invalid expr in order_by()
        with pytest.raises(excs.Error) as exc_info:
            _ = t.order_by(datetime.datetime.now()).show(0)
        assert 'Invalid expression' in str(exc_info.value)

    def test_head_tail(self, test_tbl: catalog.Table) -> None:
        """Check head()/tail() with and without where(), and that order_by() is rejected for both."""
        t = test_tbl
        res = t.head(10).to_pandas()
        assert np.all(res.c2 == list(range(10)))
        # Where is applied
        res = t.where(t.c2 > 9).head(10).to_pandas()
        assert np.all(res.c2 == list(range(10, 20)))
        # order_by() is an error
        with pytest.raises(excs.Error) as exc_info:
            _ = t.order_by(t.c2).head(10)
        assert 'cannot be used with order_by' in str(exc_info.value)

        res = t.tail().to_pandas()
        assert np.all(res.c2 == list(range(90, 100)))
        res = t.where(t.c2 < 90).tail().to_pandas()
        assert np.all(res.c2 == list(range(80, 90)))
        # order_by() is an error
        with pytest.raises(excs.Error) as exc_info:
            _ = t.order_by(t.c2).tail(10)
        assert 'cannot be used with order_by' in str(exc_info.value)

    def test_describe(self, test_tbl: catalog.Table) -> None:
        """Smoke-test describe(), __repr__() and _repr_html_() on a DataFrame."""
        t = test_tbl
        df = t.select(t.c1).where(t.c2 < 10).limit(10)
        df.describe()

        # TODO: how do you check the output of these?
        _ = df.__repr__()
        _ = df._repr_html_()

    def test_count(self, test_tbl: catalog.Table, small_img_tbl) -> None:
        """Check count() with and without a where clause, and the where clauses it rejects."""
        skip_test_if_not_installed('nos')
        t = test_tbl
        cnt = t.count()
        assert cnt == 100

        cnt = t.where(t.c2 < 10).count()
        assert cnt == 10

        # count() doesn't work with similarity search
        t = small_img_tbl
        probe = t.select(t.img).show(1)
        img = probe[0, 0]
        with pytest.raises(excs.Error):
            _ = t.where(t.img.nearest(img)).count()
        with pytest.raises(excs.Error):
            _ = t.where(t.img.nearest('car')).count()

        # for now, count() doesn't work with non-SQL Where clauses
        with pytest.raises(excs.Error):
            _ = t.where(t.img.width > 100).count()

    def test_select_literal(self, test_tbl: catalog.Table) -> None:
        """Check that a literal in the select list is returned once per qualifying row."""
        t = test_tbl
        res = t.select(1.0).where(t.c2 < 10).collect()
        assert res[res.column_names()[0]] == [1.0] * 10

    # TODO This test doesn't work on Windows due to reliance on the structure of file URLs
    @pytest.mark.skip('Test is not portable')
    def test_html_media_url(self, test_client: pxt.Client) -> None:
        """Check that the HTML repr embeds one fetchable source per video/audio cell."""
        tab = test_client.create_table('test_html_repr', {'video': pxt.VideoType(), 'audio': pxt.AudioType()})
        status = tab.insert(video=get_video_files()[0], audio=get_audio_files()[0])
        assert status.num_rows == 1
        assert status.num_excs == 0

        res = tab.select(tab.video, tab.audio).collect()
        doc = bs4.BeautifulSoup(res._repr_html_(), features='html.parser')
        video_tags = doc.find_all('video')
        assert len(video_tags) == 1
        audio_tags = doc.find_all('audio')
        assert len(audio_tags) == 1

        # get the source elements and test their src attributes
        for tag in video_tags + audio_tags:
            sources = tag.find_all('source')
            assert len(sources) == 1
            for src in sources:
                # bound the request so an unresponsive media server fails the test instead of hanging it
                response = requests.get(src['src'], timeout=10)
                assert response.status_code == 200

    def test_to_pytorch_dataset(self, all_datatypes_tbl: catalog.Table) -> None:
        """ tests all types are handled correctly in this conversion
        """
        skip_test_if_not_installed('torch')
        import torch

        t = all_datatypes_tbl
        df = t.where(t.row_id < 1)
        assert df.count() > 0
        ds = df.to_pytorch_dataset()
        type_dict = dict(zip(df.get_column_names(), df.get_column_types()))
        for tup in ds:
            for col in df.get_column_names():
                assert col in tup

            arrval = tup['c_array']
            assert isinstance(arrval, np.ndarray)
            col_type = type_dict['c_array']
            assert arrval.dtype == col_type.numpy_dtype()
            assert arrval.shape == col_type.shape
            assert arrval.dtype == np.float32
            assert arrval.flags["WRITEABLE"], 'required by pytorch collate function'

            assert isinstance(tup['c_bool'], bool)
            assert isinstance(tup['c_int'], int)
            assert isinstance(tup['c_float'], float)
            assert isinstance(tup['c_timestamp'], float)
            assert torch.is_tensor(tup['c_image'])
            assert isinstance(tup['c_video'], str)
            assert isinstance(tup['c_json'], dict)

    def test_to_pytorch_image_format(self, all_datatypes_tbl: catalog.Table) -> None:
        """ tests the image_format parameter is honored
        """
        skip_test_if_not_installed('torch')
        import torch
        import torchvision.transforms as T

        W, H = 220, 224  # make different from each other
        t = all_datatypes_tbl
        df = t.select(
            t.row_id,
            t.c_image,
            c_image_xformed=t.c_image.resize([W, H]).convert('RGB')
        ).where(t.row_id < 1)

        pandas_df = df.show().to_pandas()
        im_plain = pandas_df['c_image'].values[0]
        im_xformed = pandas_df['c_image_xformed'].values[0]
        assert pandas_df.shape[0] == 1

        ds = df.to_pytorch_dataset(image_format='np')
        ds_ptformat = df.to_pytorch_dataset(image_format='pt')

        elt_count = 0
        for elt, elt_pt in zip(ds, ds_ptformat):
            arr_plain = elt['c_image']
            assert isinstance(arr_plain, np.ndarray)
            assert arr_plain.flags["WRITEABLE"], 'required by pytorch collate function'

            # NB: compare numpy array bc PIL.Image object itself is not using same file.
            assert (arr_plain == np.array(im_plain)).all(), 'numpy image should be the same as the original'
            arr_xformed = elt['c_image_xformed']
            assert isinstance(arr_xformed, np.ndarray)
            assert arr_xformed.flags["WRITEABLE"], 'required by pytorch collate function'

            assert arr_xformed.shape == (H, W, 3)
            assert arr_xformed.dtype == np.uint8
            # same as above, compare numpy array bc PIL.Image object itself is not using same file.
            assert (arr_xformed == np.array(im_xformed)).all(),\
                'numpy image array for xformed image should be the same as the original'

            # now compare pytorch version
            arr_pt = elt_pt['c_image']
            assert torch.is_tensor(arr_pt)
            arr_pt = elt_pt['c_image_xformed']
            assert torch.is_tensor(arr_pt)
            assert arr_pt.shape == (3, H, W)
            assert arr_pt.dtype == torch.float32
            assert (0.0 <= arr_pt).all()
            assert (arr_pt <= 1.0).all()
            assert torch.isclose(T.ToTensor()(arr_xformed), arr_pt).all(),\
                'pytorch image should be consistent with numpy image'
            elt_count += 1
        assert elt_count == 1

    @pytest.mark.skip('Flaky test (fails intermittently)')
    def test_to_pytorch_dataloader(self, all_datatypes_tbl: catalog.Table) -> None:
        """ Tests the dataset works well with pytorch dataloader:
        1. compatibility with multiprocessing
        2. compatibility of all types with default collate_fn
        """
        skip_test_if_not_installed('torch')
        import torch.utils.data

        @pxt.udf(param_types=[pxt.JsonType()], return_type=pxt.JsonType())
        def restrict_json_for_default_collate(obj):
            keys = ['id', 'label', 'iscrowd', 'bounding_box']
            return {k: obj[k] for k in keys}

        t = all_datatypes_tbl
        df = t.select(
            t.row_id,
            t.c_int,
            t.c_float,
            t.c_bool,
            t.c_timestamp,
            t.c_array,
            t.c_video,
            # default collate_fn doesn't support null values, nor lists of different lengths
            # but does allow some dictionaries if they are uniform
            c_json=restrict_json_for_default_collate(t.c_json.detections[0]),
            # images must be uniform shape for pytorch collate_fn to not fail
            c_image=t.c_image.resize([220, 224]).convert('RGB')
        )
        df_size = df.count()
        ds = df.to_pytorch_dataset(image_format='pt')
        # test serialization:
        # - pickle.dumps() and pickle.loads() must work so that
        #   we can use num_workers > 0
        x = pickle.dumps(ds)
        _ = pickle.loads(x)

        # test we get all rows
        def check_recover_all_rows(ds, size: int, **kwargs):
            dl = torch.utils.data.DataLoader(ds, **kwargs)
            loaded_ids = set()
            for batch in dl:
                for row_id in batch['row_id']:
                    val = int(row_id)  # np.int -> int or will fail set equality test below.
                    assert val not in loaded_ids, val
                    loaded_ids.add(val)

            assert loaded_ids == set(range(size))

        # check different number of workers
        check_recover_all_rows(ds, size=df_size, batch_size=3, num_workers=0)  # within this process
        check_recover_all_rows(ds, size=df_size, batch_size=3, num_workers=2)  # two separate processes

        # check edge case where some workers get no rows
        short_size = 1
        df_short = df.where(t.row_id < short_size)
        ds_short = df_short.to_pytorch_dataset(image_format='pt')
        check_recover_all_rows(ds_short, size=short_size, batch_size=13, num_workers=short_size+1)

    def test_pytorch_dataset_caching(self, all_datatypes_tbl: catalog.Table) -> None:
        """ Tests that dataset caching works
        1. using the same dataset twice in a row uses the cache
        2. adding a row to the table invalidates the cached version
        3. changing the select list invalidates the cached version
        """
        skip_test_if_not_installed('torch')
        t = all_datatypes_tbl

        t.drop_column('c_video')  # null value video column triggers internal assertions in DataRow
        # see https://github.com/pixeltable/pixeltable/issues/38

        t.drop_column('c_array')  # no support yet for null array values in the pytorch dataset

        def _get_mtimes(cache_dir: Path):
            # snapshot of modification times, keyed by file name, for everything directly in cache_dir
            return {p.name: p.stat().st_mtime for p in cache_dir.iterdir()}

        # check result cached
        ds1 = t.to_pytorch_dataset(image_format='pt')
        ds1_mtimes = _get_mtimes(ds1.path)

        ds2 = t.to_pytorch_dataset(image_format='pt')
        ds2_mtimes = _get_mtimes(ds2.path)
        assert ds2.path == ds1.path, 'result should be cached'
        assert ds2_mtimes == ds1_mtimes, 'no extra file system work should have occurred'

        # check invalidation on insert
        t_size = t.count()
        t.insert(row_id=t_size)
        ds3 = t.to_pytorch_dataset(image_format='pt')
        assert ds3.path != ds1.path, 'different path should be used'

        # check select list invalidation
        ds4 = t.select(t.row_id).to_pytorch_dataset(image_format='pt')
        assert ds4.path != ds3.path, 'different select list, hence different path should be used'

    def test_to_coco(self, test_client: pxt.Client) -> None:
        """Check COCO export: valid output, caching, invalidation on insert, and select-list validation."""
        skip_test_if_not_installed('nos')
        from pycocotools.coco import COCO
        cl = test_client
        base_t = cl.create_table('videos', {'video': pxt.VideoType()})
        args = {'video': base_t.video, 'fps': 1}
        view_t = cl.create_view('frames', base_t, iterator_class=FrameIterator, iterator_args=args)
        from pixeltable.functions.nos.object_detection_2d import yolox_medium
        view_t.add_column(detections=yolox_medium(view_t.frame))
        base_t.insert(video=get_video_files()[0])

        query = view_t.select({'image': view_t.frame, 'annotations': self.yolo_to_coco(view_t.detections)})
        path = query.to_coco_dataset()
        # we get a valid COCO dataset
        coco_ds = COCO(path)
        assert len(coco_ds.imgs) == view_t.count()

        # we call to_coco_dataset() again and get the cached dataset
        new_path = query.to_coco_dataset()
        assert path == new_path

        # the cache is invalidated when we add more data
        base_t.insert(video=get_video_files()[1])
        new_path = query.to_coco_dataset()
        assert path != new_path
        coco_ds = COCO(new_path)
        assert len(coco_ds.imgs) == view_t.count()

        # incorrect select list
        with pytest.raises(excs.Error) as exc_info:
            _ = view_t.select({'image': view_t.frame, 'annotations': view_t.detections}).to_coco_dataset()
        assert '"annotations" is not a list' in str(exc_info.value)

        with pytest.raises(excs.Error) as exc_info:
            _ = view_t.select(view_t.detections).to_coco_dataset()
        assert 'missing key "image"' in str(exc_info.value).lower()
@@ -1,107 +0,0 @@
1
- import pytest
2
-
3
- import pixeltable as pxt
4
- from pixeltable import exceptions as excs
5
- from pixeltable.tests.utils import make_tbl
6
-
7
-
8
class TestDirs:
    """Tests for creating, listing, removing and moving directories through ``pxt.Client``."""

    def test_create(self, test_client: pxt.Client) -> None:
        """Create nested dirs, check the rejected create_dir() calls, then verify listings after a reload."""
        client = test_client
        dirs = ['dir1', 'dir1.sub1', 'dir1.sub1.subsub1']
        for dir_path in dirs:
            client.create_dir(dir_path)

        # invalid names
        for bad_name in ('1dir', '_dir1', 'dir 1', 'dir1..sub2', 'dir1.sub2.', 'dir1:sub2.'):
            with pytest.raises(excs.Error):
                client.create_dir(bad_name)

        # existing dirs
        with pytest.raises(excs.Error):
            client.create_dir('dir1')
        client.create_dir('dir1', ignore_errors=True)
        for existing_dir in ('dir1.sub1', 'dir1.sub1.subsub1'):
            with pytest.raises(excs.Error):
                client.create_dir(existing_dir)

        # existing table
        make_tbl(client, 'dir1.t1')
        with pytest.raises(excs.Error):
            client.create_dir('dir1.t1')

        # presumably rejected because the parent 'dir2' was never created -- TODO confirm
        with pytest.raises(excs.Error):
            client.create_dir('dir2.sub2')
        # presumably rejected because the parent 't2' is a table, not a directory -- TODO confirm
        make_tbl(client, 't2')
        with pytest.raises(excs.Error):
            client.create_dir('t2.sub2')

        # new client: force loading from store
        reloaded = pxt.Client(reload=True)

        assert reloaded.list_dirs(recursive=True) == dirs
        assert reloaded.list_dirs(recursive=False) == ['dir1']
        assert reloaded.list_dirs('dir1', recursive=True) == ['dir1.sub1', 'dir1.sub1.subsub1']
        assert reloaded.list_dirs('dir1', recursive=False) == ['dir1.sub1']
        assert reloaded.list_dirs('dir1.sub1', recursive=True) == ['dir1.sub1.subsub1']
        assert reloaded.list_dirs('dir1.sub1', recursive=False) == ['dir1.sub1.subsub1']

    def test_rm(self, test_client: pxt.Client) -> None:
        """Check the rejected rm_dir() calls, then remove a leaf dir and verify it is gone after a reload."""
        client = test_client
        for dir_path in ['dir1', 'dir1.sub1', 'dir1.sub1.subsub1']:
            client.create_dir(dir_path)
        make_tbl(client, 't1')
        make_tbl(client, 'dir1.t1')

        rejected_paths = [
            '1dir',        # bad name
            'dir1..sub1',  # bad path
            'dir2',        # doesn't exist
            'dir1',        # not empty
        ]
        for dir_path in rejected_paths:
            with pytest.raises(excs.Error):
                client.rm_dir(dir_path)

        client.rm_dir('dir1.sub1.subsub1')
        assert client.list_dirs('dir1.sub1') == []

        # check after reloading
        client = pxt.Client(reload=True)
        assert client.list_dirs('dir1.sub1') == []

    def test_move(self, test_client: pxt.Client) -> None:
        """Rename a table, move a whole directory, and verify the table listing after each step and a reload."""
        client = test_client
        client.create_dir('dir1')
        client.create_dir('dir1.sub1')
        make_tbl(client, 'dir1.sub1.t1')
        assert client.list_tables('dir1') == ['dir1.sub1.t1']

        # rename the table within its directory
        client.move('dir1.sub1.t1', 'dir1.sub1.t2')
        assert client.list_tables('dir1') == ['dir1.sub1.t2']

        # move the whole directory tree under a new parent
        client.create_dir('dir2')
        client.move('dir1', 'dir2.dir1')
        expected_tables = ['dir2.dir1.sub1.t2']
        assert client.list_tables('dir2') == expected_tables

        # new client: force loading from store
        reloaded = pxt.Client(reload=True)
        assert reloaded.list_tables('dir2') == expected_tables
@@ -1,120 +0,0 @@
1
- import itertools
2
- import json
3
- import re
4
- from typing import Optional, Set, List
5
-
6
- import pytest
7
-
8
- import pixeltable as pxt
9
- from pixeltable.iterators.document import DocumentSplitter
10
- from pixeltable.tests.utils import get_documents, get_video_files, get_audio_files, get_image_files
11
- from pixeltable.tests.utils import skip_test_if_not_installed
12
- from pixeltable.type_system import DocumentType
13
-
14
-
15
class TestDocument:
    """Tests for document-typed columns and the ``DocumentSplitter`` iterator."""

    def valid_doc_paths(self) -> List[str]:
        """Return the document paths provided by the test utilities."""
        return get_documents()

    def invalid_doc_paths(self) -> List[str]:
        """Return one video, one audio and one image path, none of which is a document."""
        return [get_video_files()[0], get_audio_files()[0], get_image_files()[0]]

    def test_insert(self, test_client: pxt.Client) -> None:
        """Insert valid documents without errors, then check every non-document row is counted as an exception."""
        file_paths = self.valid_doc_paths()
        cl = test_client
        doc_t = cl.create_table('docs', {'doc': DocumentType()})
        status = doc_t.insert({'doc': p} for p in file_paths)
        assert status.num_rows == len(file_paths)
        assert status.num_excs == 0
        stored_paths = doc_t.select(output=doc_t.doc.localpath).collect()['output']
        assert set(stored_paths) == set(file_paths)

        file_paths = self.invalid_doc_paths()
        status = doc_t.insert(({'doc': p} for p in file_paths), fail_on_exception=False)
        assert status.num_rows == len(file_paths)
        assert status.num_excs == len(file_paths)

    def test_doc_splitter(self, test_client: pxt.Client) -> None:
        """Check that every separator/limit combination extracts the same text and headings in aggregate."""
        skip_test_if_not_installed('tiktoken')
        file_paths = self.valid_doc_paths()
        cl = test_client
        doc_t = cl.create_table('docs', {'doc': DocumentType()})
        status = doc_t.insert({'doc': p} for p in file_paths)
        assert status.num_excs == 0

        def normalize(s: str) -> str:
            # remove whitespace
            res = re.sub(r'\s+', '', s)
            # remove non-ascii
            res = res.encode('ascii', 'ignore').decode()
            return res

        def describe_mismatch(actual: str, expected: str) -> str:
            # Locate the first differing character; when one string is a strict prefix of the
            # other, the difference starts at the end of the shorter string.
            first_diff_index = next(
                (i for i, (c1, c2) in enumerate(zip(actual, expected)) if c1 != c2),
                min(len(actual), len(expected)))
            start = max(0, first_diff_index - 10)
            end = first_diff_index + 10
            return f'first difference at {first_diff_index}: {actual[start:end]!r} vs. {expected[start:end]!r}'

        # run all combinations of (heading, paragraph, sentence) x (token_limit, char_limit, None)
        # and make sure they extract the same text in aggregate
        all_text_reference: Optional[str] = None  # all text as a single string; normalized
        headings_reference: Set[str] = set()  # headings metadata as a json-serialized string
        for sep1 in ['heading', 'paragraph', 'sentence']:
            for sep2 in [None, 'token_limit', 'char_limit']:
                chunk_limits = [10, 20, 100] if sep2 is not None else [None]
                for limit in chunk_limits:
                    iterator_args = {
                        'document': doc_t.doc,
                        'separators': sep1 + (',' + sep2 if sep2 is not None else ''),
                        'metadata': 'title,headings,sourceline'
                    }
                    if sep2 is not None:
                        iterator_args['limit'] = limit
                        iterator_args['overlap'] = 0
                    chunks_t = cl.create_view(
                        'chunks', doc_t, iterator_class=DocumentSplitter, iterator_args=iterator_args)
                    res = list(chunks_t.order_by(chunks_t.doc, chunks_t.pos).collect())

                    all_text = normalize(''.join(r['text'] for r in res))
                    headings = {json.dumps(r['headings']) for r in res}
                    if all_text_reference is None:
                        # the first combination defines the reference for all later ones
                        all_text_reference = all_text
                        headings_reference = headings
                    else:
                        combo = f'{sep1}, {sep2}, {limit}'
                        # the mismatch context is only computed if the assertion fails
                        assert all_text == all_text_reference, \
                            f'{combo}: {describe_mismatch(all_text, all_text_reference)}'
                        assert headings == headings_reference, combo
                    # TODO: verify chunk limit
                    # drop the view so the next combination can reuse the name 'chunks'
                    cl.drop_table('chunks')

    def test_doc_splitter_headings(self, test_client: pxt.Client) -> None:
        """Check that a chunk view exposes exactly the metadata columns that were requested."""
        skip_test_if_not_installed('spacy')
        file_paths = self.valid_doc_paths()
        cl = test_client
        doc_t = cl.create_table('docs', {'doc': DocumentType()})
        status = doc_t.insert({'doc': p} for p in file_paths)
        assert status.num_excs == 0

        # verify that only the requested metadata is present in the view
        md_elements = ['title', 'headings', 'sourceline']
        # every subset of md_elements, including the empty one
        md_tuples = list(itertools.chain.from_iterable(
            itertools.combinations(md_elements, i) for i in range(len(md_elements) + 1)))
        for md_str in [','.join(t) for t in md_tuples]:
            iterator_args = {
                'document': doc_t.doc,
                'separators': 'sentence',
                'metadata': md_str
            }
            chunks_t = cl.create_view(
                'chunks', doc_t, iterator_class=DocumentSplitter, iterator_args=iterator_args)
            res = chunks_t.order_by(chunks_t.doc, chunks_t.pos).collect()
            requested_md_elements = set(md_str.split(','))
            for md_element in md_elements:
                if md_element in requested_md_elements:
                    _ = res[md_element]
                else:
                    with pytest.raises(pxt.Error):
                        _ = res[md_element]
            cl.drop_table('chunks')