pixeltable 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (110) hide show
  1. pixeltable/__init__.py +20 -9
  2. pixeltable/__version__.py +3 -0
  3. pixeltable/catalog/column.py +23 -7
  4. pixeltable/catalog/insertable_table.py +32 -19
  5. pixeltable/catalog/table.py +210 -20
  6. pixeltable/catalog/table_version.py +272 -111
  7. pixeltable/catalog/table_version_path.py +6 -1
  8. pixeltable/dataframe.py +184 -110
  9. pixeltable/datatransfer/__init__.py +1 -0
  10. pixeltable/datatransfer/label_studio.py +526 -0
  11. pixeltable/datatransfer/remote.py +113 -0
  12. pixeltable/env.py +213 -79
  13. pixeltable/exec/__init__.py +2 -1
  14. pixeltable/exec/data_row_batch.py +6 -7
  15. pixeltable/exec/expr_eval_node.py +28 -28
  16. pixeltable/exec/sql_scan_node.py +7 -6
  17. pixeltable/exprs/__init__.py +4 -3
  18. pixeltable/exprs/column_ref.py +11 -2
  19. pixeltable/exprs/comparison.py +39 -1
  20. pixeltable/exprs/data_row.py +7 -0
  21. pixeltable/exprs/expr.py +26 -19
  22. pixeltable/exprs/function_call.py +17 -18
  23. pixeltable/exprs/globals.py +14 -2
  24. pixeltable/exprs/image_member_access.py +9 -28
  25. pixeltable/exprs/in_predicate.py +96 -0
  26. pixeltable/exprs/inline_array.py +13 -11
  27. pixeltable/exprs/inline_dict.py +15 -13
  28. pixeltable/exprs/row_builder.py +7 -1
  29. pixeltable/exprs/similarity_expr.py +67 -0
  30. pixeltable/ext/functions/whisperx.py +30 -0
  31. pixeltable/ext/functions/yolox.py +16 -0
  32. pixeltable/func/__init__.py +0 -2
  33. pixeltable/func/aggregate_function.py +5 -2
  34. pixeltable/func/callable_function.py +57 -13
  35. pixeltable/func/expr_template_function.py +14 -3
  36. pixeltable/func/function.py +35 -4
  37. pixeltable/func/signature.py +5 -15
  38. pixeltable/func/udf.py +8 -12
  39. pixeltable/functions/fireworks.py +9 -4
  40. pixeltable/functions/huggingface.py +48 -5
  41. pixeltable/functions/openai.py +49 -11
  42. pixeltable/functions/pil/image.py +61 -64
  43. pixeltable/functions/together.py +32 -6
  44. pixeltable/functions/util.py +0 -43
  45. pixeltable/functions/video.py +46 -8
  46. pixeltable/globals.py +443 -0
  47. pixeltable/index/__init__.py +1 -0
  48. pixeltable/index/base.py +9 -2
  49. pixeltable/index/btree.py +54 -0
  50. pixeltable/index/embedding_index.py +91 -15
  51. pixeltable/io/__init__.py +4 -0
  52. pixeltable/io/globals.py +59 -0
  53. pixeltable/{utils → io}/hf_datasets.py +48 -17
  54. pixeltable/io/pandas.py +148 -0
  55. pixeltable/{utils → io}/parquet.py +58 -33
  56. pixeltable/iterators/__init__.py +1 -1
  57. pixeltable/iterators/base.py +8 -4
  58. pixeltable/iterators/document.py +225 -93
  59. pixeltable/iterators/video.py +16 -9
  60. pixeltable/metadata/__init__.py +8 -4
  61. pixeltable/metadata/converters/convert_12.py +3 -0
  62. pixeltable/metadata/converters/convert_13.py +41 -0
  63. pixeltable/metadata/converters/convert_14.py +13 -0
  64. pixeltable/metadata/converters/convert_15.py +29 -0
  65. pixeltable/metadata/converters/util.py +63 -0
  66. pixeltable/metadata/schema.py +12 -6
  67. pixeltable/plan.py +11 -24
  68. pixeltable/store.py +16 -23
  69. pixeltable/tool/create_test_db_dump.py +49 -14
  70. pixeltable/type_system.py +27 -58
  71. pixeltable/utils/coco.py +94 -0
  72. pixeltable/utils/documents.py +42 -12
  73. pixeltable/utils/http_server.py +70 -0
  74. pixeltable-0.2.7.dist-info/METADATA +137 -0
  75. pixeltable-0.2.7.dist-info/RECORD +126 -0
  76. {pixeltable-0.2.5.dist-info → pixeltable-0.2.7.dist-info}/WHEEL +1 -1
  77. pixeltable/client.py +0 -600
  78. pixeltable/exprs/image_similarity_predicate.py +0 -58
  79. pixeltable/func/batched_function.py +0 -53
  80. pixeltable/func/nos_function.py +0 -202
  81. pixeltable/tests/conftest.py +0 -171
  82. pixeltable/tests/ext/test_yolox.py +0 -21
  83. pixeltable/tests/functions/test_fireworks.py +0 -43
  84. pixeltable/tests/functions/test_functions.py +0 -60
  85. pixeltable/tests/functions/test_huggingface.py +0 -158
  86. pixeltable/tests/functions/test_openai.py +0 -162
  87. pixeltable/tests/functions/test_together.py +0 -112
  88. pixeltable/tests/test_audio.py +0 -65
  89. pixeltable/tests/test_catalog.py +0 -27
  90. pixeltable/tests/test_client.py +0 -21
  91. pixeltable/tests/test_component_view.py +0 -379
  92. pixeltable/tests/test_dataframe.py +0 -440
  93. pixeltable/tests/test_dirs.py +0 -107
  94. pixeltable/tests/test_document.py +0 -120
  95. pixeltable/tests/test_exprs.py +0 -802
  96. pixeltable/tests/test_function.py +0 -332
  97. pixeltable/tests/test_index.py +0 -138
  98. pixeltable/tests/test_migration.py +0 -44
  99. pixeltable/tests/test_nos.py +0 -54
  100. pixeltable/tests/test_snapshot.py +0 -231
  101. pixeltable/tests/test_table.py +0 -1343
  102. pixeltable/tests/test_transactional_directory.py +0 -42
  103. pixeltable/tests/test_types.py +0 -52
  104. pixeltable/tests/test_video.py +0 -159
  105. pixeltable/tests/test_view.py +0 -535
  106. pixeltable/tests/utils.py +0 -442
  107. pixeltable/utils/clip.py +0 -18
  108. pixeltable-0.2.5.dist-info/METADATA +0 -128
  109. pixeltable-0.2.5.dist-info/RECORD +0 -139
  110. {pixeltable-0.2.5.dist-info → pixeltable-0.2.7.dist-info}/LICENSE +0 -0
@@ -1,1343 +0,0 @@
1
- import datetime
2
- import math
3
- import os
4
- import random
5
- from typing import List, Tuple
6
-
7
- import PIL
8
- import cv2
9
- import numpy as np
10
- import pandas as pd
11
- import pathlib
12
- import pytest
13
-
14
- import pixeltable as pxt
15
- import pixeltable.functions as ptf
16
- from pixeltable import catalog
17
- from pixeltable import exceptions as excs
18
- from pixeltable.iterators import FrameIterator
19
- from pixeltable.tests.utils import \
20
- make_tbl, create_table_data, read_data_file, get_video_files, get_audio_files, get_image_files, get_documents, \
21
- assert_resultset_eq, assert_hf_dataset_equal, make_test_arrow_table, validate_update_status
22
- from pixeltable.tests.utils import skip_test_if_not_installed
23
- from pixeltable.type_system import \
24
- StringType, IntType, FloatType, TimestampType, ImageType, VideoType, JsonType, BoolType, ArrayType, AudioType, \
25
- DocumentType
26
- from pixeltable.utils.filecache import FileCache
27
- from pixeltable.utils.media_store import MediaStore
28
-
29
- class TestTable:
30
- # exc for a % 10 == 0
31
- @pxt.udf(return_type=FloatType(), param_types=[IntType()])
32
- def f1(a: int) -> float:
33
- return a / (a % 10)
34
-
35
- # exception for a == None; this should not get triggered
36
- @pxt.udf(return_type=FloatType(), param_types=[FloatType()])
37
- def f2(a: float) -> float:
38
- return a + 1
39
-
40
- @pxt.expr_udf(param_types=[IntType(nullable=False)])
41
- def add1(a: int) -> int:
42
- return a + 1
43
-
44
- @pxt.uda(
45
- update_types=[IntType()], value_type=IntType(), requires_order_by=True,
46
- allows_window=True)
47
- class window_fn:
48
- def __init__(self):
49
- pass
50
- def update(self, i: int) -> None:
51
- pass
52
- def value(self) -> int:
53
- return 1
54
-
55
- @pxt.expr_udf(param_types=[IntType(nullable=False)])
56
- def add1(a: int) -> int:
57
- return a + 1
58
-
59
- def test_create(self, test_client: pxt.Client) -> None:
60
- cl = test_client
61
- cl.create_dir('dir1')
62
- schema = {
63
- 'c1': StringType(nullable=False),
64
- 'c2': IntType(nullable=False),
65
- 'c3': FloatType(nullable=False),
66
- 'c4': TimestampType(nullable=False),
67
- }
68
- tbl = cl.create_table('test', schema)
69
- _ = cl.create_table('dir1.test', schema)
70
-
71
- with pytest.raises(excs.Error):
72
- _ = cl.create_table('1test', schema)
73
- with pytest.raises(excs.Error):
74
- _ = cl.create_table('bad name', schema={'c1': StringType()})
75
- with pytest.raises(excs.Error):
76
- _ = cl.create_table('test', schema)
77
- with pytest.raises(excs.Error):
78
- _ = cl.create_table('dir2.test2', schema)
79
-
80
- _ = cl.list_tables()
81
- _ = cl.list_tables('dir1')
82
-
83
- with pytest.raises(excs.Error):
84
- _ = cl.list_tables('1dir')
85
- with pytest.raises(excs.Error):
86
- _ = cl.list_tables('dir2')
87
-
88
- # test loading with new client
89
- cl = pxt.Client(reload=True)
90
-
91
- tbl = cl.get_table('test')
92
- assert isinstance(tbl, catalog.InsertableTable)
93
- tbl.add_column(c5=IntType())
94
- tbl.drop_column('c1')
95
- tbl.rename_column('c2', 'c17')
96
-
97
- cl.move('test', 'test2')
98
-
99
- cl.drop_table('test2')
100
- cl.drop_table('dir1.test')
101
-
102
- with pytest.raises(excs.Error):
103
- cl.drop_table('test')
104
- with pytest.raises(excs.Error):
105
- cl.drop_table('dir1.test2')
106
- with pytest.raises(excs.Error):
107
- cl.drop_table('.test2')
108
-
109
- def test_empty_table(self, test_client: pxt.Client) -> None:
110
- cl = test_client
111
- with pytest.raises(excs.Error) as exc_info:
112
- cl.create_table('empty_table', {})
113
- assert 'Table schema is empty' in str(exc_info.value)
114
-
115
- def test_table_attrs(self, test_client: pxt.Client) -> None:
116
- cl = test_client
117
- schema = {'c': StringType(nullable=False)}
118
- num_retained_versions = 20
119
- comment = "This is a table."
120
- tbl = cl.create_table('test_table_attrs', schema, num_retained_versions=num_retained_versions, comment=comment)
121
- assert tbl.num_retained_versions == num_retained_versions
122
- assert tbl.comment == comment
123
- new_num_retained_versions = 30
124
- new_comment = "This is an updated table."
125
- tbl.num_retained_versions = new_num_retained_versions
126
- assert tbl.num_retained_versions == new_num_retained_versions
127
- tbl.comment = new_comment
128
- assert tbl.comment == new_comment
129
- tbl.revert()
130
- assert tbl.comment == comment
131
- tbl.revert()
132
- assert tbl.num_retained_versions == num_retained_versions
133
-
134
- def test_import_parquet(self, test_client: pxt.Client, tmp_path: pathlib.Path) -> None:
135
- skip_test_if_not_installed('pyarrow')
136
- import pyarrow as pa
137
- from pixeltable.utils.arrow import iter_tuples
138
-
139
- parquet_dir = tmp_path / 'test_data'
140
- parquet_dir.mkdir()
141
- make_test_arrow_table(parquet_dir)
142
-
143
- tab = test_client.import_parquet('test_parquet', parquet_path=str(parquet_dir))
144
- assert 'test_parquet' in test_client.list_tables()
145
- assert tab is not None
146
- num_elts = tab.count()
147
- arrow_tab: pa.Table = pa.parquet.read_table(str(parquet_dir))
148
- assert num_elts == arrow_tab.num_rows
149
- assert set(tab.column_names()) == set(arrow_tab.column_names)
150
-
151
- result_set = tab.order_by(tab.c_id).collect()
152
- column_types = tab.column_types()
153
-
154
- for tup, arrow_tup in zip(result_set, iter_tuples(arrow_tab)):
155
- assert tup['c_id'] == arrow_tup['c_id']
156
- for col, val in tup.items():
157
- if val is None:
158
- assert arrow_tup[col] is None
159
- continue
160
-
161
- if column_types[col].is_array_type():
162
- assert (val == arrow_tup[col]).all()
163
- else:
164
- assert val == arrow_tup[col]
165
-
166
- def test_import_huggingface_dataset(self, test_client: pxt.Client, tmp_path: pathlib.Path) -> None:
167
- skip_test_if_not_installed('datasets')
168
- import datasets
169
-
170
- test_cases = [
171
- # { # includes a timestamp. 20MB for specific slice
172
- # Disbled this test case because download is failing, and its not critical.
173
- # 'dataset_name': 'c4',
174
- # # see https://huggingface.co/datasets/allenai/c4/blob/main/realnewslike/c4-train.00000-of-00512.json.gz
175
- # 'dataset': datasets.load_dataset(
176
- # "allenai/c4",
177
- # data_dir="realnewslike",
178
- # data_files="c4-train.00000-of-00512.json.gz",
179
- # split='train[:1000]',
180
- # cache_dir=tmp_path
181
- # ),
182
- # },
183
- { # includes an embedding (array type), common in a few RAG datasets.
184
- 'dataset_name': 'cohere_wikipedia',
185
- 'dataset': datasets.load_dataset("Cohere/wikipedia-2023-11-embed-multilingual-v3",
186
- data_dir='cr').select_columns(['url', 'title', 'text', 'emb']),
187
- # column with name `_id`` is not currently allowed by pixeltable rules,
188
- # so filter out that column.
189
- # cr subdir has a small number of rows, avoid running out of space in CI runner
190
- # see https://huggingface.co/datasets/Cohere/wikipedia-2023-11-embed-multilingual-v3/tree/main/cr
191
- 'schema_override': {'emb': ArrayType((1024,), dtype=FloatType(), nullable=False)}
192
- },
193
- # example of dataset dictionary with multiple splits
194
- {
195
- 'dataset_name': 'rotten_tomatoes',
196
- 'dataset': datasets.load_dataset("rotten_tomatoes"),
197
- },
198
- ]
199
-
200
- # test a column name for splits other than the default of 'split'
201
- split_column_name = 'my_split_col'
202
- for rec in test_cases:
203
- dataset_name = rec['dataset_name']
204
- hf_dataset = rec['dataset']
205
-
206
- tab = test_client.import_huggingface_dataset(
207
- dataset_name,
208
- hf_dataset,
209
- column_name_for_split=split_column_name,
210
- schema_override=rec.get('schema_override', None),
211
- )
212
- if isinstance(hf_dataset, datasets.Dataset):
213
- assert_hf_dataset_equal(hf_dataset, tab.df(), split_column_name)
214
- elif isinstance(hf_dataset, datasets.DatasetDict):
215
- assert tab.count() == sum(hf_dataset.num_rows.values())
216
- assert split_column_name in tab.column_names()
217
-
218
- for dataset_name in hf_dataset:
219
- df = tab.where(tab.my_split_col == dataset_name)
220
- assert_hf_dataset_equal(hf_dataset[dataset_name], df, split_column_name)
221
- else:
222
- assert False
223
-
224
- with pytest.raises(excs.Error) as exc_info:
225
- test_client.import_huggingface_dataset('test', {})
226
- assert 'type(dataset)' in str(exc_info.value)
227
-
228
- def test_image_table(self, test_client: pxt.Client) -> None:
229
- n_sample_rows = 20
230
- cl = test_client
231
- schema = {
232
- 'img': ImageType(nullable=False),
233
- 'category': StringType(nullable=False),
234
- 'split': StringType(nullable=False),
235
- 'img_literal': ImageType(nullable=False),
236
- }
237
- tbl = cl.create_table('test', schema)
238
- assert(MediaStore.count(tbl.get_id()) == 0)
239
-
240
- rows = read_data_file('imagenette2-160', 'manifest.csv', ['img'])
241
- sample_rows = random.sample(rows, n_sample_rows)
242
-
243
- # add literal image data and column
244
- for r in rows:
245
- with open(r['img'], 'rb') as f:
246
- r['img_literal'] = f.read()
247
-
248
- tbl.insert(sample_rows)
249
- assert(MediaStore.count(tbl.get_id()) == n_sample_rows)
250
-
251
- # compare img and img_literal
252
- # TODO: make tbl.select(tbl.img == tbl.img_literal) work
253
- tdf = tbl.select(tbl.img, tbl.img_literal).show()
254
- pdf = tdf.to_pandas()
255
- for tup in pdf.itertuples():
256
- assert tup.img == tup.img_literal
257
-
258
- # Test adding stored image transformation
259
- tbl.add_column(rotated=tbl.img.rotate(30), stored=True)
260
- assert(MediaStore.count(tbl.get_id()) == 2 * n_sample_rows)
261
-
262
- # Test MediaStore.stats()
263
- stats = list(filter(lambda x: x[0] == tbl.get_id(), MediaStore.stats()))
264
- assert len(stats) == 2 # Two columns
265
- assert stats[0][2] == n_sample_rows # Each column has n_sample_rows associated images
266
- assert stats[1][2] == n_sample_rows
267
-
268
- # Test that version-specific images are cleared when table is reverted
269
- tbl.revert()
270
- assert(MediaStore.count(tbl.get_id()) == n_sample_rows)
271
-
272
- # Test that all stored images are cleared when table is dropped
273
- cl.drop_table('test')
274
- assert(MediaStore.count(tbl.get_id()) == 0)
275
-
276
- def test_schema_spec(self, test_client: pxt.Client) -> None:
277
- cl = test_client
278
-
279
- with pytest.raises(excs.Error) as exc_info:
280
- cl.create_table('test', {'c 1': IntType()})
281
- assert 'invalid column name' in str(exc_info.value).lower()
282
-
283
- with pytest.raises(excs.Error) as exc_info:
284
- cl.create_table('test', {'c1': {}})
285
- assert '"type" is required' in str(exc_info.value)
286
-
287
- with pytest.raises(excs.Error) as exc_info:
288
- cl.create_table('test', {'c1': {'xyz': IntType()}})
289
- assert "invalid key 'xyz'" in str(exc_info.value)
290
-
291
- with pytest.raises(excs.Error) as exc_info:
292
- cl.create_table('test', {'c1': {'stored': True}})
293
- assert '"type" is required' in str(exc_info.value)
294
-
295
- with pytest.raises(excs.Error) as exc_info:
296
- cl.create_table('test', {'c1': {'type': 'string'}})
297
- assert 'must be a ColumnType' in str(exc_info.value)
298
-
299
- with pytest.raises(excs.Error) as exc_info:
300
- cl.create_table('test', {'c1': {'value': 1, 'type': StringType()}})
301
- assert '"type" is redundant' in str(exc_info.value)
302
-
303
- with pytest.raises(excs.Error) as exc_info:
304
- cl.create_table('test', {'c1': {'value': pytest}})
305
- assert 'value needs to be either' in str(exc_info.value)
306
-
307
- with pytest.raises(excs.Error) as exc_info:
308
- def f() -> float:
309
- return 1.0
310
- cl.create_table('test', {'c1': {'value': f}})
311
- assert '"type" is required' in str(exc_info.value)
312
-
313
- with pytest.raises(excs.Error) as exc_info:
314
- cl.create_table('test', {'c1': {'type': StringType(), 'stored': 'true'}})
315
- assert '"stored" must be a bool' in str(exc_info.value)
316
-
317
- with pytest.raises(excs.Error) as exc_info:
318
- cl.create_table('test', {'c1': StringType()}, primary_key='c2')
319
- assert 'primary key column c2 not found' in str(exc_info.value).lower()
320
-
321
- with pytest.raises(excs.Error) as exc_info:
322
- cl.create_table('test', {'c1': StringType()}, primary_key=['c1', 'c2'])
323
- assert 'primary key column c2 not found' in str(exc_info.value).lower()
324
-
325
- with pytest.raises(excs.Error) as exc_info:
326
- cl.create_table('test', {'c1': StringType()}, primary_key=['c2'])
327
- assert 'primary key column c2 not found' in str(exc_info.value).lower()
328
-
329
- with pytest.raises(excs.Error) as exc_info:
330
- cl.create_table('test', {'c1': StringType()}, primary_key=0)
331
- assert 'primary_key must be a' in str(exc_info.value).lower()
332
-
333
- with pytest.raises(excs.Error) as exc_info:
334
- cl.create_table('test', {'c1': StringType(nullable=True)}, primary_key='c1')
335
- assert 'cannot be nullable' in str(exc_info.value).lower()
336
-
337
- def check_bad_media(
338
- self, test_client: pxt.Client, rows: List[Tuple[str, bool]], col_type: pxt.ColumnType,
339
- validate_local_path: bool = True
340
- ) -> None:
341
- schema = {
342
- 'media': col_type,
343
- 'is_bad_media': BoolType(nullable=False),
344
- }
345
- tbl = test_client.create_table('test', schema)
346
-
347
- assert len(rows) > 0
348
- total_bad_rows = sum([int(row['is_bad_media']) for row in rows])
349
- assert total_bad_rows > 0
350
-
351
- # Mode 1: Validation error on bad input (default)
352
- # we ignore the exact error here, because it depends on the media type
353
- with pytest.raises(excs.Error):
354
- tbl.insert(rows, fail_on_exception=True)
355
-
356
- # Mode 2: ignore_errors=True, store error information in table
357
- status = tbl.insert(rows, fail_on_exception=False)
358
- _ = tbl.select(tbl.media, tbl.media.errormsg).show()
359
- assert status.num_rows == len(rows)
360
- assert status.num_excs == total_bad_rows
361
-
362
- # check that we have the right number of bad and good rows
363
- assert tbl.where(tbl.is_bad_media == True).count() == total_bad_rows
364
- assert tbl.where(tbl.is_bad_media == False).count() == len(rows) - total_bad_rows
365
-
366
- # check error type is set correctly
367
- assert tbl.where((tbl.is_bad_media == True) & (tbl.media.errortype == None)).count() == 0
368
- assert tbl.where((tbl.is_bad_media == False) & (tbl.media.errortype == None)).count() \
369
- == len(rows) - total_bad_rows
370
-
371
- # check fileurl is set for valid images, and check no file url is set for bad images
372
- assert tbl.where((tbl.is_bad_media == False) & (tbl.media.fileurl == None)).count() == 0
373
- assert tbl.where((tbl.is_bad_media == True) & (tbl.media.fileurl != None)).count() == 0
374
-
375
- if validate_local_path:
376
- # check that tbl.media is a valid local path
377
- paths = tbl.where(tbl.media != None).select(output=tbl.media).collect()['output']
378
- for path in paths:
379
- assert os.path.exists(path) and os.path.isfile(path)
380
-
381
- def test_validate_image(self, test_client: pxt.Client) -> None:
382
- rows = read_data_file('imagenette2-160', 'manifest_bad.csv', ['img'])
383
- rows = [{'media': r['img'], 'is_bad_media': r['is_bad_image']} for r in rows]
384
- self.check_bad_media(test_client, rows, ImageType(nullable=True), validate_local_path=False)
385
-
386
- def test_validate_video(self, test_client: pxt.Client) -> None:
387
- files = get_video_files(include_bad_video=True)
388
- rows = [{'media': f, 'is_bad_media': f.endswith('bad_video.mp4')} for f in files]
389
- self.check_bad_media(test_client, rows, VideoType(nullable=True))
390
-
391
- def test_validate_audio(self, test_client: pxt.Client) -> None:
392
- files = get_audio_files(include_bad_audio=True)
393
- rows = [{'media': f, 'is_bad_media': f.endswith('bad_audio.mp3')} for f in files]
394
- self.check_bad_media(test_client, rows, AudioType(nullable=True))
395
-
396
- def test_validate_docs(self, test_client: pxt.Client) -> None:
397
- valid_doc_paths = get_documents()
398
- invalid_doc_paths = [get_video_files()[0], get_audio_files()[0], get_image_files()[0]]
399
- doc_paths = valid_doc_paths + invalid_doc_paths
400
- is_valid = [True] * len(valid_doc_paths) + [False] * len(invalid_doc_paths)
401
- rows = [{'media': f, 'is_bad_media': not is_valid} for f, is_valid in zip(doc_paths, is_valid)]
402
- self.check_bad_media(test_client, rows, DocumentType(nullable=True))
403
-
404
- def test_validate_external_url(self, test_client: pxt.Client) -> None:
405
- skip_test_if_not_installed('boto3')
406
- rows = [
407
- {'media': 's3://open-images-dataset/validation/doesnotexist.jpg', 'is_bad_media': True},
408
- {'media': 'https://archive.random.org/download?file=2024-01-28.bin', 'is_bad_media': True}, # 403 error
409
- {'media': 's3://open-images-dataset/validation/3c02ca9ec9b2b77b.jpg', 'is_bad_media': True}, # wrong media
410
- # test s3 url
411
- {
412
- 'media': 's3://multimedia-commons/data/videos/mp4/ffe/ff3/ffeff3c6bf57504e7a6cecaff6aefbc9.mp4',
413
- 'is_bad_media': False
414
- },
415
- # test http url
416
- {
417
- 'media': 'https://github.com/pixeltable/pixeltable/raw/master/pixeltable/tests/data/videos/bangkok.mp4',
418
- 'is_bad_media': False
419
- },
420
-
421
- ]
422
- self.check_bad_media(test_client, rows, VideoType(nullable=True))
423
-
424
- def test_create_s3_image_table(self, test_client: pxt.Client) -> None:
425
- skip_test_if_not_installed('boto3')
426
- cl = test_client
427
- tbl = cl.create_table('test', {'img': ImageType(nullable=False)})
428
- # this is needed because Client.reset_catalog() doesn't call TableVersion.drop(), which would
429
- # clear the file cache
430
- # TODO: change reset_catalog() to drop tables
431
- FileCache.get().clear()
432
- cache_stats = FileCache.get().stats()
433
- assert cache_stats.num_requests == 0, f'{str(cache_stats)} tbl_id={tbl.get_id()}'
434
- # add computed column to make sure that external files are cached locally during insert
435
- tbl.add_column(rotated=tbl.img.rotate(30), stored=True)
436
- urls = [
437
- 's3://open-images-dataset/validation/3c02ca9ec9b2b77b.jpg',
438
- 's3://open-images-dataset/validation/3c13e0015b6c3bcf.jpg',
439
- 's3://open-images-dataset/validation/3ba5380490084697.jpg',
440
- 's3://open-images-dataset/validation/3afeb4b34f90c0cf.jpg',
441
- 's3://open-images-dataset/validation/3b07a2c0d5c0c789.jpg',
442
- ]
443
-
444
- tbl.insert({'img': url} for url in urls)
445
- # check that we populated the cache
446
- cache_stats = FileCache.get().stats()
447
- assert cache_stats.num_requests == len(urls), f'{str(cache_stats)} tbl_id={tbl.get_id()}'
448
- assert cache_stats.num_hits == 0
449
- assert FileCache.get().num_files() == len(urls)
450
- assert FileCache.get().num_files(tbl.get_id()) == len(urls)
451
- assert FileCache.get().avg_file_size() > 0
452
-
453
- # query: we read from the cache
454
- _ = tbl.show(0)
455
- cache_stats = FileCache.get().stats()
456
- assert cache_stats.num_requests == 2 * len(urls)
457
- assert cache_stats.num_hits == len(urls)
458
-
459
- # after clearing the cache, we need to re-fetch the files
460
- FileCache.get().clear()
461
- _ = tbl.show(0)
462
- cache_stats = FileCache.get().stats()
463
- assert cache_stats.num_requests == len(urls)
464
- assert cache_stats.num_hits == 0
465
-
466
- # start with fresh client and FileCache instance to test FileCache initialization with pre-existing files
467
- cl = pxt.Client(reload=True)
468
- # is there a better way to do this?
469
- FileCache._instance = None
470
- t = cl.get_table('test')
471
- _ = t.show(0)
472
- cache_stats = FileCache.get().stats()
473
- assert cache_stats.num_requests == len(urls)
474
- assert cache_stats.num_hits == len(urls)
475
-
476
- # dropping the table also clears the file cache
477
- cl.drop_table('test')
478
- cache_stats = FileCache.get().stats()
479
- assert cache_stats.total_size == 0
480
-
481
- def test_video_url(self, test_client: pxt.Client) -> None:
482
- skip_test_if_not_installed('boto3')
483
- cl = test_client
484
- schema = {
485
- 'payload': IntType(nullable=False),
486
- 'video': VideoType(nullable=False),
487
- }
488
- tbl = cl.create_table('test', schema)
489
- url = 's3://multimedia-commons/data/videos/mp4/ffe/ff3/ffeff3c6bf57504e7a6cecaff6aefbc9.mp4'
490
- tbl.insert(payload=1, video=url)
491
- row = tbl.select(tbl.video.fileurl, tbl.video.localpath).collect()[0]
492
- assert row['video_fileurl'] == url
493
- # row[1] contains valid path to an mp4 file
494
- local_path = row['video_localpath']
495
- assert os.path.exists(local_path) and os.path.isfile(local_path)
496
- cap = cv2.VideoCapture(local_path)
497
- # TODO: this isn't sufficient to determine that this is actually a video, rather than an image
498
- assert cap.isOpened()
499
- cap.release()
500
-
501
- def test_create_video_table(self, test_client: pxt.Client) -> None:
502
- skip_test_if_not_installed('boto3')
503
- cl = test_client
504
- tbl = cl.create_table(
505
- 'test_tbl',
506
- {'payload': IntType(nullable=False), 'video': VideoType(nullable=True)})
507
- args = {'video': tbl.video, 'fps': 0}
508
- view = cl.create_view('test_view', tbl, iterator_class=FrameIterator, iterator_args=args)
509
- view.add_column(c1=view.frame.rotate(30), stored=True)
510
- view.add_column(c2=view.c1.rotate(40), stored=False)
511
- view.add_column(c3=view.c2.rotate(50), stored=True)
512
- # a non-materialized column that refers to another non-materialized column
513
- view.add_column(c4=view.c2.rotate(60), stored=False)
514
-
515
- # cols computed with window functions are stored by default
516
- view.add_column(c5=self.window_fn(view.frame_idx, 1, group_by=view.video))
517
-
518
- # reload to make sure that metadata gets restored correctly
519
- cl = pxt.Client(reload=True)
520
- tbl = cl.get_table('test_tbl')
521
- view = cl.get_table('test_view')
522
- # we're inserting only a single row and the video column is not in position 0
523
- url = 's3://multimedia-commons/data/videos/mp4/ffe/ff3/ffeff3c6bf57504e7a6cecaff6aefbc9.mp4'
524
- status = tbl.insert(payload=1, video=url)
525
- assert status.num_excs == 0
526
- # * 2: we have 2 stored img cols
527
- assert MediaStore.count(view.get_id()) == view.count() * 2
528
- # also insert a local file
529
- tbl.insert(payload=1, video=get_video_files()[0])
530
- assert MediaStore.count(view.get_id()) == view.count() * 2
531
-
532
- # TODO: test inserting Nulls
533
- #status = tbl.insert(payload=1, video=None)
534
- #assert status.num_excs == 0
535
-
536
- # revert() clears stored images
537
- tbl.revert()
538
- tbl.revert()
539
- assert MediaStore.count(view.get_id()) == 0
540
-
541
- with pytest.raises(excs.Error):
542
- # can't drop frame col
543
- view.drop_column('frame')
544
- with pytest.raises(excs.Error):
545
- # can't drop frame_idx col
546
- view.drop_column('frame_idx')
547
-
548
- # drop() clears stored images and the cache
549
- tbl.insert(payload=1, video=get_video_files()[0])
550
- with pytest.raises(excs.Error) as exc_info:
551
- cl.drop_table('test_tbl')
552
- assert 'has dependents: test_view' in str(exc_info.value)
553
- cl.drop_table('test_view')
554
- cl.drop_table('test_tbl')
555
- assert MediaStore.count(view.get_id()) == 0
556
-
557
- def test_insert_nulls(self, test_client: pxt.Client) -> None:
558
- cl = test_client
559
- schema = {
560
- 'c1': StringType(nullable=True),
561
- 'c2': IntType(nullable=True),
562
- 'c3': FloatType(nullable=True),
563
- 'c4': BoolType(nullable=True),
564
- 'c5': ArrayType((2, 3), dtype=IntType(), nullable=True),
565
- 'c6': JsonType(nullable=True),
566
- 'c7': ImageType(nullable=True),
567
- 'c8': VideoType(nullable=True),
568
- }
569
- t = cl.create_table('test1', schema)
570
- status = t.insert(c1='abc')
571
- assert status.num_rows == 1
572
- assert status.num_excs == 0
573
-
574
- def test_insert(self, test_client: pxt.Client) -> None:
575
- cl = test_client
576
- schema = {
577
- 'c1': StringType(nullable=False),
578
- 'c2': IntType(nullable=False),
579
- 'c3': FloatType(nullable=False),
580
- 'c4': BoolType(nullable=False),
581
- 'c5': ArrayType((2, 3), dtype=IntType(), nullable=False),
582
- 'c6': JsonType(nullable=False),
583
- 'c7': ImageType(nullable=False),
584
- 'c8': VideoType(nullable=False),
585
- }
586
- t = cl.create_table('test1', schema)
587
- rows = create_table_data(t)
588
- status = t.insert(rows)
589
- assert status.num_rows == len(rows)
590
- assert status.num_excs == 0
591
-
592
- # alternate (kwargs) insert syntax
593
- status = t.insert(
594
- c1='string',
595
- c2=91,
596
- c3=1.0,
597
- c4=True,
598
- c5=np.ones((2, 3), dtype=np.dtype(np.int64)),
599
- c6={'key': 'val'},
600
- c7=get_image_files()[0],
601
- c8=get_video_files()[0]
602
- )
603
- assert status.num_rows == 1
604
- assert status.num_excs == 0
605
-
606
- # empty input
607
- with pytest.raises(excs.Error) as exc_info:
608
- t.insert([])
609
- assert 'empty' in str(exc_info.value)
610
-
611
- # missing column
612
- with pytest.raises(excs.Error) as exc_info:
613
- # drop first column
614
- col_names = list(rows[0].keys())[1:]
615
- new_rows = [{col_name: row[col_name] for col_name in col_names} for row in rows]
616
- t.insert(new_rows)
617
- assert 'Missing' in str(exc_info.value)
618
-
619
- # incompatible schema
620
- for (col_name, col_type), value_col_name in zip(schema.items(), ['c2', 'c3', 'c5', 'c5', 'c6', 'c7', 'c2', 'c2']):
621
- cl.drop_table('test1', ignore_errors=True)
622
- t = cl.create_table('test1', {col_name: col_type})
623
- with pytest.raises(excs.Error) as exc_info:
624
- t.insert({col_name: r[value_col_name]} for r in rows)
625
- assert 'expected' in str(exc_info.value).lower()
626
-
627
- # rows not list of dicts
628
- cl.drop_table('test1', ignore_errors=True)
629
- t = cl.create_table('test1', {'c1': StringType()})
630
- with pytest.raises(excs.Error) as exc_info:
631
- t.insert(['1'])
632
- assert 'list of dictionaries' in str(exc_info.value)
633
-
634
- # bad null value
635
- cl.drop_table('test1', ignore_errors=True)
636
- t = cl.create_table('test1', {'c1': StringType(nullable=False)})
637
- with pytest.raises(excs.Error) as exc_info:
638
- t.insert(c1=None)
639
- assert 'expected non-None' in str(exc_info.value)
640
-
641
- # bad array literal
642
- cl.drop_table('test1', ignore_errors=True)
643
- t = cl.create_table('test1', {'c5': ArrayType((2, 3), dtype=IntType(), nullable=False)})
644
- with pytest.raises(excs.Error) as exc_info:
645
- t.insert(c5=np.ndarray((3, 2)))
646
- assert 'expected ndarray((2, 3)' in str(exc_info.value)
647
-
648
- def test_insert_string_with_null(self, test_client: pxt.Client) -> None:
649
- cl = test_client
650
- t = cl.create_table('test', {'c1': StringType()})
651
-
652
- t.insert([{'c1': 'this is a python\x00string'}])
653
- assert t.count() == 1
654
- for tup in t.df().collect():
655
- assert tup['c1'] == 'this is a python string'
656
-
657
- def test_query(self, test_client: pxt.Client) -> None:
658
- skip_test_if_not_installed('boto3')
659
- cl = test_client
660
- col_names = ['c1', 'c2', 'c3', 'c4', 'c5']
661
- t = make_tbl(cl, 'test', col_names)
662
- rows = create_table_data(t)
663
- t.insert(rows)
664
- _ = t.show(n=0)
665
-
666
- # test querying existing table
667
- cl = pxt.Client(reload=True)
668
- t2 = cl.get_table('test')
669
- _ = t2.show(n=0)
670
-
671
- def test_batch_update(self, test_tbl: pxt.Table) -> None:
672
- t = test_tbl
673
- validate_update_status(
674
- t.batch_update([{'c1': '1', 'c2': 1}, {'c1': '2', 'c2': 2}]),
675
- expected_rows=2)
676
- assert t.where(t.c2 == 1).collect()[0]['c1'] == '1'
677
- assert t.where(t.c2 == 2).collect()[0]['c1'] == '2'
678
- validate_update_status(
679
- t.batch_update([{'c1': 'one', '_rowid': (1,)}, {'c1': 'two', '_rowid': (2,)}]),
680
- expected_rows=2)
681
- assert t.where(t.c2 == 1).collect()[0]['c1'] == 'one'
682
- assert t.where(t.c2 == 2).collect()[0]['c1'] == 'two'
683
-
684
- cl = pxt.Client()
685
- # test composite primary key
686
- schema = {'c1': StringType(), 'c2': IntType(), 'c3': FloatType()}
687
- t = cl.create_table('composite', schema=schema, primary_key=['c1', 'c2'])
688
- rows = [{'c1': str(i), 'c2': i, 'c3': float(i)} for i in range(10)]
689
- validate_update_status(t.insert(rows), expected_rows=10)
690
-
691
- validate_update_status(
692
- t.batch_update([{'c1': '1', 'c2': 1, 'c3': 2.0}, {'c1': '2', 'c2': 2, 'c3': 3.0}]),
693
- expected_rows=2)
694
-
695
- with pytest.raises(excs.Error) as exc_info:
696
- # can't mix _rowid with primary key
697
- _ = t.batch_update([{'c1': '1', 'c2': 1, 'c3': 2.0, '_rowid': (1,)}])
698
- assert 'c1 is a primary key column' in str(exc_info.value).lower()
699
-
700
- with pytest.raises(excs.Error) as exc_info:
701
- # bad literal
702
- _ = t.batch_update([{'c2': 1, 'c3': 'a'}])
703
- assert "'a' is not a valid literal" in str(exc_info.value).lower()
704
-
705
- with pytest.raises(excs.Error) as exc_info:
706
- # missing primary key column
707
- t.batch_update([{'c1': '1', 'c3': 2.0}])
708
- assert 'primary key columns (c2) missing' in str(exc_info.value).lower()
709
-
710
- # table without primary key
711
- t2 = cl.create_table('no_pk', schema=schema)
712
- validate_update_status(t2.insert(rows), expected_rows=10)
713
- with pytest.raises(excs.Error) as exc_info:
714
- _ = t2.batch_update([{'c1': '1', 'c2': 1, 'c3': 2.0}])
715
- assert 'must have primary key for batch update' in str(exc_info.value).lower()
716
-
717
- # updating with _rowid still works
718
- validate_update_status(
719
- t2.batch_update([{'c1': 'one', '_rowid': (1,)}, {'c1': 'two', '_rowid': (2,)}]),
720
- expected_rows=2)
721
- assert t2.where(t2.c2 == 1).collect()[0]['c1'] == 'one'
722
- assert t2.where(t2.c2 == 2).collect()[0]['c1'] == 'two'
723
- with pytest.raises(AssertionError):
724
- # some rows are missing rowids
725
- _ = t2.batch_update([{'c1': 'one', '_rowid': (1,)}, {'c1': 'two'}])
726
-
727
- def test_update(self, test_tbl: pxt.Table, small_img_tbl) -> None:
728
- t = test_tbl
729
- # update every type with a literal
730
- test_cases = [
731
- ('c1', 'new string'),
732
- # TODO: ('c1n', None),
733
- ('c3', -1.0),
734
- ('c4', True),
735
- ('c5', datetime.datetime.now()),
736
- ('c6', [{'x': 1, 'y': 2}]),
737
- ]
738
- count = t.count()
739
- for col_name, literal in test_cases:
740
- status = t.update({col_name: literal}, where=t.c3 < 10.0, cascade=False)
741
- assert status.num_rows == 10
742
- assert status.updated_cols == [f'{t.get_name()}.{col_name}']
743
- assert t.count() == count
744
- t.revert()
745
-
746
- # exchange two columns
747
- t.add_column(float_col=FloatType(nullable=True))
748
- t.update({'float_col': 1.0})
749
- float_col_vals = t.select(t.float_col).collect().to_pandas()['float_col']
750
- c3_vals = t.select(t.c3).collect().to_pandas()['c3']
751
- assert np.all(float_col_vals == pd.Series([1.0] * t.count()))
752
- t.update({'c3': t.float_col, 'float_col': t.c3})
753
- assert np.all(t.select(t.c3).collect().to_pandas()['c3'] == float_col_vals)
754
- assert np.all(t.select(t.float_col).collect().to_pandas()['float_col'] == c3_vals)
755
- t.revert()
756
-
757
- # update column that is used in computed cols
758
- t.add_column(computed1=t.c3 + 1)
759
- t.add_column(computed2=t.computed1 + 1)
760
- t.add_column(computed3=t.c3 + 3)
761
-
762
- # cascade=False
763
- computed1 = t.order_by(t.computed1).show(0).to_pandas()['computed1']
764
- computed2 = t.order_by(t.computed2).show(0).to_pandas()['computed2']
765
- computed3 = t.order_by(t.computed3).show(0).to_pandas()['computed3']
766
- assert t.where(t.c3 < 10.0).count() == 10
767
- assert t.where(t.c3 == 10.0).count() == 1
768
- # update to a value that also satisfies the where clause
769
- status = t.update({'c3': 0.0}, where=t.c3 < 10.0, cascade=False)
770
- assert status.num_rows == 10
771
- assert status.updated_cols == ['test_tbl.c3']
772
- assert t.where(t.c3 < 10.0).count() == 10
773
- assert t.where(t.c3 == 0.0).count() == 10
774
- # computed cols are not updated
775
- assert np.all(t.order_by(t.computed1).show(0).to_pandas()['computed1'] == computed1)
776
- assert np.all(t.order_by(t.computed2).show(0).to_pandas()['computed2'] == computed2)
777
- assert np.all(t.order_by(t.computed3).show(0).to_pandas()['computed3'] == computed3)
778
-
779
- # revert, then verify that we're back to where we started
780
- cl = pxt.Client(reload=True)
781
- t = cl.get_table(t.get_name())
782
- t.revert()
783
- assert t.where(t.c3 < 10.0).count() == 10
784
- assert t.where(t.c3 == 10.0).count() == 1
785
-
786
- # cascade=True
787
- status = t.update({'c3': 0.0}, where=t.c3 < 10.0, cascade=True)
788
- assert status.num_rows == 10
789
- assert set(status.updated_cols) == \
790
- set(['test_tbl.c3', 'test_tbl.computed1', 'test_tbl.computed2', 'test_tbl.computed3'])
791
- assert t.where(t.c3 < 10.0).count() == 10
792
- assert t.where(t.c3 == 0.0).count() == 10
793
- assert np.all(t.order_by(t.computed1).show(0).to_pandas()['computed1'][:10] == pd.Series([1.0] * 10))
794
- assert np.all(t.order_by(t.computed2).show(0).to_pandas()['computed2'][:10] == pd.Series([2.0] * 10))
795
- assert np.all(t.order_by(t.computed3).show(0).to_pandas()['computed3'][:10] == pd.Series([3.0] * 10))
796
-
797
- # bad update spec
798
- with pytest.raises(excs.Error) as excinfo:
799
- t.update({1: 1})
800
- assert 'dict key' in str(excinfo.value)
801
-
802
- # unknown column
803
- with pytest.raises(excs.Error) as excinfo:
804
- t.update({'unknown': 1})
805
- assert 'unknown unknown' in str(excinfo.value)
806
-
807
- # incompatible type
808
- with pytest.raises(excs.Error) as excinfo:
809
- t.update({'c1': 1})
810
- assert 'not compatible' in str(excinfo.value)
811
-
812
- # can't update primary key
813
- with pytest.raises(excs.Error) as excinfo:
814
- t.update({'c2': 1})
815
- assert 'primary key' in str(excinfo.value)
816
-
817
- # can't update computed column
818
- with pytest.raises(excs.Error) as excinfo:
819
- t.update({'computed1': 1})
820
- assert 'is computed' in str(excinfo.value)
821
-
822
- # non-expr
823
- with pytest.raises(excs.Error) as excinfo:
824
- t.update({'c3': lambda c3: math.sqrt(c3)})
825
- assert 'not a recognized' in str(excinfo.value)
826
-
827
- # non-Predicate filter
828
- with pytest.raises(excs.Error) as excinfo:
829
- t.update({'c3': 1.0}, where=lambda c2: c2 == 10)
830
- assert 'Predicate' in str(excinfo.value)
831
-
832
- img_t = small_img_tbl
833
-
834
- # can't update image col
835
- with pytest.raises(excs.Error) as excinfo:
836
- img_t.update({'img': 17}, where=img_t.img.nearest('car'))
837
- assert 'has type image' in str(excinfo.value)
838
-
839
- # similarity search is not supported
840
- with pytest.raises(excs.Error) as excinfo:
841
- img_t.update({'split': 'train'}, where=img_t.img.nearest('car'))
842
- assert 'nearest()' in str(excinfo.value)
843
-
844
- # filter not expressible in SQL
845
- with pytest.raises(excs.Error) as excinfo:
846
- img_t.update({'split': 'train'}, where=img_t.img.width > 100)
847
- assert 'not expressible' in str(excinfo.value)
848
-
849
- def test_cascading_update(self, test_tbl: pxt.InsertableTable) -> None:
850
- t = test_tbl
851
- t.add_column(d1=t.c3 - 1)
852
- # add column that can be updated
853
- t.add_column(c10=FloatType(nullable=True))
854
- t.update({'c10': t.c3})
855
- # computed column that depends on two columns: exercise duplicate elimination during query construction
856
- t.add_column(d2=t.c3 - t.c10)
857
- r1 = t.where(t.c2 < 5).select(t.c3 + 1.0, t.c10 - 1.0, t.c3, 2.0).order_by(t.c2).show(0)
858
- t.update({'c4': True, 'c3': t.c3 + 1.0, 'c10': t.c10 - 1.0}, where=t.c2 < 5, cascade=True)
859
- r2 = t.where(t.c2 < 5).select(t.c3, t.c10, t.d1, t.d2).order_by(t.c2).show(0)
860
- assert_resultset_eq(r1, r2)
861
-
862
- def test_delete(self, test_tbl: pxt.Table, small_img_tbl) -> None:
863
- t = test_tbl
864
-
865
- cnt = t.where(t.c3 < 10.0).count()
866
- assert cnt == 10
867
- cnt = t.where(t.c3 == 10.0).count()
868
- assert cnt == 1
869
- status = t.delete(where=t.c3 < 10.0)
870
- assert status.num_rows == 10
871
- cnt = t.where(t.c3 < 10.0).count()
872
- assert cnt == 0
873
- cnt = t.where(t.c3 == 10.0).count()
874
- assert cnt == 1
875
-
876
- # revert, then verify that we're back where we started
877
- cl = pxt.Client(reload=True)
878
- t = cl.get_table(t.get_name())
879
- t.revert()
880
- cnt = t.where(t.c3 < 10.0).count()
881
- assert cnt == 10
882
- cnt = t.where(t.c3 == 10.0).count()
883
- assert cnt == 1
884
-
885
- # non-Predicate filter
886
- with pytest.raises(excs.Error) as excinfo:
887
- t.delete(where=lambda c2: c2 == 10)
888
- assert 'Predicate' in str(excinfo.value)
889
-
890
- img_t = small_img_tbl
891
- # similarity search is not supported
892
- with pytest.raises(excs.Error) as excinfo:
893
- img_t.delete(where=img_t.img.nearest('car'))
894
- assert 'nearest()' in str(excinfo.value)
895
-
896
- # filter not expressible in SQL
897
- with pytest.raises(excs.Error) as excinfo:
898
- img_t.delete(where=img_t.img.width > 100)
899
- assert 'not expressible' in str(excinfo.value)
900
-
901
- def test_computed_cols(self, test_client: pxt.client) -> None:
902
- cl = test_client
903
- schema = {
904
- 'c1': IntType(nullable=False),
905
- 'c2': FloatType(nullable=False),
906
- 'c3': JsonType(nullable=False),
907
- }
908
- t : pxt.InsertableTable = cl.create_table('test', schema)
909
- status = t.add_column(c4=t.c1 + 1)
910
- assert status.num_excs == 0
911
- status = t.add_column(c5=t.c4 + 1)
912
- assert status.num_excs == 0
913
- status = t.add_column(c6=t.c1 / t.c2)
914
- assert status.num_excs == 0
915
- status = t.add_column(c7=t.c6 * t.c2)
916
- assert status.num_excs == 0
917
- status = t.add_column(c8=t.c3.detections['*'].bounding_box)
918
- assert status.num_excs == 0
919
- status = t.add_column(c9=lambda c2: math.sqrt(c2), type=FloatType())
920
- assert status.num_excs == 0
921
-
922
- # unstored cols that compute window functions aren't currently supported
923
- with pytest.raises((excs.Error)):
924
- t.add_column(c10=ptf.sum(t.c1, group_by=t.c1), stored=False)
925
-
926
- # Column.dependent_cols are computed correctly
927
- assert len(t.c1.col.dependent_cols) == 2
928
- assert len(t.c2.col.dependent_cols) == 3
929
- assert len(t.c3.col.dependent_cols) == 1
930
- assert len(t.c4.col.dependent_cols) == 1
931
- assert len(t.c5.col.dependent_cols) == 0
932
- assert len(t.c6.col.dependent_cols) == 1
933
- assert len(t.c7.col.dependent_cols) == 0
934
- assert len(t.c8.col.dependent_cols) == 0
935
-
936
- rows = create_table_data(t, ['c1', 'c2', 'c3'], num_rows=10)
937
- t.insert(rows)
938
- _ = t.show()
939
-
940
- # not allowed to pass values for computed cols
941
- with pytest.raises(excs.Error):
942
- rows2 = create_table_data(t, ['c1', 'c2', 'c3', 'c4'], num_rows=10)
943
- t.insert(rows2)
944
-
945
- # test loading from store
946
- cl = pxt.Client(reload=True)
947
- t = cl.get_table('test')
948
- assert len(t.columns()) == len(t.columns())
949
- for i in range(len(t.columns())):
950
- if t.columns()[i].value_expr is not None:
951
- assert t.columns()[i].value_expr.equals(t.columns()[i].value_expr)
952
-
953
- # make sure we can still insert data and that computed cols are still set correctly
954
- status = t.insert(rows)
955
- assert status.num_excs == 0
956
- res = t.show(0)
957
- tbl_df = t.show(0).to_pandas()
958
-
959
- # can't drop c4: c5 depends on it
960
- with pytest.raises(excs.Error):
961
- t.drop_column('c4')
962
- t.drop_column('c5')
963
- # now it works
964
- t.drop_column('c4')
965
-
966
- def test_expr_udf_computed_cols(self, test_client: pxt.Client) -> None:
967
- cl = test_client
968
- t = cl.create_table('test', {'c1': IntType(nullable=False)})
969
- rows = [{'c1': i} for i in range(100)]
970
- status = t.insert(rows)
971
- assert status.num_rows == len(rows)
972
- status = t.add_column(c2=t.c1 + 1)
973
- assert status.num_excs == 0
974
- # call with positional arg
975
- status = t.add_column(c3=self.add1(t.c1))
976
- assert status.num_excs == 0
977
- # call with keyword arg
978
- status = t.add_column(c4=self.add1(a=t.c1))
979
- assert status.num_excs == 0
980
-
981
- # TODO: how to verify the output?
982
- describe_output = t.__repr__()
983
- # 'add1' didn't get swallowed/the expr udf is still visible in the column definition
984
- assert 'add1' in describe_output
985
-
986
- def check(t: pxt.Table) -> None:
987
- assert_resultset_eq(
988
- t.select(t.c1 + 1).order_by(t.c1).collect(),
989
- t.select(t.c2).order_by(t.c1).collect())
990
- assert_resultset_eq(
991
- t.select(t.c1 + 1).order_by(t.c1).collect(),
992
- t.select(t.c3).order_by(t.c1).collect())
993
-
994
- check(t)
995
- # test loading from store
996
- cl = pxt.Client(reload=True)
997
- t = cl.get_table('test')
998
- check(t)
999
-
1000
- # make sure we can still insert data and that computed cols are still set correctly
1001
- status = t.insert(rows)
1002
- assert status.num_excs == 0
1003
- check(t)
1004
-
1005
- def test_computed_col_exceptions(self, test_client: pxt.Client, test_tbl: catalog.Table) -> None:
1006
- cl = test_client
1007
-
1008
- # exception during insert()
1009
- schema = {'c2': IntType(nullable=False)}
1010
- rows = list(test_tbl.select(test_tbl.c2).collect())
1011
- t = cl.create_table('test_insert', schema)
1012
- status = t.add_column(add1=self.f2(self.f1(t.c2)))
1013
- assert status.num_excs == 0
1014
- status = t.insert(rows, fail_on_exception=False)
1015
- assert status.num_excs == 10
1016
- assert 'test_insert.add1' in status.cols_with_excs
1017
- assert t.where(t.add1.errortype != None).count() == 10
1018
-
1019
- # exception during add_column()
1020
- t = cl.create_table('test_add_column', schema)
1021
- status = t.insert(rows)
1022
- assert status.num_rows == 100
1023
- assert status.num_excs == 0
1024
- status = t.add_column(add1=self.f2(self.f1(t.c2)))
1025
- assert status.num_excs == 10
1026
- assert 'test_add_column.add1' in status.cols_with_excs
1027
- assert t.where(t.add1.errortype != None).count() == 10
1028
-
1029
- def _test_computed_img_cols(self, t: catalog.Table, stores_img_col: bool) -> None:
1030
- rows = read_data_file('imagenette2-160', 'manifest.csv', ['img'])
1031
- rows = [{'img': r['img']} for r in rows[:20]]
1032
- status = t.insert(rows)
1033
- assert status.num_rows == 20
1034
- _ = t.count()
1035
- _ = t.show()
1036
- assert MediaStore.count(t.get_id()) == t.count() * stores_img_col
1037
-
1038
- # test loading from store
1039
- cl = pxt.Client(reload=True)
1040
- t2 = cl.get_table(t.get_name())
1041
- assert len(t.columns()) == len(t2.columns())
1042
- for i in range(len(t.columns())):
1043
- if t.columns()[i].value_expr is not None:
1044
- assert t.columns()[i].value_expr.equals(t2.columns()[i].value_expr)
1045
-
1046
- # make sure we can still insert data and that computed cols are still set correctly
1047
- t2.insert(rows)
1048
- assert MediaStore.count(t2.get_id()) == t2.count() * stores_img_col
1049
- res = t2.show(0)
1050
- tbl_df = t2.show(0).to_pandas()
1051
-
1052
- # revert also removes computed images
1053
- t2.revert()
1054
- assert MediaStore.count(t2.get_id()) == t2.count() * stores_img_col
1055
-
1056
- @pxt.udf(return_type=ImageType(), param_types=[ImageType()])
1057
- def img_fn_with_exc(img: PIL.Image.Image) -> PIL.Image.Image:
1058
- raise RuntimeError
1059
-
1060
- def test_computed_img_cols(self, test_client: pxt.Client) -> None:
1061
- cl = test_client
1062
- schema = {'img': ImageType(nullable=False)}
1063
- t = cl.create_table('test', schema)
1064
- t.add_column(c2=t.img.width)
1065
- # c3 is not stored by default
1066
- t.add_column(c3=t.img.rotate(90))
1067
- self._test_computed_img_cols(t, stores_img_col=False)
1068
-
1069
- t = cl.create_table('test2', schema)
1070
- # c3 is now stored
1071
- t.add_column(c3=t.img.rotate(90), stored=True)
1072
- self._test_computed_img_cols(t, stores_img_col=True)
1073
- _ = t[t.c3.errortype].show(0)
1074
-
1075
- # computed img col with exceptions
1076
- t = cl.create_table('test3', schema)
1077
- t.add_column(c3=self.img_fn_with_exc(t.img), stored=True)
1078
- rows = read_data_file('imagenette2-160', 'manifest.csv', ['img'])
1079
- rows = [{'img': r['img']} for r in rows[:20]]
1080
- t.insert(rows, fail_on_exception=False)
1081
- _ = t[t.c3.errortype].show(0)
1082
-
1083
- def test_computed_window_fn(self, test_client: pxt.Client, test_tbl: catalog.Table) -> None:
1084
- cl = test_client
1085
- t = test_tbl
1086
- # backfill
1087
- t.add_column(c9=ptf.sum(t.c2, group_by=t.c4, order_by=t.c3))
1088
-
1089
- schema = {
1090
- 'c2': IntType(nullable=False),
1091
- 'c3': FloatType(nullable=False),
1092
- 'c4': BoolType(nullable=False),
1093
- }
1094
- new_t = cl.create_table('insert_test', schema)
1095
- new_t.add_column(c5=lambda c2: c2 * c2, type=IntType())
1096
- new_t.add_column(c6=ptf.sum(new_t.c5, group_by=new_t.c4, order_by=new_t.c3))
1097
- rows = list(t.select(t.c2, t.c4, t.c3).collect())
1098
- new_t.insert(rows)
1099
- _ = new_t.show(0)
1100
-
1101
- def test_revert(self, test_client: pxt.Client) -> None:
1102
- cl = test_client
1103
- t1 = make_tbl(cl, 'test1', ['c1', 'c2'])
1104
- assert t1.version() == 0
1105
- rows1 = create_table_data(t1)
1106
- t1.insert(rows1)
1107
- assert t1.count() == len(rows1)
1108
- assert t1.version() == 1
1109
- rows2 = create_table_data(t1)
1110
- t1.insert(rows2)
1111
- assert t1.count() == len(rows1) + len(rows2)
1112
- assert t1.version() == 2
1113
- t1.revert()
1114
- assert t1.count() == len(rows1)
1115
- assert t1.version() == 1
1116
- t1.insert(rows2)
1117
- assert t1.count() == len(rows1) + len(rows2)
1118
- assert t1.version() == 2
1119
-
1120
- # can't revert past version 0
1121
- t1.revert()
1122
- t1.revert()
1123
- with pytest.raises(excs.Error) as excinfo:
1124
- t1.revert()
1125
- assert 'version 0' in str(excinfo.value)
1126
-
1127
- def test_add_column(self, test_tbl: catalog.Table) -> None:
1128
- t = test_tbl
1129
- num_orig_cols = len(t.columns())
1130
- t.add_column(add1=pxt.IntType(nullable=True))
1131
- assert len(t.columns()) == num_orig_cols + 1
1132
-
1133
- with pytest.raises(excs.Error) as exc_info:
1134
- _ = t.add_column(add2=pxt.IntType(nullable=False))
1135
- assert 'cannot add non-nullable' in str(exc_info.value).lower()
1136
-
1137
- with pytest.raises(excs.Error) as exc_info:
1138
- _ = t.add_column(add2=pxt.IntType(nullable=False), add3=pxt.StringType())
1139
- assert 'requires exactly one keyword argument' in str(exc_info.value).lower()
1140
-
1141
- with pytest.raises(excs.Error) as exc_info:
1142
- _ = t.add_column(pos=pxt.StringType(nullable=True))
1143
- assert 'is reserved' in str(exc_info.value).lower()
1144
-
1145
- with pytest.raises(excs.Error) as exc_info:
1146
- _ = t.add_column(add2=pxt.IntType(nullable=False), type=pxt.StringType())
1147
- assert '"type" is redundant' in str(exc_info.value).lower()
1148
-
1149
- with pytest.raises(excs.Error) as exc_info:
1150
- _ = t.add_column(add2=[[1.0, 2.0], [3.0, 4.0]], type=pxt.StringType())
1151
- assert '"type" is redundant' in str(exc_info.value).lower()
1152
-
1153
- with pytest.raises(excs.Error) as exc_info:
1154
- _ = t.add_column(add2=pxt.IntType(nullable=False), stored=False)
1155
- assert 'stored=false only applies' in str(exc_info.value).lower()
1156
-
1157
- # duplicate name
1158
- with pytest.raises(excs.Error) as exc_info:
1159
- _ = t.add_column(c1=pxt.IntType())
1160
- assert 'duplicate column name' in str(exc_info.value).lower()
1161
-
1162
- # 'stored' kwarg only applies to computed image columns
1163
- with pytest.raises(excs.Error):
1164
- _ = t.add_column(c5=IntType(), stored=False)
1165
- with pytest.raises(excs.Error):
1166
- _ = t.add_column(c5=ImageType(), stored=False)
1167
- with pytest.raises(excs.Error):
1168
- _ = t.add_column(c5=(t.c2 + t.c3), stored=False)
1169
-
1170
- # make sure this is still true after reloading the metadata
1171
- cl = pxt.Client(reload=True)
1172
- t = cl.get_table(t.get_name())
1173
- assert len(t.columns()) == num_orig_cols + 1
1174
-
1175
- # revert() works
1176
- t.revert()
1177
- assert len(t.columns()) == num_orig_cols
1178
-
1179
- # make sure this is still true after reloading the metadata once more
1180
- cl = pxt.Client(reload=True)
1181
- t = cl.get_table(t.get_name())
1182
- assert len(t.columns()) == num_orig_cols
1183
-
1184
- def test_add_column_setitem(self, test_tbl: catalog.Table) -> None:
1185
- t = test_tbl
1186
- num_orig_cols = len(t.columns())
1187
- t['add1'] = pxt.IntType(nullable=True)
1188
- assert len(t.columns()) == num_orig_cols + 1
1189
- t['computed1'] = t.c2 + 1
1190
- assert len(t.columns()) == num_orig_cols + 2
1191
-
1192
- with pytest.raises(excs.Error) as exc_info:
1193
- _ = t['pos'] = pxt.StringType()
1194
- assert 'is reserved' in str(exc_info.value).lower()
1195
-
1196
- with pytest.raises(excs.Error) as exc_info:
1197
- _ = t[2] = pxt.StringType()
1198
- assert 'must be a string' in str(exc_info.value).lower()
1199
-
1200
- with pytest.raises(excs.Error) as exc_info:
1201
- _ = t['add 2'] = pxt.StringType()
1202
- assert 'invalid column name' in str(exc_info.value).lower()
1203
-
1204
- with pytest.raises(excs.Error) as exc_info:
1205
- _ = t['add2'] = {'value': t.c2 + 1, 'type': pxt.StringType()}
1206
- assert '"type" is redundant' in str(exc_info.value).lower()
1207
-
1208
- with pytest.raises(excs.Error) as exc_info:
1209
- _ = t['add2'] = {'value': pxt.IntType()}
1210
- assert 'value needs to be either' in str(exc_info.value).lower()
1211
-
1212
- with pytest.raises(excs.Error) as exc_info:
1213
- _ = t['add2'] = {'value': t.c2 + 1, 'stored': False}
1214
- assert 'stored=false only applies' in str(exc_info.value).lower()
1215
-
1216
- # duplicate name
1217
- with pytest.raises(excs.Error) as exc_info:
1218
- _ = t['c1'] = pxt.IntType()
1219
- assert 'duplicate column name' in str(exc_info.value).lower()
1220
-
1221
- # make sure this is still true after reloading the metadata
1222
- cl = pxt.Client(reload=True)
1223
- t = cl.get_table(t.get_name())
1224
- assert len(t.columns()) == num_orig_cols + 2
1225
-
1226
- # revert() works
1227
- t.revert()
1228
- t.revert()
1229
- assert len(t.columns()) == num_orig_cols
1230
-
1231
- # make sure this is still true after reloading the metadata once more
1232
- cl = pxt.Client(reload=True)
1233
- t = cl.get_table(t.get_name())
1234
- assert len(t.columns()) == num_orig_cols
1235
-
1236
- def test_drop_column(self, test_tbl: catalog.Table) -> None:
1237
- t = test_tbl
1238
- num_orig_cols = len(t.columns())
1239
- t.drop_column('c1')
1240
- assert len(t.columns()) == num_orig_cols - 1
1241
-
1242
- with pytest.raises(excs.Error):
1243
- t.drop_column('unknown')
1244
-
1245
- # make sure this is still true after reloading the metadata
1246
- cl = pxt.Client(reload=True)
1247
- t = cl.get_table(t.get_name())
1248
- assert len(t.columns()) == num_orig_cols - 1
1249
-
1250
- # revert() works
1251
- t.revert()
1252
- assert len(t.columns()) == num_orig_cols
1253
-
1254
- # make sure this is still true after reloading the metadata once more
1255
- cl = pxt.Client(reload=True)
1256
- t = cl.get_table(t.get_name())
1257
- assert len(t.columns()) == num_orig_cols
1258
-
1259
- def test_rename_column(self, test_tbl: catalog.Table) -> None:
1260
- t = test_tbl
1261
- num_orig_cols = len(t.columns())
1262
- t.rename_column('c1', 'c1_renamed')
1263
- assert len(t.columns()) == num_orig_cols
1264
-
1265
- def check_rename(t: pxt.Table, known: str, unknown: str) -> None:
1266
- with pytest.raises(AttributeError) as exc_info:
1267
- _ = t.select(t[unknown]).collect()
1268
- assert 'unknown' in str(exc_info.value).lower()
1269
- _ = t.select(t[known]).collect()
1270
-
1271
- check_rename(t, 'c1_renamed', 'c1')
1272
-
1273
- # unknown column
1274
- with pytest.raises(excs.Error):
1275
- t.rename_column('unknown', 'unknown_renamed')
1276
- # bad name
1277
- with pytest.raises(excs.Error):
1278
- t.rename_column('c2', 'bad name')
1279
- # existing name
1280
- with pytest.raises(excs.Error):
1281
- t.rename_column('c2', 'c3')
1282
-
1283
- # make sure this is still true after reloading the metadata
1284
- cl = pxt.Client(reload=True)
1285
- t = cl.get_table(t.get_name())
1286
- check_rename(t, 'c1_renamed', 'c1')
1287
-
1288
- # revert() works
1289
- _ = t.select(t.c1_renamed).collect()
1290
- t.revert()
1291
- _ = t.select(t.c1).collect()
1292
- #check_rename(t, 'c1', 'c1_renamed')
1293
-
1294
- # make sure this is still true after reloading the metadata once more
1295
- cl = pxt.Client(reload=True)
1296
- t = cl.get_table(t.get_name())
1297
- check_rename(t, 'c1', 'c1_renamed')
1298
-
1299
- def test_add_computed_column(self, test_tbl: catalog.Table) -> None:
1300
- t = test_tbl
1301
- status = t.add_column(add1=t.c2 + 10)
1302
- assert status.num_excs == 0
1303
- _ = t.show()
1304
-
1305
- # with exception in SQL
1306
- with pytest.raises(excs.Error):
1307
- t.add_column(add2=(t.c2 - 10) / (t.c3 - 10))
1308
-
1309
- # with exception in Python for c6.f2 == 10
1310
- status = t.add_column(add2=(t.c6.f2 - 10) / (t.c6.f2 - 10))
1311
- assert status.num_excs == 1
1312
- result = t[t.add2.errortype != None][t.c6.f2, t.add2, t.add2.errortype, t.add2.errormsg].show()
1313
- assert len(result) == 1
1314
-
1315
- # test case: exceptions in dependencies prevent execution of dependent exprs
1316
- status = t.add_column(add3=self.f2(self.f1(t.c2)))
1317
- assert status.num_excs == 10
1318
- result = t[t.add3.errortype != None][t.c2, t.add3, t.add3.errortype, t.add3.errormsg].show()
1319
- assert len(result) == 10
1320
-
1321
- def test_describe(self, test_tbl: catalog.Table) -> None:
1322
- t = test_tbl
1323
- fn = lambda c2: np.full((3, 4), c2)
1324
- t.add_column(computed1=fn, type=ArrayType((3, 4), dtype=IntType()))
1325
- t.describe()
1326
- t.comment = 'This is a comment.'
1327
- t.describe()
1328
-
1329
- # TODO: how to you check the output of these?
1330
- _ = repr(t)
1331
- _ = t._repr_html_()
1332
-
1333
- def test_common_col_names(self, test_client: pxt.Client) -> None:
1334
- """Make sure that commonly used column names don't collide with Table member vars"""
1335
- cl = test_client
1336
- schema = {'id': IntType(nullable=False), 'name': StringType(nullable=False)}
1337
- tbl = cl.create_table('test', schema)
1338
- status = tbl.insert({'id': id, 'name': str(id)} for id in range(10))
1339
- assert status.num_rows == 10
1340
- assert status.num_excs == 0
1341
- assert tbl.count() == 10
1342
- # we can create references to those column via __getattr__
1343
- _ = tbl.select(tbl.id, tbl.name).collect()