pixeltable 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +20 -9
- pixeltable/__version__.py +3 -0
- pixeltable/catalog/column.py +23 -7
- pixeltable/catalog/insertable_table.py +32 -19
- pixeltable/catalog/table.py +210 -20
- pixeltable/catalog/table_version.py +272 -111
- pixeltable/catalog/table_version_path.py +6 -1
- pixeltable/dataframe.py +184 -110
- pixeltable/datatransfer/__init__.py +1 -0
- pixeltable/datatransfer/label_studio.py +526 -0
- pixeltable/datatransfer/remote.py +113 -0
- pixeltable/env.py +213 -79
- pixeltable/exec/__init__.py +2 -1
- pixeltable/exec/data_row_batch.py +6 -7
- pixeltable/exec/expr_eval_node.py +28 -28
- pixeltable/exec/sql_scan_node.py +7 -6
- pixeltable/exprs/__init__.py +4 -3
- pixeltable/exprs/column_ref.py +11 -2
- pixeltable/exprs/comparison.py +39 -1
- pixeltable/exprs/data_row.py +7 -0
- pixeltable/exprs/expr.py +26 -19
- pixeltable/exprs/function_call.py +17 -18
- pixeltable/exprs/globals.py +14 -2
- pixeltable/exprs/image_member_access.py +9 -28
- pixeltable/exprs/in_predicate.py +96 -0
- pixeltable/exprs/inline_array.py +13 -11
- pixeltable/exprs/inline_dict.py +15 -13
- pixeltable/exprs/row_builder.py +7 -1
- pixeltable/exprs/similarity_expr.py +67 -0
- pixeltable/ext/functions/whisperx.py +30 -0
- pixeltable/ext/functions/yolox.py +16 -0
- pixeltable/func/__init__.py +0 -2
- pixeltable/func/aggregate_function.py +5 -2
- pixeltable/func/callable_function.py +57 -13
- pixeltable/func/expr_template_function.py +14 -3
- pixeltable/func/function.py +35 -4
- pixeltable/func/signature.py +5 -15
- pixeltable/func/udf.py +8 -12
- pixeltable/functions/fireworks.py +9 -4
- pixeltable/functions/huggingface.py +48 -5
- pixeltable/functions/openai.py +49 -11
- pixeltable/functions/pil/image.py +61 -64
- pixeltable/functions/together.py +32 -6
- pixeltable/functions/util.py +0 -43
- pixeltable/functions/video.py +46 -8
- pixeltable/globals.py +443 -0
- pixeltable/index/__init__.py +1 -0
- pixeltable/index/base.py +9 -2
- pixeltable/index/btree.py +54 -0
- pixeltable/index/embedding_index.py +91 -15
- pixeltable/io/__init__.py +4 -0
- pixeltable/io/globals.py +59 -0
- pixeltable/{utils → io}/hf_datasets.py +48 -17
- pixeltable/io/pandas.py +148 -0
- pixeltable/{utils → io}/parquet.py +58 -33
- pixeltable/iterators/__init__.py +1 -1
- pixeltable/iterators/base.py +8 -4
- pixeltable/iterators/document.py +225 -93
- pixeltable/iterators/video.py +16 -9
- pixeltable/metadata/__init__.py +8 -4
- pixeltable/metadata/converters/convert_12.py +3 -0
- pixeltable/metadata/converters/convert_13.py +41 -0
- pixeltable/metadata/converters/convert_14.py +13 -0
- pixeltable/metadata/converters/convert_15.py +29 -0
- pixeltable/metadata/converters/util.py +63 -0
- pixeltable/metadata/schema.py +12 -6
- pixeltable/plan.py +11 -24
- pixeltable/store.py +16 -23
- pixeltable/tool/create_test_db_dump.py +49 -14
- pixeltable/type_system.py +27 -58
- pixeltable/utils/coco.py +94 -0
- pixeltable/utils/documents.py +42 -12
- pixeltable/utils/http_server.py +70 -0
- pixeltable-0.2.7.dist-info/METADATA +137 -0
- pixeltable-0.2.7.dist-info/RECORD +126 -0
- {pixeltable-0.2.5.dist-info → pixeltable-0.2.7.dist-info}/WHEEL +1 -1
- pixeltable/client.py +0 -600
- pixeltable/exprs/image_similarity_predicate.py +0 -58
- pixeltable/func/batched_function.py +0 -53
- pixeltable/func/nos_function.py +0 -202
- pixeltable/tests/conftest.py +0 -171
- pixeltable/tests/ext/test_yolox.py +0 -21
- pixeltable/tests/functions/test_fireworks.py +0 -43
- pixeltable/tests/functions/test_functions.py +0 -60
- pixeltable/tests/functions/test_huggingface.py +0 -158
- pixeltable/tests/functions/test_openai.py +0 -162
- pixeltable/tests/functions/test_together.py +0 -112
- pixeltable/tests/test_audio.py +0 -65
- pixeltable/tests/test_catalog.py +0 -27
- pixeltable/tests/test_client.py +0 -21
- pixeltable/tests/test_component_view.py +0 -379
- pixeltable/tests/test_dataframe.py +0 -440
- pixeltable/tests/test_dirs.py +0 -107
- pixeltable/tests/test_document.py +0 -120
- pixeltable/tests/test_exprs.py +0 -802
- pixeltable/tests/test_function.py +0 -332
- pixeltable/tests/test_index.py +0 -138
- pixeltable/tests/test_migration.py +0 -44
- pixeltable/tests/test_nos.py +0 -54
- pixeltable/tests/test_snapshot.py +0 -231
- pixeltable/tests/test_table.py +0 -1343
- pixeltable/tests/test_transactional_directory.py +0 -42
- pixeltable/tests/test_types.py +0 -52
- pixeltable/tests/test_video.py +0 -159
- pixeltable/tests/test_view.py +0 -535
- pixeltable/tests/utils.py +0 -442
- pixeltable/utils/clip.py +0 -18
- pixeltable-0.2.5.dist-info/METADATA +0 -128
- pixeltable-0.2.5.dist-info/RECORD +0 -139
- {pixeltable-0.2.5.dist-info → pixeltable-0.2.7.dist-info}/LICENSE +0 -0
pixeltable/tests/test_table.py
DELETED
|
@@ -1,1343 +0,0 @@
|
|
|
1
|
-
import datetime
|
|
2
|
-
import math
|
|
3
|
-
import os
|
|
4
|
-
import random
|
|
5
|
-
from typing import List, Tuple
|
|
6
|
-
|
|
7
|
-
import PIL
|
|
8
|
-
import cv2
|
|
9
|
-
import numpy as np
|
|
10
|
-
import pandas as pd
|
|
11
|
-
import pathlib
|
|
12
|
-
import pytest
|
|
13
|
-
|
|
14
|
-
import pixeltable as pxt
|
|
15
|
-
import pixeltable.functions as ptf
|
|
16
|
-
from pixeltable import catalog
|
|
17
|
-
from pixeltable import exceptions as excs
|
|
18
|
-
from pixeltable.iterators import FrameIterator
|
|
19
|
-
from pixeltable.tests.utils import \
|
|
20
|
-
make_tbl, create_table_data, read_data_file, get_video_files, get_audio_files, get_image_files, get_documents, \
|
|
21
|
-
assert_resultset_eq, assert_hf_dataset_equal, make_test_arrow_table, validate_update_status
|
|
22
|
-
from pixeltable.tests.utils import skip_test_if_not_installed
|
|
23
|
-
from pixeltable.type_system import \
|
|
24
|
-
StringType, IntType, FloatType, TimestampType, ImageType, VideoType, JsonType, BoolType, ArrayType, AudioType, \
|
|
25
|
-
DocumentType
|
|
26
|
-
from pixeltable.utils.filecache import FileCache
|
|
27
|
-
from pixeltable.utils.media_store import MediaStore
|
|
28
|
-
|
|
29
|
-
class TestTable:
|
|
30
|
-
# exc for a % 10 == 0
|
|
31
|
-
@pxt.udf(return_type=FloatType(), param_types=[IntType()])
|
|
32
|
-
def f1(a: int) -> float:
|
|
33
|
-
return a / (a % 10)
|
|
34
|
-
|
|
35
|
-
# exception for a == None; this should not get triggered
|
|
36
|
-
@pxt.udf(return_type=FloatType(), param_types=[FloatType()])
|
|
37
|
-
def f2(a: float) -> float:
|
|
38
|
-
return a + 1
|
|
39
|
-
|
|
40
|
-
@pxt.expr_udf(param_types=[IntType(nullable=False)])
|
|
41
|
-
def add1(a: int) -> int:
|
|
42
|
-
return a + 1
|
|
43
|
-
|
|
44
|
-
@pxt.uda(
|
|
45
|
-
update_types=[IntType()], value_type=IntType(), requires_order_by=True,
|
|
46
|
-
allows_window=True)
|
|
47
|
-
class window_fn:
|
|
48
|
-
def __init__(self):
|
|
49
|
-
pass
|
|
50
|
-
def update(self, i: int) -> None:
|
|
51
|
-
pass
|
|
52
|
-
def value(self) -> int:
|
|
53
|
-
return 1
|
|
54
|
-
|
|
55
|
-
@pxt.expr_udf(param_types=[IntType(nullable=False)])
|
|
56
|
-
def add1(a: int) -> int:
|
|
57
|
-
return a + 1
|
|
58
|
-
|
|
59
|
-
def test_create(self, test_client: pxt.Client) -> None:
|
|
60
|
-
cl = test_client
|
|
61
|
-
cl.create_dir('dir1')
|
|
62
|
-
schema = {
|
|
63
|
-
'c1': StringType(nullable=False),
|
|
64
|
-
'c2': IntType(nullable=False),
|
|
65
|
-
'c3': FloatType(nullable=False),
|
|
66
|
-
'c4': TimestampType(nullable=False),
|
|
67
|
-
}
|
|
68
|
-
tbl = cl.create_table('test', schema)
|
|
69
|
-
_ = cl.create_table('dir1.test', schema)
|
|
70
|
-
|
|
71
|
-
with pytest.raises(excs.Error):
|
|
72
|
-
_ = cl.create_table('1test', schema)
|
|
73
|
-
with pytest.raises(excs.Error):
|
|
74
|
-
_ = cl.create_table('bad name', schema={'c1': StringType()})
|
|
75
|
-
with pytest.raises(excs.Error):
|
|
76
|
-
_ = cl.create_table('test', schema)
|
|
77
|
-
with pytest.raises(excs.Error):
|
|
78
|
-
_ = cl.create_table('dir2.test2', schema)
|
|
79
|
-
|
|
80
|
-
_ = cl.list_tables()
|
|
81
|
-
_ = cl.list_tables('dir1')
|
|
82
|
-
|
|
83
|
-
with pytest.raises(excs.Error):
|
|
84
|
-
_ = cl.list_tables('1dir')
|
|
85
|
-
with pytest.raises(excs.Error):
|
|
86
|
-
_ = cl.list_tables('dir2')
|
|
87
|
-
|
|
88
|
-
# test loading with new client
|
|
89
|
-
cl = pxt.Client(reload=True)
|
|
90
|
-
|
|
91
|
-
tbl = cl.get_table('test')
|
|
92
|
-
assert isinstance(tbl, catalog.InsertableTable)
|
|
93
|
-
tbl.add_column(c5=IntType())
|
|
94
|
-
tbl.drop_column('c1')
|
|
95
|
-
tbl.rename_column('c2', 'c17')
|
|
96
|
-
|
|
97
|
-
cl.move('test', 'test2')
|
|
98
|
-
|
|
99
|
-
cl.drop_table('test2')
|
|
100
|
-
cl.drop_table('dir1.test')
|
|
101
|
-
|
|
102
|
-
with pytest.raises(excs.Error):
|
|
103
|
-
cl.drop_table('test')
|
|
104
|
-
with pytest.raises(excs.Error):
|
|
105
|
-
cl.drop_table('dir1.test2')
|
|
106
|
-
with pytest.raises(excs.Error):
|
|
107
|
-
cl.drop_table('.test2')
|
|
108
|
-
|
|
109
|
-
def test_empty_table(self, test_client: pxt.Client) -> None:
|
|
110
|
-
cl = test_client
|
|
111
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
112
|
-
cl.create_table('empty_table', {})
|
|
113
|
-
assert 'Table schema is empty' in str(exc_info.value)
|
|
114
|
-
|
|
115
|
-
def test_table_attrs(self, test_client: pxt.Client) -> None:
|
|
116
|
-
cl = test_client
|
|
117
|
-
schema = {'c': StringType(nullable=False)}
|
|
118
|
-
num_retained_versions = 20
|
|
119
|
-
comment = "This is a table."
|
|
120
|
-
tbl = cl.create_table('test_table_attrs', schema, num_retained_versions=num_retained_versions, comment=comment)
|
|
121
|
-
assert tbl.num_retained_versions == num_retained_versions
|
|
122
|
-
assert tbl.comment == comment
|
|
123
|
-
new_num_retained_versions = 30
|
|
124
|
-
new_comment = "This is an updated table."
|
|
125
|
-
tbl.num_retained_versions = new_num_retained_versions
|
|
126
|
-
assert tbl.num_retained_versions == new_num_retained_versions
|
|
127
|
-
tbl.comment = new_comment
|
|
128
|
-
assert tbl.comment == new_comment
|
|
129
|
-
tbl.revert()
|
|
130
|
-
assert tbl.comment == comment
|
|
131
|
-
tbl.revert()
|
|
132
|
-
assert tbl.num_retained_versions == num_retained_versions
|
|
133
|
-
|
|
134
|
-
def test_import_parquet(self, test_client: pxt.Client, tmp_path: pathlib.Path) -> None:
|
|
135
|
-
skip_test_if_not_installed('pyarrow')
|
|
136
|
-
import pyarrow as pa
|
|
137
|
-
from pixeltable.utils.arrow import iter_tuples
|
|
138
|
-
|
|
139
|
-
parquet_dir = tmp_path / 'test_data'
|
|
140
|
-
parquet_dir.mkdir()
|
|
141
|
-
make_test_arrow_table(parquet_dir)
|
|
142
|
-
|
|
143
|
-
tab = test_client.import_parquet('test_parquet', parquet_path=str(parquet_dir))
|
|
144
|
-
assert 'test_parquet' in test_client.list_tables()
|
|
145
|
-
assert tab is not None
|
|
146
|
-
num_elts = tab.count()
|
|
147
|
-
arrow_tab: pa.Table = pa.parquet.read_table(str(parquet_dir))
|
|
148
|
-
assert num_elts == arrow_tab.num_rows
|
|
149
|
-
assert set(tab.column_names()) == set(arrow_tab.column_names)
|
|
150
|
-
|
|
151
|
-
result_set = tab.order_by(tab.c_id).collect()
|
|
152
|
-
column_types = tab.column_types()
|
|
153
|
-
|
|
154
|
-
for tup, arrow_tup in zip(result_set, iter_tuples(arrow_tab)):
|
|
155
|
-
assert tup['c_id'] == arrow_tup['c_id']
|
|
156
|
-
for col, val in tup.items():
|
|
157
|
-
if val is None:
|
|
158
|
-
assert arrow_tup[col] is None
|
|
159
|
-
continue
|
|
160
|
-
|
|
161
|
-
if column_types[col].is_array_type():
|
|
162
|
-
assert (val == arrow_tup[col]).all()
|
|
163
|
-
else:
|
|
164
|
-
assert val == arrow_tup[col]
|
|
165
|
-
|
|
166
|
-
def test_import_huggingface_dataset(self, test_client: pxt.Client, tmp_path: pathlib.Path) -> None:
|
|
167
|
-
skip_test_if_not_installed('datasets')
|
|
168
|
-
import datasets
|
|
169
|
-
|
|
170
|
-
test_cases = [
|
|
171
|
-
# { # includes a timestamp. 20MB for specific slice
|
|
172
|
-
# Disabled this test case because the download is failing, and it's not critical.
|
|
173
|
-
# 'dataset_name': 'c4',
|
|
174
|
-
# # see https://huggingface.co/datasets/allenai/c4/blob/main/realnewslike/c4-train.00000-of-00512.json.gz
|
|
175
|
-
# 'dataset': datasets.load_dataset(
|
|
176
|
-
# "allenai/c4",
|
|
177
|
-
# data_dir="realnewslike",
|
|
178
|
-
# data_files="c4-train.00000-of-00512.json.gz",
|
|
179
|
-
# split='train[:1000]',
|
|
180
|
-
# cache_dir=tmp_path
|
|
181
|
-
# ),
|
|
182
|
-
# },
|
|
183
|
-
{ # includes an embedding (array type), common in a few RAG datasets.
|
|
184
|
-
'dataset_name': 'cohere_wikipedia',
|
|
185
|
-
'dataset': datasets.load_dataset("Cohere/wikipedia-2023-11-embed-multilingual-v3",
|
|
186
|
-
data_dir='cr').select_columns(['url', 'title', 'text', 'emb']),
|
|
187
|
-
# a column named `_id` is not currently allowed by pixeltable rules,
|
|
188
|
-
# so filter out that column.
|
|
189
|
-
# cr subdir has a small number of rows, avoid running out of space in CI runner
|
|
190
|
-
# see https://huggingface.co/datasets/Cohere/wikipedia-2023-11-embed-multilingual-v3/tree/main/cr
|
|
191
|
-
'schema_override': {'emb': ArrayType((1024,), dtype=FloatType(), nullable=False)}
|
|
192
|
-
},
|
|
193
|
-
# example of dataset dictionary with multiple splits
|
|
194
|
-
{
|
|
195
|
-
'dataset_name': 'rotten_tomatoes',
|
|
196
|
-
'dataset': datasets.load_dataset("rotten_tomatoes"),
|
|
197
|
-
},
|
|
198
|
-
]
|
|
199
|
-
|
|
200
|
-
# test a column name for splits other than the default of 'split'
|
|
201
|
-
split_column_name = 'my_split_col'
|
|
202
|
-
for rec in test_cases:
|
|
203
|
-
dataset_name = rec['dataset_name']
|
|
204
|
-
hf_dataset = rec['dataset']
|
|
205
|
-
|
|
206
|
-
tab = test_client.import_huggingface_dataset(
|
|
207
|
-
dataset_name,
|
|
208
|
-
hf_dataset,
|
|
209
|
-
column_name_for_split=split_column_name,
|
|
210
|
-
schema_override=rec.get('schema_override', None),
|
|
211
|
-
)
|
|
212
|
-
if isinstance(hf_dataset, datasets.Dataset):
|
|
213
|
-
assert_hf_dataset_equal(hf_dataset, tab.df(), split_column_name)
|
|
214
|
-
elif isinstance(hf_dataset, datasets.DatasetDict):
|
|
215
|
-
assert tab.count() == sum(hf_dataset.num_rows.values())
|
|
216
|
-
assert split_column_name in tab.column_names()
|
|
217
|
-
|
|
218
|
-
for dataset_name in hf_dataset:
|
|
219
|
-
df = tab.where(tab.my_split_col == dataset_name)
|
|
220
|
-
assert_hf_dataset_equal(hf_dataset[dataset_name], df, split_column_name)
|
|
221
|
-
else:
|
|
222
|
-
assert False
|
|
223
|
-
|
|
224
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
225
|
-
test_client.import_huggingface_dataset('test', {})
|
|
226
|
-
assert 'type(dataset)' in str(exc_info.value)
|
|
227
|
-
|
|
228
|
-
def test_image_table(self, test_client: pxt.Client) -> None:
|
|
229
|
-
n_sample_rows = 20
|
|
230
|
-
cl = test_client
|
|
231
|
-
schema = {
|
|
232
|
-
'img': ImageType(nullable=False),
|
|
233
|
-
'category': StringType(nullable=False),
|
|
234
|
-
'split': StringType(nullable=False),
|
|
235
|
-
'img_literal': ImageType(nullable=False),
|
|
236
|
-
}
|
|
237
|
-
tbl = cl.create_table('test', schema)
|
|
238
|
-
assert(MediaStore.count(tbl.get_id()) == 0)
|
|
239
|
-
|
|
240
|
-
rows = read_data_file('imagenette2-160', 'manifest.csv', ['img'])
|
|
241
|
-
sample_rows = random.sample(rows, n_sample_rows)
|
|
242
|
-
|
|
243
|
-
# add literal image data and column
|
|
244
|
-
for r in rows:
|
|
245
|
-
with open(r['img'], 'rb') as f:
|
|
246
|
-
r['img_literal'] = f.read()
|
|
247
|
-
|
|
248
|
-
tbl.insert(sample_rows)
|
|
249
|
-
assert(MediaStore.count(tbl.get_id()) == n_sample_rows)
|
|
250
|
-
|
|
251
|
-
# compare img and img_literal
|
|
252
|
-
# TODO: make tbl.select(tbl.img == tbl.img_literal) work
|
|
253
|
-
tdf = tbl.select(tbl.img, tbl.img_literal).show()
|
|
254
|
-
pdf = tdf.to_pandas()
|
|
255
|
-
for tup in pdf.itertuples():
|
|
256
|
-
assert tup.img == tup.img_literal
|
|
257
|
-
|
|
258
|
-
# Test adding stored image transformation
|
|
259
|
-
tbl.add_column(rotated=tbl.img.rotate(30), stored=True)
|
|
260
|
-
assert(MediaStore.count(tbl.get_id()) == 2 * n_sample_rows)
|
|
261
|
-
|
|
262
|
-
# Test MediaStore.stats()
|
|
263
|
-
stats = list(filter(lambda x: x[0] == tbl.get_id(), MediaStore.stats()))
|
|
264
|
-
assert len(stats) == 2 # Two columns
|
|
265
|
-
assert stats[0][2] == n_sample_rows # Each column has n_sample_rows associated images
|
|
266
|
-
assert stats[1][2] == n_sample_rows
|
|
267
|
-
|
|
268
|
-
# Test that version-specific images are cleared when table is reverted
|
|
269
|
-
tbl.revert()
|
|
270
|
-
assert(MediaStore.count(tbl.get_id()) == n_sample_rows)
|
|
271
|
-
|
|
272
|
-
# Test that all stored images are cleared when table is dropped
|
|
273
|
-
cl.drop_table('test')
|
|
274
|
-
assert(MediaStore.count(tbl.get_id()) == 0)
|
|
275
|
-
|
|
276
|
-
def test_schema_spec(self, test_client: pxt.Client) -> None:
|
|
277
|
-
cl = test_client
|
|
278
|
-
|
|
279
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
280
|
-
cl.create_table('test', {'c 1': IntType()})
|
|
281
|
-
assert 'invalid column name' in str(exc_info.value).lower()
|
|
282
|
-
|
|
283
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
284
|
-
cl.create_table('test', {'c1': {}})
|
|
285
|
-
assert '"type" is required' in str(exc_info.value)
|
|
286
|
-
|
|
287
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
288
|
-
cl.create_table('test', {'c1': {'xyz': IntType()}})
|
|
289
|
-
assert "invalid key 'xyz'" in str(exc_info.value)
|
|
290
|
-
|
|
291
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
292
|
-
cl.create_table('test', {'c1': {'stored': True}})
|
|
293
|
-
assert '"type" is required' in str(exc_info.value)
|
|
294
|
-
|
|
295
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
296
|
-
cl.create_table('test', {'c1': {'type': 'string'}})
|
|
297
|
-
assert 'must be a ColumnType' in str(exc_info.value)
|
|
298
|
-
|
|
299
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
300
|
-
cl.create_table('test', {'c1': {'value': 1, 'type': StringType()}})
|
|
301
|
-
assert '"type" is redundant' in str(exc_info.value)
|
|
302
|
-
|
|
303
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
304
|
-
cl.create_table('test', {'c1': {'value': pytest}})
|
|
305
|
-
assert 'value needs to be either' in str(exc_info.value)
|
|
306
|
-
|
|
307
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
308
|
-
def f() -> float:
|
|
309
|
-
return 1.0
|
|
310
|
-
cl.create_table('test', {'c1': {'value': f}})
|
|
311
|
-
assert '"type" is required' in str(exc_info.value)
|
|
312
|
-
|
|
313
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
314
|
-
cl.create_table('test', {'c1': {'type': StringType(), 'stored': 'true'}})
|
|
315
|
-
assert '"stored" must be a bool' in str(exc_info.value)
|
|
316
|
-
|
|
317
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
318
|
-
cl.create_table('test', {'c1': StringType()}, primary_key='c2')
|
|
319
|
-
assert 'primary key column c2 not found' in str(exc_info.value).lower()
|
|
320
|
-
|
|
321
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
322
|
-
cl.create_table('test', {'c1': StringType()}, primary_key=['c1', 'c2'])
|
|
323
|
-
assert 'primary key column c2 not found' in str(exc_info.value).lower()
|
|
324
|
-
|
|
325
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
326
|
-
cl.create_table('test', {'c1': StringType()}, primary_key=['c2'])
|
|
327
|
-
assert 'primary key column c2 not found' in str(exc_info.value).lower()
|
|
328
|
-
|
|
329
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
330
|
-
cl.create_table('test', {'c1': StringType()}, primary_key=0)
|
|
331
|
-
assert 'primary_key must be a' in str(exc_info.value).lower()
|
|
332
|
-
|
|
333
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
334
|
-
cl.create_table('test', {'c1': StringType(nullable=True)}, primary_key='c1')
|
|
335
|
-
assert 'cannot be nullable' in str(exc_info.value).lower()
|
|
336
|
-
|
|
337
|
-
def check_bad_media(
|
|
338
|
-
self, test_client: pxt.Client, rows: List[Tuple[str, bool]], col_type: pxt.ColumnType,
|
|
339
|
-
validate_local_path: bool = True
|
|
340
|
-
) -> None:
|
|
341
|
-
schema = {
|
|
342
|
-
'media': col_type,
|
|
343
|
-
'is_bad_media': BoolType(nullable=False),
|
|
344
|
-
}
|
|
345
|
-
tbl = test_client.create_table('test', schema)
|
|
346
|
-
|
|
347
|
-
assert len(rows) > 0
|
|
348
|
-
total_bad_rows = sum([int(row['is_bad_media']) for row in rows])
|
|
349
|
-
assert total_bad_rows > 0
|
|
350
|
-
|
|
351
|
-
# Mode 1: Validation error on bad input (default)
|
|
352
|
-
# we ignore the exact error here, because it depends on the media type
|
|
353
|
-
with pytest.raises(excs.Error):
|
|
354
|
-
tbl.insert(rows, fail_on_exception=True)
|
|
355
|
-
|
|
356
|
-
# Mode 2: fail_on_exception=False, store error information in table
|
|
357
|
-
status = tbl.insert(rows, fail_on_exception=False)
|
|
358
|
-
_ = tbl.select(tbl.media, tbl.media.errormsg).show()
|
|
359
|
-
assert status.num_rows == len(rows)
|
|
360
|
-
assert status.num_excs == total_bad_rows
|
|
361
|
-
|
|
362
|
-
# check that we have the right number of bad and good rows
|
|
363
|
-
assert tbl.where(tbl.is_bad_media == True).count() == total_bad_rows
|
|
364
|
-
assert tbl.where(tbl.is_bad_media == False).count() == len(rows) - total_bad_rows
|
|
365
|
-
|
|
366
|
-
# check error type is set correctly
|
|
367
|
-
assert tbl.where((tbl.is_bad_media == True) & (tbl.media.errortype == None)).count() == 0
|
|
368
|
-
assert tbl.where((tbl.is_bad_media == False) & (tbl.media.errortype == None)).count() \
|
|
369
|
-
== len(rows) - total_bad_rows
|
|
370
|
-
|
|
371
|
-
# check fileurl is set for valid images, and check no file url is set for bad images
|
|
372
|
-
assert tbl.where((tbl.is_bad_media == False) & (tbl.media.fileurl == None)).count() == 0
|
|
373
|
-
assert tbl.where((tbl.is_bad_media == True) & (tbl.media.fileurl != None)).count() == 0
|
|
374
|
-
|
|
375
|
-
if validate_local_path:
|
|
376
|
-
# check that tbl.media is a valid local path
|
|
377
|
-
paths = tbl.where(tbl.media != None).select(output=tbl.media).collect()['output']
|
|
378
|
-
for path in paths:
|
|
379
|
-
assert os.path.exists(path) and os.path.isfile(path)
|
|
380
|
-
|
|
381
|
-
def test_validate_image(self, test_client: pxt.Client) -> None:
|
|
382
|
-
rows = read_data_file('imagenette2-160', 'manifest_bad.csv', ['img'])
|
|
383
|
-
rows = [{'media': r['img'], 'is_bad_media': r['is_bad_image']} for r in rows]
|
|
384
|
-
self.check_bad_media(test_client, rows, ImageType(nullable=True), validate_local_path=False)
|
|
385
|
-
|
|
386
|
-
def test_validate_video(self, test_client: pxt.Client) -> None:
|
|
387
|
-
files = get_video_files(include_bad_video=True)
|
|
388
|
-
rows = [{'media': f, 'is_bad_media': f.endswith('bad_video.mp4')} for f in files]
|
|
389
|
-
self.check_bad_media(test_client, rows, VideoType(nullable=True))
|
|
390
|
-
|
|
391
|
-
def test_validate_audio(self, test_client: pxt.Client) -> None:
|
|
392
|
-
files = get_audio_files(include_bad_audio=True)
|
|
393
|
-
rows = [{'media': f, 'is_bad_media': f.endswith('bad_audio.mp3')} for f in files]
|
|
394
|
-
self.check_bad_media(test_client, rows, AudioType(nullable=True))
|
|
395
|
-
|
|
396
|
-
def test_validate_docs(self, test_client: pxt.Client) -> None:
|
|
397
|
-
valid_doc_paths = get_documents()
|
|
398
|
-
invalid_doc_paths = [get_video_files()[0], get_audio_files()[0], get_image_files()[0]]
|
|
399
|
-
doc_paths = valid_doc_paths + invalid_doc_paths
|
|
400
|
-
is_valid = [True] * len(valid_doc_paths) + [False] * len(invalid_doc_paths)
|
|
401
|
-
rows = [{'media': f, 'is_bad_media': not is_valid} for f, is_valid in zip(doc_paths, is_valid)]
|
|
402
|
-
self.check_bad_media(test_client, rows, DocumentType(nullable=True))
|
|
403
|
-
|
|
404
|
-
def test_validate_external_url(self, test_client: pxt.Client) -> None:
|
|
405
|
-
skip_test_if_not_installed('boto3')
|
|
406
|
-
rows = [
|
|
407
|
-
{'media': 's3://open-images-dataset/validation/doesnotexist.jpg', 'is_bad_media': True},
|
|
408
|
-
{'media': 'https://archive.random.org/download?file=2024-01-28.bin', 'is_bad_media': True}, # 403 error
|
|
409
|
-
{'media': 's3://open-images-dataset/validation/3c02ca9ec9b2b77b.jpg', 'is_bad_media': True}, # wrong media
|
|
410
|
-
# test s3 url
|
|
411
|
-
{
|
|
412
|
-
'media': 's3://multimedia-commons/data/videos/mp4/ffe/ff3/ffeff3c6bf57504e7a6cecaff6aefbc9.mp4',
|
|
413
|
-
'is_bad_media': False
|
|
414
|
-
},
|
|
415
|
-
# test http url
|
|
416
|
-
{
|
|
417
|
-
'media': 'https://github.com/pixeltable/pixeltable/raw/master/pixeltable/tests/data/videos/bangkok.mp4',
|
|
418
|
-
'is_bad_media': False
|
|
419
|
-
},
|
|
420
|
-
|
|
421
|
-
]
|
|
422
|
-
self.check_bad_media(test_client, rows, VideoType(nullable=True))
|
|
423
|
-
|
|
424
|
-
def test_create_s3_image_table(self, test_client: pxt.Client) -> None:
|
|
425
|
-
skip_test_if_not_installed('boto3')
|
|
426
|
-
cl = test_client
|
|
427
|
-
tbl = cl.create_table('test', {'img': ImageType(nullable=False)})
|
|
428
|
-
# this is needed because Client.reset_catalog() doesn't call TableVersion.drop(), which would
|
|
429
|
-
# clear the file cache
|
|
430
|
-
# TODO: change reset_catalog() to drop tables
|
|
431
|
-
FileCache.get().clear()
|
|
432
|
-
cache_stats = FileCache.get().stats()
|
|
433
|
-
assert cache_stats.num_requests == 0, f'{str(cache_stats)} tbl_id={tbl.get_id()}'
|
|
434
|
-
# add computed column to make sure that external files are cached locally during insert
|
|
435
|
-
tbl.add_column(rotated=tbl.img.rotate(30), stored=True)
|
|
436
|
-
urls = [
|
|
437
|
-
's3://open-images-dataset/validation/3c02ca9ec9b2b77b.jpg',
|
|
438
|
-
's3://open-images-dataset/validation/3c13e0015b6c3bcf.jpg',
|
|
439
|
-
's3://open-images-dataset/validation/3ba5380490084697.jpg',
|
|
440
|
-
's3://open-images-dataset/validation/3afeb4b34f90c0cf.jpg',
|
|
441
|
-
's3://open-images-dataset/validation/3b07a2c0d5c0c789.jpg',
|
|
442
|
-
]
|
|
443
|
-
|
|
444
|
-
tbl.insert({'img': url} for url in urls)
|
|
445
|
-
# check that we populated the cache
|
|
446
|
-
cache_stats = FileCache.get().stats()
|
|
447
|
-
assert cache_stats.num_requests == len(urls), f'{str(cache_stats)} tbl_id={tbl.get_id()}'
|
|
448
|
-
assert cache_stats.num_hits == 0
|
|
449
|
-
assert FileCache.get().num_files() == len(urls)
|
|
450
|
-
assert FileCache.get().num_files(tbl.get_id()) == len(urls)
|
|
451
|
-
assert FileCache.get().avg_file_size() > 0
|
|
452
|
-
|
|
453
|
-
# query: we read from the cache
|
|
454
|
-
_ = tbl.show(0)
|
|
455
|
-
cache_stats = FileCache.get().stats()
|
|
456
|
-
assert cache_stats.num_requests == 2 * len(urls)
|
|
457
|
-
assert cache_stats.num_hits == len(urls)
|
|
458
|
-
|
|
459
|
-
# after clearing the cache, we need to re-fetch the files
|
|
460
|
-
FileCache.get().clear()
|
|
461
|
-
_ = tbl.show(0)
|
|
462
|
-
cache_stats = FileCache.get().stats()
|
|
463
|
-
assert cache_stats.num_requests == len(urls)
|
|
464
|
-
assert cache_stats.num_hits == 0
|
|
465
|
-
|
|
466
|
-
# start with fresh client and FileCache instance to test FileCache initialization with pre-existing files
|
|
467
|
-
cl = pxt.Client(reload=True)
|
|
468
|
-
# is there a better way to do this?
|
|
469
|
-
FileCache._instance = None
|
|
470
|
-
t = cl.get_table('test')
|
|
471
|
-
_ = t.show(0)
|
|
472
|
-
cache_stats = FileCache.get().stats()
|
|
473
|
-
assert cache_stats.num_requests == len(urls)
|
|
474
|
-
assert cache_stats.num_hits == len(urls)
|
|
475
|
-
|
|
476
|
-
# dropping the table also clears the file cache
|
|
477
|
-
cl.drop_table('test')
|
|
478
|
-
cache_stats = FileCache.get().stats()
|
|
479
|
-
assert cache_stats.total_size == 0
|
|
480
|
-
|
|
481
|
-
def test_video_url(self, test_client: pxt.Client) -> None:
|
|
482
|
-
skip_test_if_not_installed('boto3')
|
|
483
|
-
cl = test_client
|
|
484
|
-
schema = {
|
|
485
|
-
'payload': IntType(nullable=False),
|
|
486
|
-
'video': VideoType(nullable=False),
|
|
487
|
-
}
|
|
488
|
-
tbl = cl.create_table('test', schema)
|
|
489
|
-
url = 's3://multimedia-commons/data/videos/mp4/ffe/ff3/ffeff3c6bf57504e7a6cecaff6aefbc9.mp4'
|
|
490
|
-
tbl.insert(payload=1, video=url)
|
|
491
|
-
row = tbl.select(tbl.video.fileurl, tbl.video.localpath).collect()[0]
|
|
492
|
-
assert row['video_fileurl'] == url
|
|
493
|
-
# row['video_localpath'] contains a valid path to an mp4 file
|
|
494
|
-
local_path = row['video_localpath']
|
|
495
|
-
assert os.path.exists(local_path) and os.path.isfile(local_path)
|
|
496
|
-
cap = cv2.VideoCapture(local_path)
|
|
497
|
-
# TODO: this isn't sufficient to determine that this is actually a video, rather than an image
|
|
498
|
-
assert cap.isOpened()
|
|
499
|
-
cap.release()
|
|
500
|
-
|
|
501
|
-
def test_create_video_table(self, test_client: pxt.Client) -> None:
|
|
502
|
-
skip_test_if_not_installed('boto3')
|
|
503
|
-
cl = test_client
|
|
504
|
-
tbl = cl.create_table(
|
|
505
|
-
'test_tbl',
|
|
506
|
-
{'payload': IntType(nullable=False), 'video': VideoType(nullable=True)})
|
|
507
|
-
args = {'video': tbl.video, 'fps': 0}
|
|
508
|
-
view = cl.create_view('test_view', tbl, iterator_class=FrameIterator, iterator_args=args)
|
|
509
|
-
view.add_column(c1=view.frame.rotate(30), stored=True)
|
|
510
|
-
view.add_column(c2=view.c1.rotate(40), stored=False)
|
|
511
|
-
view.add_column(c3=view.c2.rotate(50), stored=True)
|
|
512
|
-
# a non-materialized column that refers to another non-materialized column
|
|
513
|
-
view.add_column(c4=view.c2.rotate(60), stored=False)
|
|
514
|
-
|
|
515
|
-
# cols computed with window functions are stored by default
|
|
516
|
-
view.add_column(c5=self.window_fn(view.frame_idx, 1, group_by=view.video))
|
|
517
|
-
|
|
518
|
-
# reload to make sure that metadata gets restored correctly
|
|
519
|
-
cl = pxt.Client(reload=True)
|
|
520
|
-
tbl = cl.get_table('test_tbl')
|
|
521
|
-
view = cl.get_table('test_view')
|
|
522
|
-
# we're inserting only a single row and the video column is not in position 0
|
|
523
|
-
url = 's3://multimedia-commons/data/videos/mp4/ffe/ff3/ffeff3c6bf57504e7a6cecaff6aefbc9.mp4'
|
|
524
|
-
status = tbl.insert(payload=1, video=url)
|
|
525
|
-
assert status.num_excs == 0
|
|
526
|
-
# * 2: we have 2 stored img cols
|
|
527
|
-
assert MediaStore.count(view.get_id()) == view.count() * 2
|
|
528
|
-
# also insert a local file
|
|
529
|
-
tbl.insert(payload=1, video=get_video_files()[0])
|
|
530
|
-
assert MediaStore.count(view.get_id()) == view.count() * 2
|
|
531
|
-
|
|
532
|
-
# TODO: test inserting Nulls
|
|
533
|
-
#status = tbl.insert(payload=1, video=None)
|
|
534
|
-
#assert status.num_excs == 0
|
|
535
|
-
|
|
536
|
-
# revert() clears stored images
|
|
537
|
-
tbl.revert()
|
|
538
|
-
tbl.revert()
|
|
539
|
-
assert MediaStore.count(view.get_id()) == 0
|
|
540
|
-
|
|
541
|
-
with pytest.raises(excs.Error):
|
|
542
|
-
# can't drop frame col
|
|
543
|
-
view.drop_column('frame')
|
|
544
|
-
with pytest.raises(excs.Error):
|
|
545
|
-
# can't drop frame_idx col
|
|
546
|
-
view.drop_column('frame_idx')
|
|
547
|
-
|
|
548
|
-
# drop() clears stored images and the cache
|
|
549
|
-
tbl.insert(payload=1, video=get_video_files()[0])
|
|
550
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
551
|
-
cl.drop_table('test_tbl')
|
|
552
|
-
assert 'has dependents: test_view' in str(exc_info.value)
|
|
553
|
-
cl.drop_table('test_view')
|
|
554
|
-
cl.drop_table('test_tbl')
|
|
555
|
-
assert MediaStore.count(view.get_id()) == 0
|
|
556
|
-
|
|
557
|
-
def test_insert_nulls(self, test_client: pxt.Client) -> None:
|
|
558
|
-
cl = test_client
|
|
559
|
-
schema = {
|
|
560
|
-
'c1': StringType(nullable=True),
|
|
561
|
-
'c2': IntType(nullable=True),
|
|
562
|
-
'c3': FloatType(nullable=True),
|
|
563
|
-
'c4': BoolType(nullable=True),
|
|
564
|
-
'c5': ArrayType((2, 3), dtype=IntType(), nullable=True),
|
|
565
|
-
'c6': JsonType(nullable=True),
|
|
566
|
-
'c7': ImageType(nullable=True),
|
|
567
|
-
'c8': VideoType(nullable=True),
|
|
568
|
-
}
|
|
569
|
-
t = cl.create_table('test1', schema)
|
|
570
|
-
status = t.insert(c1='abc')
|
|
571
|
-
assert status.num_rows == 1
|
|
572
|
-
assert status.num_excs == 0
|
|
573
|
-
|
|
574
|
-
def test_insert(self, test_client: pxt.Client) -> None:
|
|
575
|
-
cl = test_client
|
|
576
|
-
schema = {
|
|
577
|
-
'c1': StringType(nullable=False),
|
|
578
|
-
'c2': IntType(nullable=False),
|
|
579
|
-
'c3': FloatType(nullable=False),
|
|
580
|
-
'c4': BoolType(nullable=False),
|
|
581
|
-
'c5': ArrayType((2, 3), dtype=IntType(), nullable=False),
|
|
582
|
-
'c6': JsonType(nullable=False),
|
|
583
|
-
'c7': ImageType(nullable=False),
|
|
584
|
-
'c8': VideoType(nullable=False),
|
|
585
|
-
}
|
|
586
|
-
t = cl.create_table('test1', schema)
|
|
587
|
-
rows = create_table_data(t)
|
|
588
|
-
status = t.insert(rows)
|
|
589
|
-
assert status.num_rows == len(rows)
|
|
590
|
-
assert status.num_excs == 0
|
|
591
|
-
|
|
592
|
-
# alternate (kwargs) insert syntax
|
|
593
|
-
status = t.insert(
|
|
594
|
-
c1='string',
|
|
595
|
-
c2=91,
|
|
596
|
-
c3=1.0,
|
|
597
|
-
c4=True,
|
|
598
|
-
c5=np.ones((2, 3), dtype=np.dtype(np.int64)),
|
|
599
|
-
c6={'key': 'val'},
|
|
600
|
-
c7=get_image_files()[0],
|
|
601
|
-
c8=get_video_files()[0]
|
|
602
|
-
)
|
|
603
|
-
assert status.num_rows == 1
|
|
604
|
-
assert status.num_excs == 0
|
|
605
|
-
|
|
606
|
-
# empty input
|
|
607
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
608
|
-
t.insert([])
|
|
609
|
-
assert 'empty' in str(exc_info.value)
|
|
610
|
-
|
|
611
|
-
# missing column
|
|
612
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
613
|
-
# drop first column
|
|
614
|
-
col_names = list(rows[0].keys())[1:]
|
|
615
|
-
new_rows = [{col_name: row[col_name] for col_name in col_names} for row in rows]
|
|
616
|
-
t.insert(new_rows)
|
|
617
|
-
assert 'Missing' in str(exc_info.value)
|
|
618
|
-
|
|
619
|
-
# incompatible schema
|
|
620
|
-
for (col_name, col_type), value_col_name in zip(schema.items(), ['c2', 'c3', 'c5', 'c5', 'c6', 'c7', 'c2', 'c2']):
|
|
621
|
-
cl.drop_table('test1', ignore_errors=True)
|
|
622
|
-
t = cl.create_table('test1', {col_name: col_type})
|
|
623
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
624
|
-
t.insert({col_name: r[value_col_name]} for r in rows)
|
|
625
|
-
assert 'expected' in str(exc_info.value).lower()
|
|
626
|
-
|
|
627
|
-
# rows not list of dicts
|
|
628
|
-
cl.drop_table('test1', ignore_errors=True)
|
|
629
|
-
t = cl.create_table('test1', {'c1': StringType()})
|
|
630
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
631
|
-
t.insert(['1'])
|
|
632
|
-
assert 'list of dictionaries' in str(exc_info.value)
|
|
633
|
-
|
|
634
|
-
# bad null value
|
|
635
|
-
cl.drop_table('test1', ignore_errors=True)
|
|
636
|
-
t = cl.create_table('test1', {'c1': StringType(nullable=False)})
|
|
637
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
638
|
-
t.insert(c1=None)
|
|
639
|
-
assert 'expected non-None' in str(exc_info.value)
|
|
640
|
-
|
|
641
|
-
# bad array literal
|
|
642
|
-
cl.drop_table('test1', ignore_errors=True)
|
|
643
|
-
t = cl.create_table('test1', {'c5': ArrayType((2, 3), dtype=IntType(), nullable=False)})
|
|
644
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
645
|
-
t.insert(c5=np.ndarray((3, 2)))
|
|
646
|
-
assert 'expected ndarray((2, 3)' in str(exc_info.value)
|
|
647
|
-
|
|
648
|
-
def test_insert_string_with_null(self, test_client: pxt.Client) -> None:
|
|
649
|
-
cl = test_client
|
|
650
|
-
t = cl.create_table('test', {'c1': StringType()})
|
|
651
|
-
|
|
652
|
-
t.insert([{'c1': 'this is a python\x00string'}])
|
|
653
|
-
assert t.count() == 1
|
|
654
|
-
for tup in t.df().collect():
|
|
655
|
-
assert tup['c1'] == 'this is a python string'
|
|
656
|
-
|
|
657
|
-
def test_query(self, test_client: pxt.Client) -> None:
|
|
658
|
-
skip_test_if_not_installed('boto3')
|
|
659
|
-
cl = test_client
|
|
660
|
-
col_names = ['c1', 'c2', 'c3', 'c4', 'c5']
|
|
661
|
-
t = make_tbl(cl, 'test', col_names)
|
|
662
|
-
rows = create_table_data(t)
|
|
663
|
-
t.insert(rows)
|
|
664
|
-
_ = t.show(n=0)
|
|
665
|
-
|
|
666
|
-
# test querying existing table
|
|
667
|
-
cl = pxt.Client(reload=True)
|
|
668
|
-
t2 = cl.get_table('test')
|
|
669
|
-
_ = t2.show(n=0)
|
|
670
|
-
|
|
671
|
-
def test_batch_update(self, test_tbl: pxt.Table) -> None:
|
|
672
|
-
t = test_tbl
|
|
673
|
-
validate_update_status(
|
|
674
|
-
t.batch_update([{'c1': '1', 'c2': 1}, {'c1': '2', 'c2': 2}]),
|
|
675
|
-
expected_rows=2)
|
|
676
|
-
assert t.where(t.c2 == 1).collect()[0]['c1'] == '1'
|
|
677
|
-
assert t.where(t.c2 == 2).collect()[0]['c1'] == '2'
|
|
678
|
-
validate_update_status(
|
|
679
|
-
t.batch_update([{'c1': 'one', '_rowid': (1,)}, {'c1': 'two', '_rowid': (2,)}]),
|
|
680
|
-
expected_rows=2)
|
|
681
|
-
assert t.where(t.c2 == 1).collect()[0]['c1'] == 'one'
|
|
682
|
-
assert t.where(t.c2 == 2).collect()[0]['c1'] == 'two'
|
|
683
|
-
|
|
684
|
-
cl = pxt.Client()
|
|
685
|
-
# test composite primary key
|
|
686
|
-
schema = {'c1': StringType(), 'c2': IntType(), 'c3': FloatType()}
|
|
687
|
-
t = cl.create_table('composite', schema=schema, primary_key=['c1', 'c2'])
|
|
688
|
-
rows = [{'c1': str(i), 'c2': i, 'c3': float(i)} for i in range(10)]
|
|
689
|
-
validate_update_status(t.insert(rows), expected_rows=10)
|
|
690
|
-
|
|
691
|
-
validate_update_status(
|
|
692
|
-
t.batch_update([{'c1': '1', 'c2': 1, 'c3': 2.0}, {'c1': '2', 'c2': 2, 'c3': 3.0}]),
|
|
693
|
-
expected_rows=2)
|
|
694
|
-
|
|
695
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
696
|
-
# can't mix _rowid with primary key
|
|
697
|
-
_ = t.batch_update([{'c1': '1', 'c2': 1, 'c3': 2.0, '_rowid': (1,)}])
|
|
698
|
-
assert 'c1 is a primary key column' in str(exc_info.value).lower()
|
|
699
|
-
|
|
700
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
701
|
-
# bad literal
|
|
702
|
-
_ = t.batch_update([{'c2': 1, 'c3': 'a'}])
|
|
703
|
-
assert "'a' is not a valid literal" in str(exc_info.value).lower()
|
|
704
|
-
|
|
705
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
706
|
-
# missing primary key column
|
|
707
|
-
t.batch_update([{'c1': '1', 'c3': 2.0}])
|
|
708
|
-
assert 'primary key columns (c2) missing' in str(exc_info.value).lower()
|
|
709
|
-
|
|
710
|
-
# table without primary key
|
|
711
|
-
t2 = cl.create_table('no_pk', schema=schema)
|
|
712
|
-
validate_update_status(t2.insert(rows), expected_rows=10)
|
|
713
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
714
|
-
_ = t2.batch_update([{'c1': '1', 'c2': 1, 'c3': 2.0}])
|
|
715
|
-
assert 'must have primary key for batch update' in str(exc_info.value).lower()
|
|
716
|
-
|
|
717
|
-
# updating with _rowid still works
|
|
718
|
-
validate_update_status(
|
|
719
|
-
t2.batch_update([{'c1': 'one', '_rowid': (1,)}, {'c1': 'two', '_rowid': (2,)}]),
|
|
720
|
-
expected_rows=2)
|
|
721
|
-
assert t2.where(t2.c2 == 1).collect()[0]['c1'] == 'one'
|
|
722
|
-
assert t2.where(t2.c2 == 2).collect()[0]['c1'] == 'two'
|
|
723
|
-
with pytest.raises(AssertionError):
|
|
724
|
-
# some rows are missing rowids
|
|
725
|
-
_ = t2.batch_update([{'c1': 'one', '_rowid': (1,)}, {'c1': 'two'}])
|
|
726
|
-
|
|
727
|
-
def test_update(self, test_tbl: pxt.Table, small_img_tbl) -> None:
|
|
728
|
-
t = test_tbl
|
|
729
|
-
# update every type with a literal
|
|
730
|
-
test_cases = [
|
|
731
|
-
('c1', 'new string'),
|
|
732
|
-
# TODO: ('c1n', None),
|
|
733
|
-
('c3', -1.0),
|
|
734
|
-
('c4', True),
|
|
735
|
-
('c5', datetime.datetime.now()),
|
|
736
|
-
('c6', [{'x': 1, 'y': 2}]),
|
|
737
|
-
]
|
|
738
|
-
count = t.count()
|
|
739
|
-
for col_name, literal in test_cases:
|
|
740
|
-
status = t.update({col_name: literal}, where=t.c3 < 10.0, cascade=False)
|
|
741
|
-
assert status.num_rows == 10
|
|
742
|
-
assert status.updated_cols == [f'{t.get_name()}.{col_name}']
|
|
743
|
-
assert t.count() == count
|
|
744
|
-
t.revert()
|
|
745
|
-
|
|
746
|
-
# exchange two columns
|
|
747
|
-
t.add_column(float_col=FloatType(nullable=True))
|
|
748
|
-
t.update({'float_col': 1.0})
|
|
749
|
-
float_col_vals = t.select(t.float_col).collect().to_pandas()['float_col']
|
|
750
|
-
c3_vals = t.select(t.c3).collect().to_pandas()['c3']
|
|
751
|
-
assert np.all(float_col_vals == pd.Series([1.0] * t.count()))
|
|
752
|
-
t.update({'c3': t.float_col, 'float_col': t.c3})
|
|
753
|
-
assert np.all(t.select(t.c3).collect().to_pandas()['c3'] == float_col_vals)
|
|
754
|
-
assert np.all(t.select(t.float_col).collect().to_pandas()['float_col'] == c3_vals)
|
|
755
|
-
t.revert()
|
|
756
|
-
|
|
757
|
-
# update column that is used in computed cols
|
|
758
|
-
t.add_column(computed1=t.c3 + 1)
|
|
759
|
-
t.add_column(computed2=t.computed1 + 1)
|
|
760
|
-
t.add_column(computed3=t.c3 + 3)
|
|
761
|
-
|
|
762
|
-
# cascade=False
|
|
763
|
-
computed1 = t.order_by(t.computed1).show(0).to_pandas()['computed1']
|
|
764
|
-
computed2 = t.order_by(t.computed2).show(0).to_pandas()['computed2']
|
|
765
|
-
computed3 = t.order_by(t.computed3).show(0).to_pandas()['computed3']
|
|
766
|
-
assert t.where(t.c3 < 10.0).count() == 10
|
|
767
|
-
assert t.where(t.c3 == 10.0).count() == 1
|
|
768
|
-
# update to a value that also satisfies the where clause
|
|
769
|
-
status = t.update({'c3': 0.0}, where=t.c3 < 10.0, cascade=False)
|
|
770
|
-
assert status.num_rows == 10
|
|
771
|
-
assert status.updated_cols == ['test_tbl.c3']
|
|
772
|
-
assert t.where(t.c3 < 10.0).count() == 10
|
|
773
|
-
assert t.where(t.c3 == 0.0).count() == 10
|
|
774
|
-
# computed cols are not updated
|
|
775
|
-
assert np.all(t.order_by(t.computed1).show(0).to_pandas()['computed1'] == computed1)
|
|
776
|
-
assert np.all(t.order_by(t.computed2).show(0).to_pandas()['computed2'] == computed2)
|
|
777
|
-
assert np.all(t.order_by(t.computed3).show(0).to_pandas()['computed3'] == computed3)
|
|
778
|
-
|
|
779
|
-
# revert, then verify that we're back to where we started
|
|
780
|
-
cl = pxt.Client(reload=True)
|
|
781
|
-
t = cl.get_table(t.get_name())
|
|
782
|
-
t.revert()
|
|
783
|
-
assert t.where(t.c3 < 10.0).count() == 10
|
|
784
|
-
assert t.where(t.c3 == 10.0).count() == 1
|
|
785
|
-
|
|
786
|
-
# cascade=True
|
|
787
|
-
status = t.update({'c3': 0.0}, where=t.c3 < 10.0, cascade=True)
|
|
788
|
-
assert status.num_rows == 10
|
|
789
|
-
assert set(status.updated_cols) == \
|
|
790
|
-
set(['test_tbl.c3', 'test_tbl.computed1', 'test_tbl.computed2', 'test_tbl.computed3'])
|
|
791
|
-
assert t.where(t.c3 < 10.0).count() == 10
|
|
792
|
-
assert t.where(t.c3 == 0.0).count() == 10
|
|
793
|
-
assert np.all(t.order_by(t.computed1).show(0).to_pandas()['computed1'][:10] == pd.Series([1.0] * 10))
|
|
794
|
-
assert np.all(t.order_by(t.computed2).show(0).to_pandas()['computed2'][:10] == pd.Series([2.0] * 10))
|
|
795
|
-
assert np.all(t.order_by(t.computed3).show(0).to_pandas()['computed3'][:10] == pd.Series([3.0] * 10))
|
|
796
|
-
|
|
797
|
-
# bad update spec
|
|
798
|
-
with pytest.raises(excs.Error) as excinfo:
|
|
799
|
-
t.update({1: 1})
|
|
800
|
-
assert 'dict key' in str(excinfo.value)
|
|
801
|
-
|
|
802
|
-
# unknown column
|
|
803
|
-
with pytest.raises(excs.Error) as excinfo:
|
|
804
|
-
t.update({'unknown': 1})
|
|
805
|
-
assert 'unknown unknown' in str(excinfo.value)
|
|
806
|
-
|
|
807
|
-
# incompatible type
|
|
808
|
-
with pytest.raises(excs.Error) as excinfo:
|
|
809
|
-
t.update({'c1': 1})
|
|
810
|
-
assert 'not compatible' in str(excinfo.value)
|
|
811
|
-
|
|
812
|
-
# can't update primary key
|
|
813
|
-
with pytest.raises(excs.Error) as excinfo:
|
|
814
|
-
t.update({'c2': 1})
|
|
815
|
-
assert 'primary key' in str(excinfo.value)
|
|
816
|
-
|
|
817
|
-
# can't update computed column
|
|
818
|
-
with pytest.raises(excs.Error) as excinfo:
|
|
819
|
-
t.update({'computed1': 1})
|
|
820
|
-
assert 'is computed' in str(excinfo.value)
|
|
821
|
-
|
|
822
|
-
# non-expr
|
|
823
|
-
with pytest.raises(excs.Error) as excinfo:
|
|
824
|
-
t.update({'c3': lambda c3: math.sqrt(c3)})
|
|
825
|
-
assert 'not a recognized' in str(excinfo.value)
|
|
826
|
-
|
|
827
|
-
# non-Predicate filter
|
|
828
|
-
with pytest.raises(excs.Error) as excinfo:
|
|
829
|
-
t.update({'c3': 1.0}, where=lambda c2: c2 == 10)
|
|
830
|
-
assert 'Predicate' in str(excinfo.value)
|
|
831
|
-
|
|
832
|
-
img_t = small_img_tbl
|
|
833
|
-
|
|
834
|
-
# can't update image col
|
|
835
|
-
with pytest.raises(excs.Error) as excinfo:
|
|
836
|
-
img_t.update({'img': 17}, where=img_t.img.nearest('car'))
|
|
837
|
-
assert 'has type image' in str(excinfo.value)
|
|
838
|
-
|
|
839
|
-
# similarity search is not supported
|
|
840
|
-
with pytest.raises(excs.Error) as excinfo:
|
|
841
|
-
img_t.update({'split': 'train'}, where=img_t.img.nearest('car'))
|
|
842
|
-
assert 'nearest()' in str(excinfo.value)
|
|
843
|
-
|
|
844
|
-
# filter not expressible in SQL
|
|
845
|
-
with pytest.raises(excs.Error) as excinfo:
|
|
846
|
-
img_t.update({'split': 'train'}, where=img_t.img.width > 100)
|
|
847
|
-
assert 'not expressible' in str(excinfo.value)
|
|
848
|
-
|
|
849
|
-
def test_cascading_update(self, test_tbl: pxt.InsertableTable) -> None:
|
|
850
|
-
t = test_tbl
|
|
851
|
-
t.add_column(d1=t.c3 - 1)
|
|
852
|
-
# add column that can be updated
|
|
853
|
-
t.add_column(c10=FloatType(nullable=True))
|
|
854
|
-
t.update({'c10': t.c3})
|
|
855
|
-
# computed column that depends on two columns: exercise duplicate elimination during query construction
|
|
856
|
-
t.add_column(d2=t.c3 - t.c10)
|
|
857
|
-
r1 = t.where(t.c2 < 5).select(t.c3 + 1.0, t.c10 - 1.0, t.c3, 2.0).order_by(t.c2).show(0)
|
|
858
|
-
t.update({'c4': True, 'c3': t.c3 + 1.0, 'c10': t.c10 - 1.0}, where=t.c2 < 5, cascade=True)
|
|
859
|
-
r2 = t.where(t.c2 < 5).select(t.c3, t.c10, t.d1, t.d2).order_by(t.c2).show(0)
|
|
860
|
-
assert_resultset_eq(r1, r2)
|
|
861
|
-
|
|
862
|
-
def test_delete(self, test_tbl: pxt.Table, small_img_tbl) -> None:
|
|
863
|
-
t = test_tbl
|
|
864
|
-
|
|
865
|
-
cnt = t.where(t.c3 < 10.0).count()
|
|
866
|
-
assert cnt == 10
|
|
867
|
-
cnt = t.where(t.c3 == 10.0).count()
|
|
868
|
-
assert cnt == 1
|
|
869
|
-
status = t.delete(where=t.c3 < 10.0)
|
|
870
|
-
assert status.num_rows == 10
|
|
871
|
-
cnt = t.where(t.c3 < 10.0).count()
|
|
872
|
-
assert cnt == 0
|
|
873
|
-
cnt = t.where(t.c3 == 10.0).count()
|
|
874
|
-
assert cnt == 1
|
|
875
|
-
|
|
876
|
-
# revert, then verify that we're back where we started
|
|
877
|
-
cl = pxt.Client(reload=True)
|
|
878
|
-
t = cl.get_table(t.get_name())
|
|
879
|
-
t.revert()
|
|
880
|
-
cnt = t.where(t.c3 < 10.0).count()
|
|
881
|
-
assert cnt == 10
|
|
882
|
-
cnt = t.where(t.c3 == 10.0).count()
|
|
883
|
-
assert cnt == 1
|
|
884
|
-
|
|
885
|
-
# non-Predicate filter
|
|
886
|
-
with pytest.raises(excs.Error) as excinfo:
|
|
887
|
-
t.delete(where=lambda c2: c2 == 10)
|
|
888
|
-
assert 'Predicate' in str(excinfo.value)
|
|
889
|
-
|
|
890
|
-
img_t = small_img_tbl
|
|
891
|
-
# similarity search is not supported
|
|
892
|
-
with pytest.raises(excs.Error) as excinfo:
|
|
893
|
-
img_t.delete(where=img_t.img.nearest('car'))
|
|
894
|
-
assert 'nearest()' in str(excinfo.value)
|
|
895
|
-
|
|
896
|
-
# filter not expressible in SQL
|
|
897
|
-
with pytest.raises(excs.Error) as excinfo:
|
|
898
|
-
img_t.delete(where=img_t.img.width > 100)
|
|
899
|
-
assert 'not expressible' in str(excinfo.value)
|
|
900
|
-
|
|
901
|
-
def test_computed_cols(self, test_client: pxt.client) -> None:
|
|
902
|
-
cl = test_client
|
|
903
|
-
schema = {
|
|
904
|
-
'c1': IntType(nullable=False),
|
|
905
|
-
'c2': FloatType(nullable=False),
|
|
906
|
-
'c3': JsonType(nullable=False),
|
|
907
|
-
}
|
|
908
|
-
t : pxt.InsertableTable = cl.create_table('test', schema)
|
|
909
|
-
status = t.add_column(c4=t.c1 + 1)
|
|
910
|
-
assert status.num_excs == 0
|
|
911
|
-
status = t.add_column(c5=t.c4 + 1)
|
|
912
|
-
assert status.num_excs == 0
|
|
913
|
-
status = t.add_column(c6=t.c1 / t.c2)
|
|
914
|
-
assert status.num_excs == 0
|
|
915
|
-
status = t.add_column(c7=t.c6 * t.c2)
|
|
916
|
-
assert status.num_excs == 0
|
|
917
|
-
status = t.add_column(c8=t.c3.detections['*'].bounding_box)
|
|
918
|
-
assert status.num_excs == 0
|
|
919
|
-
status = t.add_column(c9=lambda c2: math.sqrt(c2), type=FloatType())
|
|
920
|
-
assert status.num_excs == 0
|
|
921
|
-
|
|
922
|
-
# unstored cols that compute window functions aren't currently supported
|
|
923
|
-
with pytest.raises((excs.Error)):
|
|
924
|
-
t.add_column(c10=ptf.sum(t.c1, group_by=t.c1), stored=False)
|
|
925
|
-
|
|
926
|
-
# Column.dependent_cols are computed correctly
|
|
927
|
-
assert len(t.c1.col.dependent_cols) == 2
|
|
928
|
-
assert len(t.c2.col.dependent_cols) == 3
|
|
929
|
-
assert len(t.c3.col.dependent_cols) == 1
|
|
930
|
-
assert len(t.c4.col.dependent_cols) == 1
|
|
931
|
-
assert len(t.c5.col.dependent_cols) == 0
|
|
932
|
-
assert len(t.c6.col.dependent_cols) == 1
|
|
933
|
-
assert len(t.c7.col.dependent_cols) == 0
|
|
934
|
-
assert len(t.c8.col.dependent_cols) == 0
|
|
935
|
-
|
|
936
|
-
rows = create_table_data(t, ['c1', 'c2', 'c3'], num_rows=10)
|
|
937
|
-
t.insert(rows)
|
|
938
|
-
_ = t.show()
|
|
939
|
-
|
|
940
|
-
# not allowed to pass values for computed cols
|
|
941
|
-
with pytest.raises(excs.Error):
|
|
942
|
-
rows2 = create_table_data(t, ['c1', 'c2', 'c3', 'c4'], num_rows=10)
|
|
943
|
-
t.insert(rows2)
|
|
944
|
-
|
|
945
|
-
# test loading from store
|
|
946
|
-
cl = pxt.Client(reload=True)
|
|
947
|
-
t = cl.get_table('test')
|
|
948
|
-
assert len(t.columns()) == len(t.columns())
|
|
949
|
-
for i in range(len(t.columns())):
|
|
950
|
-
if t.columns()[i].value_expr is not None:
|
|
951
|
-
assert t.columns()[i].value_expr.equals(t.columns()[i].value_expr)
|
|
952
|
-
|
|
953
|
-
# make sure we can still insert data and that computed cols are still set correctly
|
|
954
|
-
status = t.insert(rows)
|
|
955
|
-
assert status.num_excs == 0
|
|
956
|
-
res = t.show(0)
|
|
957
|
-
tbl_df = t.show(0).to_pandas()
|
|
958
|
-
|
|
959
|
-
# can't drop c4: c5 depends on it
|
|
960
|
-
with pytest.raises(excs.Error):
|
|
961
|
-
t.drop_column('c4')
|
|
962
|
-
t.drop_column('c5')
|
|
963
|
-
# now it works
|
|
964
|
-
t.drop_column('c4')
|
|
965
|
-
|
|
966
|
-
def test_expr_udf_computed_cols(self, test_client: pxt.Client) -> None:
|
|
967
|
-
cl = test_client
|
|
968
|
-
t = cl.create_table('test', {'c1': IntType(nullable=False)})
|
|
969
|
-
rows = [{'c1': i} for i in range(100)]
|
|
970
|
-
status = t.insert(rows)
|
|
971
|
-
assert status.num_rows == len(rows)
|
|
972
|
-
status = t.add_column(c2=t.c1 + 1)
|
|
973
|
-
assert status.num_excs == 0
|
|
974
|
-
# call with positional arg
|
|
975
|
-
status = t.add_column(c3=self.add1(t.c1))
|
|
976
|
-
assert status.num_excs == 0
|
|
977
|
-
# call with keyword arg
|
|
978
|
-
status = t.add_column(c4=self.add1(a=t.c1))
|
|
979
|
-
assert status.num_excs == 0
|
|
980
|
-
|
|
981
|
-
# TODO: how to verify the output?
|
|
982
|
-
describe_output = t.__repr__()
|
|
983
|
-
# 'add1' didn't get swallowed/the expr udf is still visible in the column definition
|
|
984
|
-
assert 'add1' in describe_output
|
|
985
|
-
|
|
986
|
-
def check(t: pxt.Table) -> None:
|
|
987
|
-
assert_resultset_eq(
|
|
988
|
-
t.select(t.c1 + 1).order_by(t.c1).collect(),
|
|
989
|
-
t.select(t.c2).order_by(t.c1).collect())
|
|
990
|
-
assert_resultset_eq(
|
|
991
|
-
t.select(t.c1 + 1).order_by(t.c1).collect(),
|
|
992
|
-
t.select(t.c3).order_by(t.c1).collect())
|
|
993
|
-
|
|
994
|
-
check(t)
|
|
995
|
-
# test loading from store
|
|
996
|
-
cl = pxt.Client(reload=True)
|
|
997
|
-
t = cl.get_table('test')
|
|
998
|
-
check(t)
|
|
999
|
-
|
|
1000
|
-
# make sure we can still insert data and that computed cols are still set correctly
|
|
1001
|
-
status = t.insert(rows)
|
|
1002
|
-
assert status.num_excs == 0
|
|
1003
|
-
check(t)
|
|
1004
|
-
|
|
1005
|
-
def test_computed_col_exceptions(self, test_client: pxt.Client, test_tbl: catalog.Table) -> None:
|
|
1006
|
-
cl = test_client
|
|
1007
|
-
|
|
1008
|
-
# exception during insert()
|
|
1009
|
-
schema = {'c2': IntType(nullable=False)}
|
|
1010
|
-
rows = list(test_tbl.select(test_tbl.c2).collect())
|
|
1011
|
-
t = cl.create_table('test_insert', schema)
|
|
1012
|
-
status = t.add_column(add1=self.f2(self.f1(t.c2)))
|
|
1013
|
-
assert status.num_excs == 0
|
|
1014
|
-
status = t.insert(rows, fail_on_exception=False)
|
|
1015
|
-
assert status.num_excs == 10
|
|
1016
|
-
assert 'test_insert.add1' in status.cols_with_excs
|
|
1017
|
-
assert t.where(t.add1.errortype != None).count() == 10
|
|
1018
|
-
|
|
1019
|
-
# exception during add_column()
|
|
1020
|
-
t = cl.create_table('test_add_column', schema)
|
|
1021
|
-
status = t.insert(rows)
|
|
1022
|
-
assert status.num_rows == 100
|
|
1023
|
-
assert status.num_excs == 0
|
|
1024
|
-
status = t.add_column(add1=self.f2(self.f1(t.c2)))
|
|
1025
|
-
assert status.num_excs == 10
|
|
1026
|
-
assert 'test_add_column.add1' in status.cols_with_excs
|
|
1027
|
-
assert t.where(t.add1.errortype != None).count() == 10
|
|
1028
|
-
|
|
1029
|
-
def _test_computed_img_cols(self, t: catalog.Table, stores_img_col: bool) -> None:
|
|
1030
|
-
rows = read_data_file('imagenette2-160', 'manifest.csv', ['img'])
|
|
1031
|
-
rows = [{'img': r['img']} for r in rows[:20]]
|
|
1032
|
-
status = t.insert(rows)
|
|
1033
|
-
assert status.num_rows == 20
|
|
1034
|
-
_ = t.count()
|
|
1035
|
-
_ = t.show()
|
|
1036
|
-
assert MediaStore.count(t.get_id()) == t.count() * stores_img_col
|
|
1037
|
-
|
|
1038
|
-
# test loading from store
|
|
1039
|
-
cl = pxt.Client(reload=True)
|
|
1040
|
-
t2 = cl.get_table(t.get_name())
|
|
1041
|
-
assert len(t.columns()) == len(t2.columns())
|
|
1042
|
-
for i in range(len(t.columns())):
|
|
1043
|
-
if t.columns()[i].value_expr is not None:
|
|
1044
|
-
assert t.columns()[i].value_expr.equals(t2.columns()[i].value_expr)
|
|
1045
|
-
|
|
1046
|
-
# make sure we can still insert data and that computed cols are still set correctly
|
|
1047
|
-
t2.insert(rows)
|
|
1048
|
-
assert MediaStore.count(t2.get_id()) == t2.count() * stores_img_col
|
|
1049
|
-
res = t2.show(0)
|
|
1050
|
-
tbl_df = t2.show(0).to_pandas()
|
|
1051
|
-
|
|
1052
|
-
# revert also removes computed images
|
|
1053
|
-
t2.revert()
|
|
1054
|
-
assert MediaStore.count(t2.get_id()) == t2.count() * stores_img_col
|
|
1055
|
-
|
|
1056
|
-
@pxt.udf(return_type=ImageType(), param_types=[ImageType()])
|
|
1057
|
-
def img_fn_with_exc(img: PIL.Image.Image) -> PIL.Image.Image:
|
|
1058
|
-
raise RuntimeError
|
|
1059
|
-
|
|
1060
|
-
def test_computed_img_cols(self, test_client: pxt.Client) -> None:
|
|
1061
|
-
cl = test_client
|
|
1062
|
-
schema = {'img': ImageType(nullable=False)}
|
|
1063
|
-
t = cl.create_table('test', schema)
|
|
1064
|
-
t.add_column(c2=t.img.width)
|
|
1065
|
-
# c3 is not stored by default
|
|
1066
|
-
t.add_column(c3=t.img.rotate(90))
|
|
1067
|
-
self._test_computed_img_cols(t, stores_img_col=False)
|
|
1068
|
-
|
|
1069
|
-
t = cl.create_table('test2', schema)
|
|
1070
|
-
# c3 is now stored
|
|
1071
|
-
t.add_column(c3=t.img.rotate(90), stored=True)
|
|
1072
|
-
self._test_computed_img_cols(t, stores_img_col=True)
|
|
1073
|
-
_ = t[t.c3.errortype].show(0)
|
|
1074
|
-
|
|
1075
|
-
# computed img col with exceptions
|
|
1076
|
-
t = cl.create_table('test3', schema)
|
|
1077
|
-
t.add_column(c3=self.img_fn_with_exc(t.img), stored=True)
|
|
1078
|
-
rows = read_data_file('imagenette2-160', 'manifest.csv', ['img'])
|
|
1079
|
-
rows = [{'img': r['img']} for r in rows[:20]]
|
|
1080
|
-
t.insert(rows, fail_on_exception=False)
|
|
1081
|
-
_ = t[t.c3.errortype].show(0)
|
|
1082
|
-
|
|
1083
|
-
def test_computed_window_fn(self, test_client: pxt.Client, test_tbl: catalog.Table) -> None:
|
|
1084
|
-
cl = test_client
|
|
1085
|
-
t = test_tbl
|
|
1086
|
-
# backfill
|
|
1087
|
-
t.add_column(c9=ptf.sum(t.c2, group_by=t.c4, order_by=t.c3))
|
|
1088
|
-
|
|
1089
|
-
schema = {
|
|
1090
|
-
'c2': IntType(nullable=False),
|
|
1091
|
-
'c3': FloatType(nullable=False),
|
|
1092
|
-
'c4': BoolType(nullable=False),
|
|
1093
|
-
}
|
|
1094
|
-
new_t = cl.create_table('insert_test', schema)
|
|
1095
|
-
new_t.add_column(c5=lambda c2: c2 * c2, type=IntType())
|
|
1096
|
-
new_t.add_column(c6=ptf.sum(new_t.c5, group_by=new_t.c4, order_by=new_t.c3))
|
|
1097
|
-
rows = list(t.select(t.c2, t.c4, t.c3).collect())
|
|
1098
|
-
new_t.insert(rows)
|
|
1099
|
-
_ = new_t.show(0)
|
|
1100
|
-
|
|
1101
|
-
def test_revert(self, test_client: pxt.Client) -> None:
|
|
1102
|
-
cl = test_client
|
|
1103
|
-
t1 = make_tbl(cl, 'test1', ['c1', 'c2'])
|
|
1104
|
-
assert t1.version() == 0
|
|
1105
|
-
rows1 = create_table_data(t1)
|
|
1106
|
-
t1.insert(rows1)
|
|
1107
|
-
assert t1.count() == len(rows1)
|
|
1108
|
-
assert t1.version() == 1
|
|
1109
|
-
rows2 = create_table_data(t1)
|
|
1110
|
-
t1.insert(rows2)
|
|
1111
|
-
assert t1.count() == len(rows1) + len(rows2)
|
|
1112
|
-
assert t1.version() == 2
|
|
1113
|
-
t1.revert()
|
|
1114
|
-
assert t1.count() == len(rows1)
|
|
1115
|
-
assert t1.version() == 1
|
|
1116
|
-
t1.insert(rows2)
|
|
1117
|
-
assert t1.count() == len(rows1) + len(rows2)
|
|
1118
|
-
assert t1.version() == 2
|
|
1119
|
-
|
|
1120
|
-
# can't revert past version 0
|
|
1121
|
-
t1.revert()
|
|
1122
|
-
t1.revert()
|
|
1123
|
-
with pytest.raises(excs.Error) as excinfo:
|
|
1124
|
-
t1.revert()
|
|
1125
|
-
assert 'version 0' in str(excinfo.value)
|
|
1126
|
-
|
|
1127
|
-
def test_add_column(self, test_tbl: catalog.Table) -> None:
|
|
1128
|
-
t = test_tbl
|
|
1129
|
-
num_orig_cols = len(t.columns())
|
|
1130
|
-
t.add_column(add1=pxt.IntType(nullable=True))
|
|
1131
|
-
assert len(t.columns()) == num_orig_cols + 1
|
|
1132
|
-
|
|
1133
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
1134
|
-
_ = t.add_column(add2=pxt.IntType(nullable=False))
|
|
1135
|
-
assert 'cannot add non-nullable' in str(exc_info.value).lower()
|
|
1136
|
-
|
|
1137
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
1138
|
-
_ = t.add_column(add2=pxt.IntType(nullable=False), add3=pxt.StringType())
|
|
1139
|
-
assert 'requires exactly one keyword argument' in str(exc_info.value).lower()
|
|
1140
|
-
|
|
1141
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
1142
|
-
_ = t.add_column(pos=pxt.StringType(nullable=True))
|
|
1143
|
-
assert 'is reserved' in str(exc_info.value).lower()
|
|
1144
|
-
|
|
1145
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
1146
|
-
_ = t.add_column(add2=pxt.IntType(nullable=False), type=pxt.StringType())
|
|
1147
|
-
assert '"type" is redundant' in str(exc_info.value).lower()
|
|
1148
|
-
|
|
1149
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
1150
|
-
_ = t.add_column(add2=[[1.0, 2.0], [3.0, 4.0]], type=pxt.StringType())
|
|
1151
|
-
assert '"type" is redundant' in str(exc_info.value).lower()
|
|
1152
|
-
|
|
1153
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
1154
|
-
_ = t.add_column(add2=pxt.IntType(nullable=False), stored=False)
|
|
1155
|
-
assert 'stored=false only applies' in str(exc_info.value).lower()
|
|
1156
|
-
|
|
1157
|
-
# duplicate name
|
|
1158
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
1159
|
-
_ = t.add_column(c1=pxt.IntType())
|
|
1160
|
-
assert 'duplicate column name' in str(exc_info.value).lower()
|
|
1161
|
-
|
|
1162
|
-
# 'stored' kwarg only applies to computed image columns
|
|
1163
|
-
with pytest.raises(excs.Error):
|
|
1164
|
-
_ = t.add_column(c5=IntType(), stored=False)
|
|
1165
|
-
with pytest.raises(excs.Error):
|
|
1166
|
-
_ = t.add_column(c5=ImageType(), stored=False)
|
|
1167
|
-
with pytest.raises(excs.Error):
|
|
1168
|
-
_ = t.add_column(c5=(t.c2 + t.c3), stored=False)
|
|
1169
|
-
|
|
1170
|
-
# make sure this is still true after reloading the metadata
|
|
1171
|
-
cl = pxt.Client(reload=True)
|
|
1172
|
-
t = cl.get_table(t.get_name())
|
|
1173
|
-
assert len(t.columns()) == num_orig_cols + 1
|
|
1174
|
-
|
|
1175
|
-
# revert() works
|
|
1176
|
-
t.revert()
|
|
1177
|
-
assert len(t.columns()) == num_orig_cols
|
|
1178
|
-
|
|
1179
|
-
# make sure this is still true after reloading the metadata once more
|
|
1180
|
-
cl = pxt.Client(reload=True)
|
|
1181
|
-
t = cl.get_table(t.get_name())
|
|
1182
|
-
assert len(t.columns()) == num_orig_cols
|
|
1183
|
-
|
|
1184
|
-
def test_add_column_setitem(self, test_tbl: catalog.Table) -> None:
|
|
1185
|
-
t = test_tbl
|
|
1186
|
-
num_orig_cols = len(t.columns())
|
|
1187
|
-
t['add1'] = pxt.IntType(nullable=True)
|
|
1188
|
-
assert len(t.columns()) == num_orig_cols + 1
|
|
1189
|
-
t['computed1'] = t.c2 + 1
|
|
1190
|
-
assert len(t.columns()) == num_orig_cols + 2
|
|
1191
|
-
|
|
1192
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
1193
|
-
_ = t['pos'] = pxt.StringType()
|
|
1194
|
-
assert 'is reserved' in str(exc_info.value).lower()
|
|
1195
|
-
|
|
1196
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
1197
|
-
_ = t[2] = pxt.StringType()
|
|
1198
|
-
assert 'must be a string' in str(exc_info.value).lower()
|
|
1199
|
-
|
|
1200
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
1201
|
-
_ = t['add 2'] = pxt.StringType()
|
|
1202
|
-
assert 'invalid column name' in str(exc_info.value).lower()
|
|
1203
|
-
|
|
1204
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
1205
|
-
_ = t['add2'] = {'value': t.c2 + 1, 'type': pxt.StringType()}
|
|
1206
|
-
assert '"type" is redundant' in str(exc_info.value).lower()
|
|
1207
|
-
|
|
1208
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
1209
|
-
_ = t['add2'] = {'value': pxt.IntType()}
|
|
1210
|
-
assert 'value needs to be either' in str(exc_info.value).lower()
|
|
1211
|
-
|
|
1212
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
1213
|
-
_ = t['add2'] = {'value': t.c2 + 1, 'stored': False}
|
|
1214
|
-
assert 'stored=false only applies' in str(exc_info.value).lower()
|
|
1215
|
-
|
|
1216
|
-
# duplicate name
|
|
1217
|
-
with pytest.raises(excs.Error) as exc_info:
|
|
1218
|
-
_ = t['c1'] = pxt.IntType()
|
|
1219
|
-
assert 'duplicate column name' in str(exc_info.value).lower()
|
|
1220
|
-
|
|
1221
|
-
# make sure this is still true after reloading the metadata
|
|
1222
|
-
cl = pxt.Client(reload=True)
|
|
1223
|
-
t = cl.get_table(t.get_name())
|
|
1224
|
-
assert len(t.columns()) == num_orig_cols + 2
|
|
1225
|
-
|
|
1226
|
-
# revert() works
|
|
1227
|
-
t.revert()
|
|
1228
|
-
t.revert()
|
|
1229
|
-
assert len(t.columns()) == num_orig_cols
|
|
1230
|
-
|
|
1231
|
-
# make sure this is still true after reloading the metadata once more
|
|
1232
|
-
cl = pxt.Client(reload=True)
|
|
1233
|
-
t = cl.get_table(t.get_name())
|
|
1234
|
-
assert len(t.columns()) == num_orig_cols
|
|
1235
|
-
|
|
1236
|
-
def test_drop_column(self, test_tbl: catalog.Table) -> None:
|
|
1237
|
-
t = test_tbl
|
|
1238
|
-
num_orig_cols = len(t.columns())
|
|
1239
|
-
t.drop_column('c1')
|
|
1240
|
-
assert len(t.columns()) == num_orig_cols - 1
|
|
1241
|
-
|
|
1242
|
-
with pytest.raises(excs.Error):
|
|
1243
|
-
t.drop_column('unknown')
|
|
1244
|
-
|
|
1245
|
-
# make sure this is still true after reloading the metadata
|
|
1246
|
-
cl = pxt.Client(reload=True)
|
|
1247
|
-
t = cl.get_table(t.get_name())
|
|
1248
|
-
assert len(t.columns()) == num_orig_cols - 1
|
|
1249
|
-
|
|
1250
|
-
# revert() works
|
|
1251
|
-
t.revert()
|
|
1252
|
-
assert len(t.columns()) == num_orig_cols
|
|
1253
|
-
|
|
1254
|
-
# make sure this is still true after reloading the metadata once more
|
|
1255
|
-
cl = pxt.Client(reload=True)
|
|
1256
|
-
t = cl.get_table(t.get_name())
|
|
1257
|
-
assert len(t.columns()) == num_orig_cols
|
|
1258
|
-
|
|
1259
|
-
def test_rename_column(self, test_tbl: catalog.Table) -> None:
    """Check rename_column(): success path, rejected renames, reload and revert().

    A rename must keep the column count unchanged, make the new name selectable
    and the old name unavailable; the same must hold after reloading the
    metadata, and revert() must bring the old name back.
    """
    tbl = test_tbl
    initial_count = len(tbl.columns())
    tbl.rename_column('c1', 'c1_renamed')
    assert len(tbl.columns()) == initial_count

    def check_rename(target: pxt.Table, known: str, unknown: str) -> None:
        # referencing the name that should be gone raises AttributeError ...
        with pytest.raises(AttributeError) as exc_info:
            target.select(target[unknown]).collect()
        assert 'unknown' in str(exc_info.value).lower()
        # ... while the name that should exist can be selected
        target.select(target[known]).collect()

    check_rename(tbl, 'c1_renamed', 'c1')

    # renaming a column that does not exist
    with pytest.raises(excs.Error):
        tbl.rename_column('unknown', 'unknown_renamed')
    # new name is not a valid column name
    with pytest.raises(excs.Error):
        tbl.rename_column('c2', 'bad name')
    # new name is already taken by another column
    with pytest.raises(excs.Error):
        tbl.rename_column('c2', 'c3')

    # the rename has to be visible after the metadata is reloaded
    client = pxt.Client(reload=True)
    tbl = client.get_table(tbl.get_name())
    check_rename(tbl, 'c1_renamed', 'c1')

    # revert() restores the original column name
    tbl.select(tbl.c1_renamed).collect()
    tbl.revert()
    tbl.select(tbl.c1).collect()
    # NOTE(review): the stricter check was disabled in the original at this point:
    # check_rename(tbl, 'c1', 'c1_renamed')

    # after one more metadata reload the full check is applied again
    client = pxt.Client(reload=True)
    tbl = client.get_table(tbl.get_name())
    check_rename(tbl, 'c1', 'c1_renamed')
|
|
1298
|
-
|
|
1299
|
-
def test_add_computed_column(self, test_tbl: catalog.Table) -> None:
    """Check add_column() with computed expressions and its exception reporting.

    Covers a computed column that evaluates cleanly, one whose addition raises
    excs.Error, one that records a single per-row exception, and one built from
    nested self.f1/self.f2 calls that records ten exceptions.
    """
    tbl = test_tbl

    outcome = tbl.add_column(add1=tbl.c2 + 10)
    assert outcome.num_excs == 0
    tbl.show()

    # exception raised in SQL: adding the column fails as a whole
    with pytest.raises(excs.Error):
        tbl.add_column(add2=(tbl.c2 - 10) / (tbl.c3 - 10))

    # exception raised in Python for c6.f2 == 10: recorded per row instead
    outcome = tbl.add_column(add2=(tbl.c6.f2 - 10) / (tbl.c6.f2 - 10))
    assert outcome.num_excs == 1
    # `!= None` is deliberate: the comparison is used as a table filter, so it is
    # presumably overloaded and `is not None` would not be equivalent
    failed_rows = tbl[tbl.add2.errortype != None]
    result = failed_rows[tbl.c6.f2, tbl.add2, tbl.add2.errortype, tbl.add2.errormsg].show()
    assert len(result) == 1

    # exceptions in dependencies prevent execution of dependent exprs
    outcome = tbl.add_column(add3=self.f2(self.f1(tbl.c2)))
    assert outcome.num_excs == 10
    failed_rows = tbl[tbl.add3.errortype != None]
    result = failed_rows[tbl.c2, tbl.add3, tbl.add3.errortype, tbl.add3.errormsg].show()
    assert len(result) == 10
|
|
1320
|
-
|
|
1321
|
-
def test_describe(self, test_tbl: catalog.Table) -> None:
    """Smoke-test describe(), repr() and _repr_html_() on a table.

    The table gets a computed array column and, for the second describe() call,
    a comment. Only successful execution is checked, not the produced output.
    """
    tbl = test_tbl
    array_type = ArrayType((3, 4), dtype=IntType())
    tbl.add_column(computed1=lambda c2: np.full((3, 4), c2), type=array_type)
    tbl.describe()

    # describe() once more, now with a table comment set
    tbl.comment = 'This is a comment.'
    tbl.describe()

    # TODO: how do you check the output of these?
    repr(tbl)
    tbl._repr_html_()
|
|
1332
|
-
|
|
1333
|
-
def test_common_col_names(self, test_client: pxt.Client) -> None:
|
|
1334
|
-
"""Make sure that commonly used column names don't collide with Table member vars"""
|
|
1335
|
-
cl = test_client
|
|
1336
|
-
schema = {'id': IntType(nullable=False), 'name': StringType(nullable=False)}
|
|
1337
|
-
tbl = cl.create_table('test', schema)
|
|
1338
|
-
status = tbl.insert({'id': id, 'name': str(id)} for id in range(10))
|
|
1339
|
-
assert status.num_rows == 10
|
|
1340
|
-
assert status.num_excs == 0
|
|
1341
|
-
assert tbl.count() == 10
|
|
1342
|
-
# we can create references to those column via __getattr__
|
|
1343
|
-
_ = tbl.select(tbl.id, tbl.name).collect()
|