pixeltable 0.1.0__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (147) hide show
  1. pixeltable/__init__.py +34 -6
  2. pixeltable/catalog/__init__.py +13 -0
  3. pixeltable/catalog/catalog.py +159 -0
  4. pixeltable/catalog/column.py +200 -0
  5. pixeltable/catalog/dir.py +32 -0
  6. pixeltable/catalog/globals.py +33 -0
  7. pixeltable/catalog/insertable_table.py +191 -0
  8. pixeltable/catalog/named_function.py +36 -0
  9. pixeltable/catalog/path.py +58 -0
  10. pixeltable/catalog/path_dict.py +139 -0
  11. pixeltable/catalog/schema_object.py +39 -0
  12. pixeltable/catalog/table.py +581 -0
  13. pixeltable/catalog/table_version.py +749 -0
  14. pixeltable/catalog/table_version_path.py +133 -0
  15. pixeltable/catalog/view.py +203 -0
  16. pixeltable/client.py +590 -30
  17. pixeltable/dataframe.py +540 -349
  18. pixeltable/env.py +359 -45
  19. pixeltable/exceptions.py +12 -21
  20. pixeltable/exec/__init__.py +9 -0
  21. pixeltable/exec/aggregation_node.py +78 -0
  22. pixeltable/exec/cache_prefetch_node.py +116 -0
  23. pixeltable/exec/component_iteration_node.py +79 -0
  24. pixeltable/exec/data_row_batch.py +95 -0
  25. pixeltable/exec/exec_context.py +22 -0
  26. pixeltable/exec/exec_node.py +61 -0
  27. pixeltable/exec/expr_eval_node.py +217 -0
  28. pixeltable/exec/in_memory_data_node.py +69 -0
  29. pixeltable/exec/media_validation_node.py +43 -0
  30. pixeltable/exec/sql_scan_node.py +225 -0
  31. pixeltable/exprs/__init__.py +24 -0
  32. pixeltable/exprs/arithmetic_expr.py +102 -0
  33. pixeltable/exprs/array_slice.py +71 -0
  34. pixeltable/exprs/column_property_ref.py +77 -0
  35. pixeltable/exprs/column_ref.py +105 -0
  36. pixeltable/exprs/comparison.py +77 -0
  37. pixeltable/exprs/compound_predicate.py +98 -0
  38. pixeltable/exprs/data_row.py +195 -0
  39. pixeltable/exprs/expr.py +586 -0
  40. pixeltable/exprs/expr_set.py +39 -0
  41. pixeltable/exprs/function_call.py +380 -0
  42. pixeltable/exprs/globals.py +69 -0
  43. pixeltable/exprs/image_member_access.py +115 -0
  44. pixeltable/exprs/image_similarity_predicate.py +58 -0
  45. pixeltable/exprs/inline_array.py +107 -0
  46. pixeltable/exprs/inline_dict.py +101 -0
  47. pixeltable/exprs/is_null.py +38 -0
  48. pixeltable/exprs/json_mapper.py +121 -0
  49. pixeltable/exprs/json_path.py +159 -0
  50. pixeltable/exprs/literal.py +54 -0
  51. pixeltable/exprs/object_ref.py +41 -0
  52. pixeltable/exprs/predicate.py +44 -0
  53. pixeltable/exprs/row_builder.py +355 -0
  54. pixeltable/exprs/rowid_ref.py +94 -0
  55. pixeltable/exprs/type_cast.py +53 -0
  56. pixeltable/exprs/variable.py +45 -0
  57. pixeltable/func/__init__.py +9 -0
  58. pixeltable/func/aggregate_function.py +194 -0
  59. pixeltable/func/batched_function.py +53 -0
  60. pixeltable/func/callable_function.py +69 -0
  61. pixeltable/func/expr_template_function.py +82 -0
  62. pixeltable/func/function.py +110 -0
  63. pixeltable/func/function_registry.py +227 -0
  64. pixeltable/func/globals.py +36 -0
  65. pixeltable/func/nos_function.py +202 -0
  66. pixeltable/func/signature.py +166 -0
  67. pixeltable/func/udf.py +163 -0
  68. pixeltable/functions/__init__.py +52 -103
  69. pixeltable/functions/eval.py +216 -0
  70. pixeltable/functions/fireworks.py +34 -0
  71. pixeltable/functions/huggingface.py +120 -0
  72. pixeltable/functions/image.py +16 -0
  73. pixeltable/functions/openai.py +256 -0
  74. pixeltable/functions/pil/image.py +148 -7
  75. pixeltable/functions/string.py +13 -0
  76. pixeltable/functions/together.py +122 -0
  77. pixeltable/functions/util.py +41 -0
  78. pixeltable/functions/video.py +62 -0
  79. pixeltable/iterators/__init__.py +3 -0
  80. pixeltable/iterators/base.py +48 -0
  81. pixeltable/iterators/document.py +311 -0
  82. pixeltable/iterators/video.py +89 -0
  83. pixeltable/metadata/__init__.py +54 -0
  84. pixeltable/metadata/converters/convert_10.py +18 -0
  85. pixeltable/metadata/schema.py +211 -0
  86. pixeltable/plan.py +656 -0
  87. pixeltable/store.py +418 -182
  88. pixeltable/tests/conftest.py +146 -88
  89. pixeltable/tests/functions/test_fireworks.py +42 -0
  90. pixeltable/tests/functions/test_functions.py +60 -0
  91. pixeltable/tests/functions/test_huggingface.py +158 -0
  92. pixeltable/tests/functions/test_openai.py +152 -0
  93. pixeltable/tests/functions/test_together.py +111 -0
  94. pixeltable/tests/test_audio.py +65 -0
  95. pixeltable/tests/test_catalog.py +27 -0
  96. pixeltable/tests/test_client.py +14 -14
  97. pixeltable/tests/test_component_view.py +370 -0
  98. pixeltable/tests/test_dataframe.py +439 -0
  99. pixeltable/tests/test_dirs.py +78 -62
  100. pixeltable/tests/test_document.py +120 -0
  101. pixeltable/tests/test_exprs.py +592 -135
  102. pixeltable/tests/test_function.py +297 -67
  103. pixeltable/tests/test_migration.py +43 -0
  104. pixeltable/tests/test_nos.py +54 -0
  105. pixeltable/tests/test_snapshot.py +208 -0
  106. pixeltable/tests/test_table.py +1195 -263
  107. pixeltable/tests/test_transactional_directory.py +42 -0
  108. pixeltable/tests/test_types.py +5 -11
  109. pixeltable/tests/test_video.py +151 -34
  110. pixeltable/tests/test_view.py +530 -0
  111. pixeltable/tests/utils.py +320 -45
  112. pixeltable/tool/create_test_db_dump.py +149 -0
  113. pixeltable/tool/create_test_video.py +81 -0
  114. pixeltable/type_system.py +445 -124
  115. pixeltable/utils/__init__.py +17 -46
  116. pixeltable/utils/arrow.py +98 -0
  117. pixeltable/utils/clip.py +12 -15
  118. pixeltable/utils/coco.py +136 -0
  119. pixeltable/utils/documents.py +39 -0
  120. pixeltable/utils/filecache.py +195 -0
  121. pixeltable/utils/help.py +11 -0
  122. pixeltable/utils/hf_datasets.py +157 -0
  123. pixeltable/utils/media_store.py +76 -0
  124. pixeltable/utils/parquet.py +167 -0
  125. pixeltable/utils/pytorch.py +91 -0
  126. pixeltable/utils/s3.py +13 -0
  127. pixeltable/utils/sql.py +17 -0
  128. pixeltable/utils/transactional_directory.py +35 -0
  129. pixeltable-0.2.4.dist-info/LICENSE +18 -0
  130. pixeltable-0.2.4.dist-info/METADATA +127 -0
  131. pixeltable-0.2.4.dist-info/RECORD +132 -0
  132. {pixeltable-0.1.0.dist-info → pixeltable-0.2.4.dist-info}/WHEEL +1 -1
  133. pixeltable/catalog.py +0 -1421
  134. pixeltable/exprs.py +0 -1745
  135. pixeltable/function.py +0 -269
  136. pixeltable/functions/clip.py +0 -10
  137. pixeltable/functions/pil/__init__.py +0 -23
  138. pixeltable/functions/tf.py +0 -21
  139. pixeltable/index.py +0 -57
  140. pixeltable/tests/test_dict.py +0 -24
  141. pixeltable/tests/test_functions.py +0 -11
  142. pixeltable/tests/test_tf.py +0 -69
  143. pixeltable/tf.py +0 -33
  144. pixeltable/utils/tf.py +0 -33
  145. pixeltable/utils/video.py +0 -32
  146. pixeltable-0.1.0.dist-info/METADATA +0 -34
  147. pixeltable-0.1.0.dist-info/RECORD +0 -36
@@ -1,191 +1,853 @@
1
+ import datetime
2
+ import math
3
+ import os
4
+ import random
5
+ from typing import List, Tuple
6
+
7
+ import PIL
8
+ import cv2
9
+ import numpy as np
1
10
  import pandas as pd
11
+ import pathlib
2
12
  import pytest
3
- import math
4
13
 
5
- import pixeltable as pt
6
- from pixeltable import exceptions as exc
14
+ import pixeltable as pxt
15
+ import pixeltable.functions as ptf
7
16
  from pixeltable import catalog
17
+ from pixeltable import exceptions as excs
18
+ from pixeltable.iterators import FrameIterator
19
+ from pixeltable.tests.utils import \
20
+ make_tbl, create_table_data, read_data_file, get_video_files, get_audio_files, get_image_files, get_documents, \
21
+ assert_resultset_eq, assert_hf_dataset_equal, make_test_arrow_table
22
+ from pixeltable.tests.utils import skip_test_if_not_installed
8
23
  from pixeltable.type_system import \
9
- StringType, IntType, FloatType, TimestampType, ImageType, VideoType, JsonType, BoolType
10
- from pixeltable.tests.utils import make_tbl, create_table_data, read_data_file, get_video_files, sum_uda
11
- from pixeltable.functions import make_video
12
- from pixeltable import utils
13
-
24
+ StringType, IntType, FloatType, TimestampType, ImageType, VideoType, JsonType, BoolType, ArrayType, AudioType, \
25
+ DocumentType
26
+ from pixeltable.utils.filecache import FileCache
27
+ from pixeltable.utils.media_store import MediaStore
14
28
 
15
29
  class TestTable:
16
- def test_create(self, test_db: catalog.Db) -> None:
17
- db = test_db
18
- db.create_dir('dir1')
19
- c1 = catalog.Column('c1', StringType(), nullable=False)
20
- c2 = catalog.Column('c2', IntType(), nullable=False)
21
- c3 = catalog.Column('c3', FloatType(), nullable=False)
22
- c4 = catalog.Column('c4', TimestampType(), nullable=False)
23
- schema = [c1, c2, c3, c4]
24
- _ = db.create_table('test', schema)
25
- _ = db.create_table('dir1.test', schema)
26
-
27
- with pytest.raises(exc.BadFormatError):
28
- _ = db.create_table('1test', schema)
29
- with pytest.raises(exc.BadFormatError):
30
+ # exc for a % 10 == 0
31
+ @pxt.udf(return_type=FloatType(), param_types=[IntType()])
32
+ def f1(a: int) -> float:
33
+ return a / (a % 10)
34
+
35
+ # exception for a == None; this should not get triggered
36
+ @pxt.udf(return_type=FloatType(), param_types=[FloatType()])
37
+ def f2(a: float) -> float:
38
+ return a + 1
39
+
40
+ @pxt.expr_udf(param_types=[IntType(nullable=False)])
41
+ def add1(a: int) -> int:
42
+ return a + 1
43
+
44
+ def test_create(self, test_client: pxt.Client) -> None:
45
+ cl = test_client
46
+ cl.create_dir('dir1')
47
+ schema = {
48
+ 'c1': StringType(nullable=False),
49
+ 'c2': IntType(nullable=False),
50
+ 'c3': FloatType(nullable=False),
51
+ 'c4': TimestampType(nullable=False),
52
+ }
53
+ tbl = cl.create_table('test', schema)
54
+ _ = cl.create_table('dir1.test', schema)
55
+
56
+ with pytest.raises(excs.Error):
57
+ _ = cl.create_table('1test', schema)
58
+ with pytest.raises(excs.Error):
30
59
  _ = catalog.Column('1c', StringType())
31
- with pytest.raises(exc.DuplicateNameError):
32
- _ = db.create_table('test2', [c1, c1])
33
- with pytest.raises(exc.DuplicateNameError):
34
- _ = db.create_table('test', schema)
35
- with pytest.raises(exc.DuplicateNameError):
36
- _ = db.create_table('test2', [c1, c1])
37
- with pytest.raises(exc.UnknownEntityError):
38
- _ = db.create_table('dir2.test2', schema)
39
-
40
- _ = db.list_tables()
41
- _ = db.list_tables('dir1')
42
-
43
- with pytest.raises(exc.BadFormatError):
44
- _ = db.list_tables('1dir')
45
- with pytest.raises(exc.UnknownEntityError):
46
- _ = db.list_tables('dir2')
60
+ with pytest.raises(excs.Error):
61
+ _ = cl.create_table('test', schema)
62
+ with pytest.raises(excs.Error):
63
+ _ = cl.create_table('dir2.test2', schema)
64
+
65
+ _ = cl.list_tables()
66
+ _ = cl.list_tables('dir1')
67
+
68
+ with pytest.raises(excs.Error):
69
+ _ = cl.list_tables('1dir')
70
+ with pytest.raises(excs.Error):
71
+ _ = cl.list_tables('dir2')
47
72
 
48
73
  # test loading with new client
49
- cl2 = pt.Client()
50
- db = cl2.get_db('test')
74
+ cl = pxt.Client(reload=True)
51
75
 
52
- tbl = db.get_table('test')
53
- assert isinstance(tbl, catalog.MutableTable)
54
- tbl.add_column(catalog.Column('c5', IntType()))
76
+ tbl = cl.get_table('test')
77
+ assert isinstance(tbl, catalog.InsertableTable)
78
+ tbl.add_column(c5=IntType())
55
79
  tbl.drop_column('c1')
56
80
  tbl.rename_column('c2', 'c17')
57
81
 
58
- db.rename_table('test', 'test2')
82
+ cl.move('test', 'test2')
83
+
84
+ cl.drop_table('test2')
85
+ cl.drop_table('dir1.test')
86
+
87
+ with pytest.raises(excs.Error):
88
+ cl.drop_table('test')
89
+ with pytest.raises(excs.Error):
90
+ cl.drop_table('dir1.test2')
91
+ with pytest.raises(excs.Error):
92
+ cl.drop_table('.test2')
59
93
 
60
- db.drop_table('test2')
61
- db.drop_table('dir1.test')
94
+ def test_empty_table(self, test_client: pxt.Client) -> None:
95
+ cl = test_client
96
+ with pytest.raises(excs.Error) as exc_info:
97
+ cl.create_table('empty_table', {})
98
+ assert 'Table schema is empty' in str(exc_info.value)
62
99
 
63
- with pytest.raises(exc.UnknownEntityError):
64
- db.drop_table('test')
65
- with pytest.raises(exc.UnknownEntityError):
66
- db.drop_table('dir1.test2')
67
- with pytest.raises(exc.BadFormatError):
68
- db.drop_table('.test2')
100
+ def test_table_attrs(self, test_client: pxt.Client) -> None:
101
+ cl = test_client
102
+ schema = {'c': StringType(nullable=False)}
103
+ num_retained_versions = 20
104
+ comment = "This is a table."
105
+ tbl = cl.create_table('test_table_attrs', schema, num_retained_versions=num_retained_versions, comment=comment)
106
+ assert tbl.num_retained_versions == num_retained_versions
107
+ assert tbl.comment == comment
108
+ new_num_retained_versions = 30
109
+ new_comment = "This is an updated table."
110
+ tbl.num_retained_versions = new_num_retained_versions
111
+ assert tbl.num_retained_versions == new_num_retained_versions
112
+ tbl.comment = new_comment
113
+ assert tbl.comment == new_comment
114
+ tbl.revert()
115
+ assert tbl.comment == comment
116
+ tbl.revert()
117
+ assert tbl.num_retained_versions == num_retained_versions
118
+
119
+ def test_import_parquet(self, test_client: pxt.Client, tmp_path: pathlib.Path) -> None:
120
+ skip_test_if_not_installed('pyarrow')
121
+ import pyarrow as pa
122
+ from pixeltable.utils.arrow import iter_tuples
123
+
124
+ parquet_dir = tmp_path / 'test_data'
125
+ parquet_dir.mkdir()
126
+ make_test_arrow_table(parquet_dir)
127
+
128
+ tab = test_client.import_parquet('test_parquet', parquet_path=str(parquet_dir))
129
+ assert 'test_parquet' in test_client.list_tables()
130
+ assert tab is not None
131
+ num_elts = tab.count()
132
+ arrow_tab: pa.Table = pa.parquet.read_table(str(parquet_dir))
133
+ assert num_elts == arrow_tab.num_rows
134
+ assert set(tab.column_names()) == set(arrow_tab.column_names)
135
+
136
+ result_set = tab.order_by(tab.c_id).collect()
137
+ column_types = tab.column_types()
69
138
 
70
- def test_create_images(self, test_db: catalog.Db) -> None:
71
- db = test_db
72
- cols = [
73
- catalog.Column('img', ImageType(), nullable=False),
74
- catalog.Column('category', StringType(), nullable=False),
75
- catalog.Column('split', StringType(), nullable=False),
139
+ for tup, arrow_tup in zip(result_set, iter_tuples(arrow_tab)):
140
+ assert tup['c_id'] == arrow_tup['c_id']
141
+ for col, val in tup.items():
142
+ if val is None:
143
+ assert arrow_tup[col] is None
144
+ continue
145
+
146
+ if column_types[col].is_array_type():
147
+ assert (val == arrow_tup[col]).all()
148
+ else:
149
+ assert val == arrow_tup[col]
150
+
151
+ def test_import_huggingface_dataset(self, test_client: pxt.Client, tmp_path: pathlib.Path) -> None:
152
+ skip_test_if_not_installed('datasets')
153
+ import datasets
154
+
155
+ test_cases = [
156
+ # { # includes a timestamp. 20MB for specific slice
157
+ # Disabled this test case because download is failing, and it's not critical.
158
+ # 'dataset_name': 'c4',
159
+ # # see https://huggingface.co/datasets/allenai/c4/blob/main/realnewslike/c4-train.00000-of-00512.json.gz
160
+ # 'dataset': datasets.load_dataset(
161
+ # "allenai/c4",
162
+ # data_dir="realnewslike",
163
+ # data_files="c4-train.00000-of-00512.json.gz",
164
+ # split='train[:1000]',
165
+ # cache_dir=tmp_path
166
+ # ),
167
+ # },
168
+ { # includes an embedding (array type), common in a few RAG datasets.
169
+ 'dataset_name': 'cohere_wikipedia',
170
+ 'dataset': datasets.load_dataset("Cohere/wikipedia-2023-11-embed-multilingual-v3",
171
+ data_dir='cr').select_columns(['url', 'title', 'text', 'emb']),
172
+ # column with name `_id` is not currently allowed by pixeltable rules,
173
+ # so filter out that column.
174
+ # cr subdir has a small number of rows, avoid running out of space in CI runner
175
+ # see https://huggingface.co/datasets/Cohere/wikipedia-2023-11-embed-multilingual-v3/tree/main/cr
176
+ 'schema_override': {'emb': ArrayType((1024,), dtype=FloatType(), nullable=False)}
177
+ },
178
+ # example of dataset dictionary with multiple splits
179
+ {
180
+ 'dataset_name': 'rotten_tomatoes',
181
+ 'dataset': datasets.load_dataset("rotten_tomatoes"),
182
+ },
76
183
  ]
77
- tbl = db.create_table('test', cols)
78
- df = read_data_file('imagenette2-160', 'manifest.csv', ['img'])
79
- # TODO: insert a random subset
80
- tbl.insert_pandas(df[:20])
81
- html_str = tbl.show(n=100)._repr_html_()
82
- print(html_str)
83
- # TODO: check html_str
84
-
85
- def test_create_video(self, test_db: catalog.Db) -> None:
86
- db = test_db
87
- cols = [
88
- catalog.Column('video', VideoType(), nullable=False),
89
- catalog.Column('frame', ImageType(), nullable=False),
90
- catalog.Column('frame_idx', IntType(), nullable=False),
184
+
185
+ # test a column name for splits other than the default of 'split'
186
+ split_column_name = 'my_split_col'
187
+ for rec in test_cases:
188
+ dataset_name = rec['dataset_name']
189
+ hf_dataset = rec['dataset']
190
+
191
+ tab = test_client.import_huggingface_dataset(
192
+ dataset_name,
193
+ hf_dataset,
194
+ column_name_for_split=split_column_name,
195
+ schema_override=rec.get('schema_override', None),
196
+ )
197
+ if isinstance(hf_dataset, datasets.Dataset):
198
+ assert_hf_dataset_equal(hf_dataset, tab.df(), split_column_name)
199
+ elif isinstance(hf_dataset, datasets.DatasetDict):
200
+ assert tab.count() == sum(hf_dataset.num_rows.values())
201
+ assert split_column_name in tab.column_names()
202
+
203
+ for dataset_name in hf_dataset:
204
+ df = tab.where(tab.my_split_col == dataset_name)
205
+ assert_hf_dataset_equal(hf_dataset[dataset_name], df, split_column_name)
206
+ else:
207
+ assert False
208
+
209
+ with pytest.raises(excs.Error) as exc_info:
210
+ test_client.import_huggingface_dataset('test', {})
211
+ assert 'type(dataset)' in str(exc_info.value)
212
+
213
+ def test_image_table(self, test_client: pxt.Client) -> None:
214
+ n_sample_rows = 20
215
+ cl = test_client
216
+ schema = {
217
+ 'img': ImageType(nullable=False),
218
+ 'category': StringType(nullable=False),
219
+ 'split': StringType(nullable=False),
220
+ 'img_literal': ImageType(nullable=False),
221
+ }
222
+ tbl = cl.create_table('test', schema)
223
+ assert(MediaStore.count(tbl.get_id()) == 0)
224
+
225
+ rows = read_data_file('imagenette2-160', 'manifest.csv', ['img'])
226
+ sample_rows = random.sample(rows, n_sample_rows)
227
+
228
+ # add literal image data and column
229
+ for r in rows:
230
+ with open(r['img'], 'rb') as f:
231
+ r['img_literal'] = f.read()
232
+
233
+ tbl.insert(sample_rows)
234
+ assert(MediaStore.count(tbl.get_id()) == n_sample_rows)
235
+
236
+ # compare img and img_literal
237
+ # TODO: make tbl.select(tbl.img == tbl.img_literal) work
238
+ tdf = tbl.select(tbl.img, tbl.img_literal).show()
239
+ pdf = tdf.to_pandas()
240
+ for tup in pdf.itertuples():
241
+ assert tup.img == tup.img_literal
242
+
243
+ # Test adding stored image transformation
244
+ tbl.add_column(rotated=tbl.img.rotate(30), stored=True)
245
+ assert(MediaStore.count(tbl.get_id()) == 2 * n_sample_rows)
246
+
247
+ # Test MediaStore.stats()
248
+ stats = list(filter(lambda x: x[0] == tbl.get_id(), MediaStore.stats()))
249
+ assert len(stats) == 2 # Two columns
250
+ assert stats[0][2] == n_sample_rows # Each column has n_sample_rows associated images
251
+ assert stats[1][2] == n_sample_rows
252
+
253
+ # Test that version-specific images are cleared when table is reverted
254
+ tbl.revert()
255
+ assert(MediaStore.count(tbl.get_id()) == n_sample_rows)
256
+
257
+ # Test that all stored images are cleared when table is dropped
258
+ cl.drop_table('test')
259
+ assert(MediaStore.count(tbl.get_id()) == 0)
260
+
261
+ def test_schema_spec(self, test_client: pxt.Client) -> None:
262
+ cl = test_client
263
+
264
+ with pytest.raises(excs.Error) as exc_info:
265
+ cl.create_table('test', {'c 1': IntType()})
266
+ assert 'invalid column name' in str(exc_info.value).lower()
267
+
268
+ with pytest.raises(excs.Error) as exc_info:
269
+ cl.create_table('test', {'c1': {}})
270
+ assert '"type" is required' in str(exc_info.value)
271
+
272
+ with pytest.raises(excs.Error) as exc_info:
273
+ cl.create_table('test', {'c1': {'xyz': IntType()}})
274
+ assert "invalid key 'xyz'" in str(exc_info.value)
275
+
276
+ with pytest.raises(excs.Error) as exc_info:
277
+ cl.create_table('test', {'c1': {'stored': True}})
278
+ assert '"type" is required' in str(exc_info.value)
279
+
280
+ with pytest.raises(excs.Error) as exc_info:
281
+ cl.create_table('test', {'c1': {'type': 'string'}})
282
+ assert 'must be a ColumnType' in str(exc_info.value)
283
+
284
+ with pytest.raises(excs.Error) as exc_info:
285
+ cl.create_table('test', {'c1': {'value': 1, 'type': StringType()}})
286
+ assert '"type" is redundant' in str(exc_info.value)
287
+
288
+ with pytest.raises(excs.Error) as exc_info:
289
+ cl.create_table('test', {'c1': {'value': pytest}})
290
+ assert 'value needs to be either' in str(exc_info.value)
291
+
292
+ with pytest.raises(excs.Error) as exc_info:
293
+ def f() -> float:
294
+ return 1.0
295
+ cl.create_table('test', {'c1': {'value': f}})
296
+ assert '"type" is required' in str(exc_info.value)
297
+
298
+ with pytest.raises(excs.Error) as exc_info:
299
+ cl.create_table('test', {'c1': {'type': StringType(), 'stored': 'true'}})
300
+ assert '"stored" must be a bool' in str(exc_info.value)
301
+
302
+ with pytest.raises(excs.Error) as exc_info:
303
+ cl.create_table('test', {'c1': {'type': StringType(), 'indexed': 'true'}})
304
+ assert '"indexed" must be a bool' in str(exc_info.value)
305
+
306
+ with pytest.raises(excs.Error) as exc_info:
307
+ cl.create_table('test', {'c1': StringType()}, primary_key='c2')
308
+ assert 'primary key column c2 not found' in str(exc_info.value).lower()
309
+
310
+ with pytest.raises(excs.Error) as exc_info:
311
+ cl.create_table('test', {'c1': StringType()}, primary_key=['c1', 'c2'])
312
+ assert 'primary key column c2 not found' in str(exc_info.value).lower()
313
+
314
+ with pytest.raises(excs.Error) as exc_info:
315
+ cl.create_table('test', {'c1': StringType()}, primary_key=['c2'])
316
+ assert 'primary key column c2 not found' in str(exc_info.value).lower()
317
+
318
+ with pytest.raises(excs.Error) as exc_info:
319
+ cl.create_table('test', {'c1': StringType()}, primary_key=0)
320
+ assert 'primary_key must be a' in str(exc_info.value).lower()
321
+
322
+ with pytest.raises(excs.Error) as exc_info:
323
+ cl.create_table('test', {'c1': StringType(nullable=True)}, primary_key='c1')
324
+ assert 'cannot be nullable' in str(exc_info.value).lower()
325
+
326
+ def check_bad_media(
327
+ self, test_client: pxt.Client, rows: List[Tuple[str, bool]], col_type: pxt.ColumnType,
328
+ validate_local_path: bool = True
329
+ ) -> None:
330
+ schema = {
331
+ 'media': col_type,
332
+ 'is_bad_media': BoolType(nullable=False),
333
+ }
334
+ tbl = test_client.create_table('test', schema)
335
+
336
+ assert len(rows) > 0
337
+ total_bad_rows = sum([int(row['is_bad_media']) for row in rows])
338
+ assert total_bad_rows > 0
339
+
340
+ # Mode 1: Validation error on bad input (default)
341
+ # we ignore the exact error here, because it depends on the media type
342
+ with pytest.raises(excs.Error):
343
+ tbl.insert(rows, fail_on_exception=True)
344
+
345
+ # Mode 2: fail_on_exception=False, store error information in table
346
+ status = tbl.insert(rows, fail_on_exception=False)
347
+ _ = tbl.select(tbl.media, tbl.media.errormsg).show()
348
+ assert status.num_rows == len(rows)
349
+ assert status.num_excs == total_bad_rows
350
+
351
+ # check that we have the right number of bad and good rows
352
+ assert tbl.where(tbl.is_bad_media == True).count() == total_bad_rows
353
+ assert tbl.where(tbl.is_bad_media == False).count() == len(rows) - total_bad_rows
354
+
355
+ # check error type is set correctly
356
+ assert tbl.where((tbl.is_bad_media == True) & (tbl.media.errortype == None)).count() == 0
357
+ assert tbl.where((tbl.is_bad_media == False) & (tbl.media.errortype == None)).count() \
358
+ == len(rows) - total_bad_rows
359
+
360
+ # check fileurl is set for valid images, and check no file url is set for bad images
361
+ assert tbl.where((tbl.is_bad_media == False) & (tbl.media.fileurl == None)).count() == 0
362
+ assert tbl.where((tbl.is_bad_media == True) & (tbl.media.fileurl != None)).count() == 0
363
+
364
+ if validate_local_path:
365
+ # check that tbl.media is a valid local path
366
+ paths = tbl.where(tbl.media != None).select(output=tbl.media).collect()['output']
367
+ for path in paths:
368
+ assert os.path.exists(path) and os.path.isfile(path)
369
+
370
+ def test_validate_image(self, test_client: pxt.Client) -> None:
371
+ rows = read_data_file('imagenette2-160', 'manifest_bad.csv', ['img'])
372
+ rows = [{'media': r['img'], 'is_bad_media': r['is_bad_image']} for r in rows]
373
+ self.check_bad_media(test_client, rows, ImageType(nullable=True), validate_local_path=False)
374
+
375
+ def test_validate_video(self, test_client: pxt.Client) -> None:
376
+ files = get_video_files(include_bad_video=True)
377
+ rows = [{'media': f, 'is_bad_media': f.endswith('bad_video.mp4')} for f in files]
378
+ self.check_bad_media(test_client, rows, VideoType(nullable=True))
379
+
380
+ def test_validate_audio(self, test_client: pxt.Client) -> None:
381
+ files = get_audio_files(include_bad_audio=True)
382
+ rows = [{'media': f, 'is_bad_media': f.endswith('bad_audio.mp3')} for f in files]
383
+ self.check_bad_media(test_client, rows, AudioType(nullable=True))
384
+
385
+ def test_validate_docs(self, test_client: pxt.Client) -> None:
386
+ valid_doc_paths = get_documents()
387
+ invalid_doc_paths = [get_video_files()[0], get_audio_files()[0], get_image_files()[0]]
388
+ doc_paths = valid_doc_paths + invalid_doc_paths
389
+ is_valid = [True] * len(valid_doc_paths) + [False] * len(invalid_doc_paths)
390
+ rows = [{'media': f, 'is_bad_media': not is_valid} for f, is_valid in zip(doc_paths, is_valid)]
391
+ self.check_bad_media(test_client, rows, DocumentType(nullable=True))
392
+
393
+ def test_validate_external_url(self, test_client: pxt.Client) -> None:
394
+ skip_test_if_not_installed('boto3')
395
+ rows = [
396
+ {'media': 's3://open-images-dataset/validation/doesnotexist.jpg', 'is_bad_media': True},
397
+ {'media': 'https://archive.random.org/download?file=2024-01-28.bin', 'is_bad_media': True}, # 403 error
398
+ {'media': 's3://open-images-dataset/validation/3c02ca9ec9b2b77b.jpg', 'is_bad_media': True}, # wrong media
399
+ # test s3 url
400
+ {
401
+ 'media': 's3://multimedia-commons/data/videos/mp4/ffe/ff3/ffeff3c6bf57504e7a6cecaff6aefbc9.mp4',
402
+ 'is_bad_media': False
403
+ },
404
+ # test http url
405
+ {
406
+ 'media': 'https://github.com/pixeltable/pixeltable/raw/master/pixeltable/tests/data/videos/bangkok.mp4',
407
+ 'is_bad_media': False
408
+ },
409
+
410
+ ]
411
+ self.check_bad_media(test_client, rows, VideoType(nullable=True))
412
+
413
+ def test_create_s3_image_table(self, test_client: pxt.Client) -> None:
414
+ skip_test_if_not_installed('boto3')
415
+ cl = test_client
416
+ tbl = cl.create_table('test', {'img': ImageType(nullable=False)})
417
+ # this is needed because Client.reset_catalog() doesn't call TableVersion.drop(), which would
418
+ # clear the file cache
419
+ # TODO: change reset_catalog() to drop tables
420
+ FileCache.get().clear()
421
+ cache_stats = FileCache.get().stats()
422
+ assert cache_stats.num_requests == 0, f'{str(cache_stats)} tbl_id={tbl.get_id()}'
423
+ # add computed column to make sure that external files are cached locally during insert
424
+ tbl.add_column(rotated=tbl.img.rotate(30), stored=True)
425
+ urls = [
426
+ 's3://open-images-dataset/validation/3c02ca9ec9b2b77b.jpg',
427
+ 's3://open-images-dataset/validation/3c13e0015b6c3bcf.jpg',
428
+ 's3://open-images-dataset/validation/3ba5380490084697.jpg',
429
+ 's3://open-images-dataset/validation/3afeb4b34f90c0cf.jpg',
430
+ 's3://open-images-dataset/validation/3b07a2c0d5c0c789.jpg',
91
431
  ]
92
- tbl = db.create_table(
93
- 'test', cols, extract_frames_from='video', extracted_frame_col='frame',
94
- extracted_frame_idx_col='frame_idx', extracted_fps=0)
95
- params = tbl.parameters
432
+
433
+ tbl.insert({'img': url} for url in urls)
434
+ # check that we populated the cache
435
+ cache_stats = FileCache.get().stats()
436
+ assert cache_stats.num_requests == len(urls), f'{str(cache_stats)} tbl_id={tbl.get_id()}'
437
+ assert cache_stats.num_hits == 0
438
+ assert FileCache.get().num_files() == len(urls)
439
+ assert FileCache.get().num_files(tbl.get_id()) == len(urls)
440
+ assert FileCache.get().avg_file_size() > 0
441
+
442
+ # query: we read from the cache
443
+ _ = tbl.show(0)
444
+ cache_stats = FileCache.get().stats()
445
+ assert cache_stats.num_requests == 2 * len(urls)
446
+ assert cache_stats.num_hits == len(urls)
447
+
448
+ # after clearing the cache, we need to re-fetch the files
449
+ FileCache.get().clear()
450
+ _ = tbl.show(0)
451
+ cache_stats = FileCache.get().stats()
452
+ assert cache_stats.num_requests == len(urls)
453
+ assert cache_stats.num_hits == 0
454
+
455
+ # start with fresh client and FileCache instance to test FileCache initialization with pre-existing files
456
+ cl = pxt.Client(reload=True)
457
+ # is there a better way to do this?
458
+ FileCache._instance = None
459
+ t = cl.get_table('test')
460
+ _ = t.show(0)
461
+ cache_stats = FileCache.get().stats()
462
+ assert cache_stats.num_requests == len(urls)
463
+ assert cache_stats.num_hits == len(urls)
464
+
465
+ # dropping the table also clears the file cache
466
+ cl.drop_table('test')
467
+ cache_stats = FileCache.get().stats()
468
+ assert cache_stats.total_size == 0
469
+
470
+ def test_video_url(self, test_client: pxt.Client) -> None:
471
+ skip_test_if_not_installed('boto3')
472
+ cl = test_client
473
+ schema = {
474
+ 'payload': IntType(nullable=False),
475
+ 'video': VideoType(nullable=False),
476
+ }
477
+ tbl = cl.create_table('test', schema)
478
+ url = 's3://multimedia-commons/data/videos/mp4/ffe/ff3/ffeff3c6bf57504e7a6cecaff6aefbc9.mp4'
479
+ tbl.insert(payload=1, video=url)
480
+ row = tbl.select(tbl.video.fileurl, tbl.video.localpath).collect()[0]
481
+ assert row['video_fileurl'] == url
482
+ # row['video_localpath'] contains a valid path to an mp4 file
483
+ local_path = row['video_localpath']
484
+ assert os.path.exists(local_path) and os.path.isfile(local_path)
485
+ cap = cv2.VideoCapture(local_path)
486
+ # TODO: this isn't sufficient to determine that this is actually a video, rather than an image
487
+ assert cap.isOpened()
488
+ cap.release()
489
+
490
+ def test_create_video_table(self, test_client: pxt.Client) -> None:
491
+ skip_test_if_not_installed('boto3')
492
+ cl = test_client
493
+ tbl = cl.create_table(
494
+ 'test_tbl',
495
+ {'payload': IntType(nullable=False), 'video': VideoType(nullable=True)})
496
+ args = {'video': tbl.video, 'fps': 0}
497
+ view = cl.create_view('test_view', tbl, iterator_class=FrameIterator, iterator_args=args)
498
+ view.add_column(c1=view.frame.rotate(30), stored=True)
499
+ view.add_column(c2=view.c1.rotate(40), stored=False)
500
+ view.add_column(c3=view.c2.rotate(50), stored=True)
501
+ # a non-materialized column that refers to another non-materialized column
502
+ view.add_column(c4=view.c2.rotate(60), stored=False)
503
+
504
+ @pxt.uda(
505
+ name='window_fn', update_types=[IntType()], value_type=IntType(), requires_order_by = True,
506
+ allows_window = True)
507
+ class WindowFnAggregator:
508
+ def __init__(self):
509
+ pass
510
+ def update(self, i: int) -> None:
511
+ pass
512
+ def value(self) -> int:
513
+ return 1
514
+ # cols computed with window functions are stored by default
515
+ view.add_column(c5=window_fn(view.frame_idx, 1, group_by=view.video))
516
+
96
517
  # reload to make sure that metadata gets restored correctly
97
- cl = pt.Client()
98
- db = cl.get_db('test')
99
- tbl = db.get_table('test')
100
- assert tbl.parameters == params
101
- tbl.insert_rows([[get_video_files()[0]]], ['video'])
102
- html_str = tbl.show(n=100)._repr_html_()
103
- # TODO: check html_str
104
- _ = tbl[make_video(tbl.frame_idx, tbl.frame)].group_by(tbl.video).show()
105
-
106
- with pytest.raises(exc.Error):
518
+ cl = pxt.Client(reload=True)
519
+ tbl = cl.get_table('test_tbl')
520
+ view = cl.get_table('test_view')
521
+ # we're inserting only a single row and the video column is not in position 0
522
+ url = 's3://multimedia-commons/data/videos/mp4/ffe/ff3/ffeff3c6bf57504e7a6cecaff6aefbc9.mp4'
523
+ status = tbl.insert(payload=1, video=url)
524
+ assert status.num_excs == 0
525
+ # * 2: we have 2 stored img cols
526
+ assert MediaStore.count(view.get_id()) == view.count() * 2
527
+ # also insert a local file
528
+ tbl.insert(payload=1, video=get_video_files()[0])
529
+ assert MediaStore.count(view.get_id()) == view.count() * 2
530
+
531
+ # TODO: test inserting Nulls
532
+ #status = tbl.insert(payload=1, video=None)
533
+ #assert status.num_excs == 0
534
+
535
+ # revert() clears stored images
536
+ tbl.revert()
537
+ tbl.revert()
538
+ assert MediaStore.count(view.get_id()) == 0
539
+
540
+ with pytest.raises(excs.Error):
107
541
  # can't drop frame col
108
- tbl.drop_column('frame')
109
- with pytest.raises(exc.Error):
542
+ view.drop_column('frame')
543
+ with pytest.raises(excs.Error):
110
544
  # can't drop frame_idx col
111
- tbl.drop_column('frame_idx')
112
- with pytest.raises(exc.BadFormatError):
113
- # missing parameters
114
- _ = db.create_table(
115
- 'exc', cols, extract_frames_from='video',
116
- extracted_frame_idx_col='frame_idx', extracted_fps=0)
117
- with pytest.raises(exc.BadFormatError):
118
- # wrong column type
119
- _ = db.create_table(
120
- 'exc', cols, extract_frames_from='frame', extracted_frame_col='frame',
121
- extracted_frame_idx_col='frame_idx', extracted_fps=0)
122
- with pytest.raises(exc.BadFormatError):
123
- # wrong column type
124
- _ = db.create_table(
125
- 'exc', cols, extract_frames_from='video', extracted_frame_col='frame_idx',
126
- extracted_frame_idx_col='frame_idx', extracted_fps=0)
127
- with pytest.raises(exc.BadFormatError):
128
- # wrong column type
129
- _ = db.create_table(
130
- 'exc', cols, extract_frames_from='video', extracted_frame_col='frame',
131
- extracted_frame_idx_col='frame', extracted_fps=0)
132
- with pytest.raises(exc.BadFormatError):
133
- # unknown column
134
- _ = db.create_table(
135
- 'exc', cols, extract_frames_from='breaks', extracted_frame_col='frame',
136
- extracted_frame_idx_col='frame_idx', extracted_fps=0)
137
- with pytest.raises(exc.BadFormatError):
138
- # unknown column
139
- _ = db.create_table(
140
- 'exc', cols, extract_frames_from='video', extracted_frame_col='breaks',
141
- extracted_frame_idx_col='frame_idx', extracted_fps=0)
142
- with pytest.raises(exc.BadFormatError):
143
- # unknown column
144
- _ = db.create_table(
145
- 'exc', cols, extract_frames_from='video', extracted_frame_col='frame',
146
- extracted_frame_idx_col='breaks', extracted_fps=0)
147
-
148
- @pytest.mark.dependency(name='test_insert')
149
- def test_insert(self, test_db: catalog.Db) -> None:
150
- db = test_db
151
- t1 = make_tbl(db, 'test1', ['c1', 'c2'])
152
- data1 = create_table_data(t1)
153
- t1.insert_pandas(data1)
154
- assert t1.count() == len(data1)
545
+ view.drop_column('frame_idx')
546
+
547
+ # drop() clears stored images and the cache
548
+ tbl.insert(payload=1, video=get_video_files()[0])
549
+ with pytest.raises(excs.Error) as exc_info:
550
+ cl.drop_table('test_tbl')
551
+ assert 'has dependents: test_view' in str(exc_info.value)
552
+ cl.drop_table('test_view')
553
+ cl.drop_table('test_tbl')
554
+ assert MediaStore.count(view.get_id()) == 0
555
+
556
+ def test_insert(self, test_client: pxt.Client) -> None:
557
+ cl = test_client
558
+ schema = {
559
+ 'c1': StringType(nullable=False),
560
+ 'c2': IntType(nullable=False),
561
+ 'c3': FloatType(nullable=False),
562
+ 'c4': BoolType(nullable=False),
563
+ 'c5': ArrayType((2, 3), dtype=IntType(), nullable=False),
564
+ 'c6': JsonType(nullable=False),
565
+ 'c7': ImageType(nullable=False),
566
+ 'c8': VideoType(nullable=False),
567
+ }
568
+ t = cl.create_table('test1', schema)
569
+ rows = create_table_data(t)
570
+ status = t.insert(rows)
571
+ assert status.num_rows == len(rows)
572
+ assert status.num_excs == 0
573
+
574
+ # alternate (kwargs) insert syntax
575
+ status = t.insert(
576
+ c1='string',
577
+ c2=91,
578
+ c3=1.0,
579
+ c4=True,
580
+ c5=np.ones((2, 3), dtype=np.dtype(np.int64)),
581
+ c6={'key': 'val'},
582
+ c7=get_image_files()[0],
583
+ c8=get_video_files()[0]
584
+ )
585
+ assert status.num_rows == 1
586
+ assert status.num_excs == 0
587
+
588
+ # empty input
589
+ with pytest.raises(excs.Error) as exc_info:
590
+ t.insert([])
591
+ assert 'empty' in str(exc_info.value)
592
+
593
+ # missing column
594
+ with pytest.raises(excs.Error) as exc_info:
595
+ # drop first column
596
+ col_names = list(rows[0].keys())[1:]
597
+ new_rows = [{col_name: row[col_name] for col_name in col_names} for row in rows]
598
+ t.insert(new_rows)
599
+ assert 'Missing' in str(exc_info.value)
155
600
 
156
601
  # incompatible schema
157
- t2 = make_tbl(db, 'test2', ['c2', 'c1'])
158
- t2_data = create_table_data(t2)
159
- with pytest.raises(exc.InsertError):
160
- t1.insert_pandas(t2_data)
161
-
162
- @pytest.mark.dependency(depends=['test_insert'])
163
- def test_query(self, test_db: catalog.Db) -> None:
164
- db = test_db
165
- t = make_tbl(db, 'test', ['c1', 'c2', 'c3', 'c4', 'c5'])
166
- t_data = create_table_data(t)
167
- t.insert_pandas(t_data)
602
+ for (col_name, col_type), value_col_name in zip(schema.items(), ['c2', 'c3', 'c5', 'c5', 'c6', 'c7', 'c2', 'c2']):
603
+ cl.drop_table('test1', ignore_errors=True)
604
+ t = cl.create_table('test1', {col_name: col_type})
605
+ with pytest.raises(excs.Error) as exc_info:
606
+ t.insert({col_name: r[value_col_name]} for r in rows)
607
+ assert 'expected' in str(exc_info.value).lower()
608
+
609
+ # rows not list of dicts
610
+ cl.drop_table('test1', ignore_errors=True)
611
+ t = cl.create_table('test1', {'c1': StringType()})
612
+ with pytest.raises(excs.Error) as exc_info:
613
+ t.insert(['1'])
614
+ assert 'list of dictionaries' in str(exc_info.value)
615
+
616
+ # bad null value
617
+ cl.drop_table('test1', ignore_errors=True)
618
+ t = cl.create_table('test1', {'c1': StringType(nullable=False)})
619
+ with pytest.raises(excs.Error) as exc_info:
620
+ t.insert(c1=None)
621
+ assert 'expected non-None' in str(exc_info.value)
622
+
623
+ # bad array literal
624
+ cl.drop_table('test1', ignore_errors=True)
625
+ t = cl.create_table('test1', {'c5': ArrayType((2, 3), dtype=IntType(), nullable=False)})
626
+ with pytest.raises(excs.Error) as exc_info:
627
+ t.insert(c5=np.ndarray((3, 2)))
628
+ assert 'expected ndarray((2, 3)' in str(exc_info.value)
629
+
630
+ def test_insert_string_with_null(self, test_client: pxt.Client) -> None:
631
+ cl = test_client
632
+ t = cl.create_table('test', {'c1': StringType()})
633
+
634
+ t.insert([{'c1': 'this is a python\x00string'}])
635
+ assert t.count() == 1
636
+ for tup in t.df().collect():
637
+ assert tup['c1'] == 'this is a python string'
638
+
639
+ def test_query(self, test_client: pxt.Client) -> None:
640
+ skip_test_if_not_installed('boto3')
641
+ cl = test_client
642
+ col_names = ['c1', 'c2', 'c3', 'c4', 'c5']
643
+ t = make_tbl(cl, 'test', col_names)
644
+ rows = create_table_data(t)
645
+ t.insert(rows)
168
646
  _ = t.show(n=0)
169
647
 
170
648
  # test querying existing table
171
- cl2 = pt.Client()
172
- db2 = cl2.get_db('test')
173
- t2 = db2.get_table('test')
649
+ cl = pxt.Client(reload=True)
650
+ t2 = cl.get_table('test')
174
651
  _ = t2.show(n=0)
175
652
 
176
- def test_computed_cols(self, test_db: catalog.Db) -> None:
177
- db = test_db
178
- c1 = catalog.Column('c1', IntType(), nullable=False)
179
- c2 = catalog.Column('c2', FloatType(), nullable=False)
180
- c3 = catalog.Column('c3', JsonType(), nullable=False)
181
- schema = [c1, c2, c3]
182
- t = db.create_table('test', schema)
183
- t.add_column(catalog.Column('c4', computed_with=t.c1 + 1))
184
- t.add_column(catalog.Column('c5', computed_with=t.c4 + 1))
185
- t.add_column(catalog.Column('c6', computed_with=t.c1 / t.c2))
186
- t.add_column(catalog.Column('c7', computed_with=t.c6 * t.c2))
187
- t.add_column(catalog.Column('c8', computed_with=t.c3.detections['*'].bounding_box))
188
- t.add_column(catalog.Column('c9', FloatType(), computed_with=lambda c2: math.sqrt(c2)))
653
+ def test_update(self, test_tbl: pxt.Table, indexed_img_tbl: pxt.Table) -> None:
654
+ t = test_tbl
655
+ # update every type with a literal
656
+ test_cases = [
657
+ ('c1', 'new string'),
658
+ # TODO: ('c1n', None),
659
+ ('c3', -1.0),
660
+ ('c4', True),
661
+ ('c5', datetime.datetime.now()),
662
+ ('c6', [{'x': 1, 'y': 2}]),
663
+ ]
664
+ count = t.count()
665
+ for col_name, literal in test_cases:
666
+ status = t.update({col_name: literal}, where=t.c3 < 10.0, cascade=False)
667
+ assert status.num_rows == 10
668
+ assert status.updated_cols == [f'{t.get_name()}.{col_name}']
669
+ assert t.count() == count
670
+ t.revert()
671
+
672
+ # exchange two columns
673
+ t.add_column(float_col=FloatType(nullable=True))
674
+ t.update({'float_col': 1.0})
675
+ float_col_vals = t.select(t.float_col).collect().to_pandas()['float_col']
676
+ c3_vals = t.select(t.c3).collect().to_pandas()['c3']
677
+ assert np.all(float_col_vals == pd.Series([1.0] * t.count()))
678
+ t.update({'c3': t.float_col, 'float_col': t.c3})
679
+ assert np.all(t.select(t.c3).collect().to_pandas()['c3'] == float_col_vals)
680
+ assert np.all(t.select(t.float_col).collect().to_pandas()['float_col'] == c3_vals)
681
+ t.revert()
682
+
683
+ # update column that is used in computed cols
684
+ t.add_column(computed1=t.c3 + 1)
685
+ t.add_column(computed2=t.computed1 + 1)
686
+ t.add_column(computed3=t.c3 + 3)
687
+
688
+ # cascade=False
689
+ computed1 = t.order_by(t.computed1).show(0).to_pandas()['computed1']
690
+ computed2 = t.order_by(t.computed2).show(0).to_pandas()['computed2']
691
+ computed3 = t.order_by(t.computed3).show(0).to_pandas()['computed3']
692
+ assert t.where(t.c3 < 10.0).count() == 10
693
+ assert t.where(t.c3 == 10.0).count() == 1
694
+ # update to a value that also satisfies the where clause
695
+ status = t.update({'c3': 0.0}, where=t.c3 < 10.0, cascade=False)
696
+ assert status.num_rows == 10
697
+ assert status.updated_cols == ['test_tbl.c3']
698
+ assert t.where(t.c3 < 10.0).count() == 10
699
+ assert t.where(t.c3 == 0.0).count() == 10
700
+ # computed cols are not updated
701
+ assert np.all(t.order_by(t.computed1).show(0).to_pandas()['computed1'] == computed1)
702
+ assert np.all(t.order_by(t.computed2).show(0).to_pandas()['computed2'] == computed2)
703
+ assert np.all(t.order_by(t.computed3).show(0).to_pandas()['computed3'] == computed3)
704
+
705
+ # revert, then verify that we're back to where we started
706
+ cl = pxt.Client(reload=True)
707
+ t = cl.get_table(t.get_name())
708
+ t.revert()
709
+ assert t.where(t.c3 < 10.0).count() == 10
710
+ assert t.where(t.c3 == 10.0).count() == 1
711
+
712
+ # cascade=True
713
+ status = t.update({'c3': 0.0}, where=t.c3 < 10.0, cascade=True)
714
+ assert status.num_rows == 10
715
+ assert set(status.updated_cols) == \
716
+ set(['test_tbl.c3', 'test_tbl.computed1', 'test_tbl.computed2', 'test_tbl.computed3'])
717
+ assert t.where(t.c3 < 10.0).count() == 10
718
+ assert t.where(t.c3 == 0.0).count() == 10
719
+ assert np.all(t.order_by(t.computed1).show(0).to_pandas()['computed1'][:10] == pd.Series([1.0] * 10))
720
+ assert np.all(t.order_by(t.computed2).show(0).to_pandas()['computed2'][:10] == pd.Series([2.0] * 10))
721
+ assert np.all(t.order_by(t.computed3).show(0).to_pandas()['computed3'][:10] == pd.Series([3.0] * 10))
722
+
723
+ # bad update spec
724
+ with pytest.raises(excs.Error) as excinfo:
725
+ t.update({1: 1})
726
+ assert 'dict key' in str(excinfo.value)
727
+
728
+ # unknown column
729
+ with pytest.raises(excs.Error) as excinfo:
730
+ t.update({'unknown': 1})
731
+ assert 'unknown unknown' in str(excinfo.value)
732
+
733
+ # incompatible type
734
+ with pytest.raises(excs.Error) as excinfo:
735
+ t.update({'c1': 1})
736
+ assert 'not compatible' in str(excinfo.value)
737
+
738
+ # can't update primary key
739
+ with pytest.raises(excs.Error) as excinfo:
740
+ t.update({'c2': 1})
741
+ assert 'primary key' in str(excinfo.value)
742
+
743
+ # can't update computed column
744
+ with pytest.raises(excs.Error) as excinfo:
745
+ t.update({'computed1': 1})
746
+ assert 'is computed' in str(excinfo.value)
747
+
748
+ # non-expr
749
+ with pytest.raises(excs.Error) as excinfo:
750
+ t.update({'c3': lambda c3: math.sqrt(c3)})
751
+ assert 'not a recognized' in str(excinfo.value)
752
+
753
+ # non-Predicate filter
754
+ with pytest.raises(excs.Error) as excinfo:
755
+ t.update({'c3': 1.0}, where=lambda c2: c2 == 10)
756
+ assert 'Predicate' in str(excinfo.value)
757
+
758
+ img_t = indexed_img_tbl
759
+
760
+ # can't update image col
761
+ with pytest.raises(excs.Error) as excinfo:
762
+ img_t.update({'img': 17}, where=img_t.img.nearest('car'))
763
+ assert 'has type image' in str(excinfo.value)
764
+
765
+ # similarity search is not supported
766
+ with pytest.raises(excs.Error) as excinfo:
767
+ img_t.update({'split': 'train'}, where=img_t.img.nearest('car'))
768
+ assert 'nearest()' in str(excinfo.value)
769
+
770
+ # filter not expressible in SQL
771
+ with pytest.raises(excs.Error) as excinfo:
772
+ img_t.update({'split': 'train'}, where=img_t.img.width > 100)
773
+ assert 'not expressible' in str(excinfo.value)
774
+
775
+ def test_cascading_update(self, test_tbl: pxt.InsertableTable) -> None:
776
+ t = test_tbl
777
+ t.add_column(d1=t.c3 - 1)
778
+ # add column that can be updated
779
+ t.add_column(c10=FloatType(nullable=True))
780
+ t.update({'c10': t.c3})
781
+ # computed column that depends on two columns: exercise duplicate elimination during query construction
782
+ t.add_column(d2=t.c3 - t.c10)
783
+ r1 = t.where(t.c2 < 5).select(t.c3 + 1.0, t.c10 - 1.0, t.c3, 2.0).order_by(t.c2).show(0)
784
+ t.update({'c4': True, 'c3': t.c3 + 1.0, 'c10': t.c10 - 1.0}, where=t.c2 < 5, cascade=True)
785
+ r2 = t.where(t.c2 < 5).select(t.c3, t.c10, t.d1, t.d2).order_by(t.c2).show(0)
786
+ assert_resultset_eq(r1, r2)
787
+
788
+ def test_delete(self, test_tbl: pxt.Table, indexed_img_tbl: pxt.Table) -> None:
789
+ t = test_tbl
790
+
791
+ cnt = t.where(t.c3 < 10.0).count()
792
+ assert cnt == 10
793
+ cnt = t.where(t.c3 == 10.0).count()
794
+ assert cnt == 1
795
+ status = t.delete(where=t.c3 < 10.0)
796
+ assert status.num_rows == 10
797
+ cnt = t.where(t.c3 < 10.0).count()
798
+ assert cnt == 0
799
+ cnt = t.where(t.c3 == 10.0).count()
800
+ assert cnt == 1
801
+
802
+ # revert, then verify that we're back where we started
803
+ cl = pxt.Client(reload=True)
804
+ t = cl.get_table(t.get_name())
805
+ t.revert()
806
+ cnt = t.where(t.c3 < 10.0).count()
807
+ assert cnt == 10
808
+ cnt = t.where(t.c3 == 10.0).count()
809
+ assert cnt == 1
810
+
811
+ # non-Predicate filter
812
+ with pytest.raises(excs.Error) as excinfo:
813
+ t.delete(where=lambda c2: c2 == 10)
814
+ assert 'Predicate' in str(excinfo.value)
815
+
816
+ img_t = indexed_img_tbl
817
+ # similarity search is not supported
818
+ with pytest.raises(excs.Error) as excinfo:
819
+ img_t.delete(where=img_t.img.nearest('car'))
820
+ assert 'nearest()' in str(excinfo.value)
821
+
822
+ # filter not expressible in SQL
823
+ with pytest.raises(excs.Error) as excinfo:
824
+ img_t.delete(where=img_t.img.width > 100)
825
+ assert 'not expressible' in str(excinfo.value)
826
+
827
+ def test_computed_cols(self, test_client: pxt.client) -> None:
828
+ cl = test_client
829
+ schema = {
830
+ 'c1': IntType(nullable=False),
831
+ 'c2': FloatType(nullable=False),
832
+ 'c3': JsonType(nullable=False),
833
+ }
834
+ t : pxt.InsertableTable = cl.create_table('test', schema)
835
+ status = t.add_column(c4=t.c1 + 1)
836
+ assert status.num_excs == 0
837
+ status = t.add_column(c5=t.c4 + 1)
838
+ assert status.num_excs == 0
839
+ status = t.add_column(c6=t.c1 / t.c2)
840
+ assert status.num_excs == 0
841
+ status = t.add_column(c7=t.c6 * t.c2)
842
+ assert status.num_excs == 0
843
+ status = t.add_column(c8=t.c3.detections['*'].bounding_box)
844
+ assert status.num_excs == 0
845
+ status = t.add_column(c9=lambda c2: math.sqrt(c2), type=FloatType())
846
+ assert status.num_excs == 0
847
+
848
+ # unstored cols that compute window functions aren't currently supported
849
+ with pytest.raises((excs.Error)):
850
+ t.add_column(c10=ptf.sum(t.c1, group_by=t.c1), stored=False)
189
851
 
190
852
  # Column.dependent_cols are computed correctly
191
853
  assert len(t.c1.col.dependent_cols) == 2
@@ -197,139 +859,409 @@ class TestTable:
197
859
  assert len(t.c7.col.dependent_cols) == 0
198
860
  assert len(t.c8.col.dependent_cols) == 0
199
861
 
200
- data_df = create_table_data(t, ['c1', 'c2', 'c3'], num_rows=10)
201
- t.insert_pandas(data_df)
862
+ rows = create_table_data(t, ['c1', 'c2', 'c3'], num_rows=10)
863
+ t.insert(rows)
202
864
  _ = t.show()
203
865
 
204
866
  # not allowed to pass values for computed cols
205
- with pytest.raises(exc.InsertError):
206
- data_df2 = create_table_data(t, num_rows=10)
207
- t.insert_pandas(data_df2)
208
-
209
- # computed col references non-existent col
210
- with pytest.raises(exc.Error):
211
- c1 = catalog.Column('c1', IntType(), nullable=False)
212
- c2 = catalog.Column('c2', FloatType(), nullable=False)
213
- c3 = catalog.Column('c3', FloatType(), nullable=False, computed_with=lambda c2: math.sqrt(c2))
214
- _ = db.create_table('test2', [c1, c3, c2])
867
+ with pytest.raises(excs.Error):
868
+ rows2 = create_table_data(t, ['c1', 'c2', 'c3', 'c4'], num_rows=10)
869
+ t.insert(rows2)
215
870
 
216
871
  # test loading from store
217
- cl2 = pt.Client()
218
- db2 = cl2.get_db('test')
219
- t2 = db2.get_table('test')
220
- assert len(t.columns) == len(t2.columns)
221
- for i in range(len(t.columns)):
222
- if t.columns[i].value_expr is not None:
223
- assert t.columns[i].value_expr.equals(t2.columns[i].value_expr)
872
+ cl = pxt.Client(reload=True)
873
+ t = cl.get_table('test')
874
+ assert len(t.columns()) == len(t.columns())
875
+ for i in range(len(t.columns())):
876
+ if t.columns()[i].value_expr is not None:
877
+ assert t.columns()[i].value_expr.equals(t.columns()[i].value_expr)
224
878
 
225
879
  # make sure we can still insert data and that computed cols are still set correctly
226
- t2.insert_pandas(data_df)
227
- res = t2.show(0)
228
- tbl_df = t2.show(0).to_pandas()
880
+ status = t.insert(rows)
881
+ assert status.num_excs == 0
882
+ res = t.show(0)
883
+ tbl_df = t.show(0).to_pandas()
229
884
 
230
885
  # can't drop c4: c5 depends on it
231
- with pytest.raises(exc.Error):
886
+ with pytest.raises(excs.Error):
232
887
  t.drop_column('c4')
233
888
  t.drop_column('c5')
234
889
  # now it works
235
890
  t.drop_column('c4')
236
891
 
237
- def test_computed_img_cols(self, test_db: catalog.Db) -> None:
238
- db = test_db
239
- c1 = catalog.Column('img', ImageType(), nullable=False, indexed=True)
240
- schema = [c1]
241
- t = db.create_table('test', schema)
242
- t.add_column(catalog.Column('c2', computed_with=t.img.width))
243
- t.add_column(catalog.Column('c3', computed_with=t.img.rotate(90)))
892
+ def test_expr_udf_computed_cols(self, test_client: pxt.Client) -> None:
893
+ cl = test_client
894
+ t = cl.create_table('test', {'c1': IntType(nullable=False)})
895
+ rows = [{'c1': i} for i in range(100)]
896
+ status = t.insert(rows)
897
+ assert status.num_rows == len(rows)
898
+ status = t.add_column(c2=t.c1 + 1)
899
+ assert status.num_excs == 0
900
+ # call with positional arg
901
+ status = t.add_column(c3=self.add1(t.c1))
902
+ assert status.num_excs == 0
903
+ # call with keyword arg
904
+ status = t.add_column(c4=self.add1(a=t.c1))
905
+ assert status.num_excs == 0
906
+
907
+ # TODO: how to verify the output?
908
+ describe_output = t.__repr__()
909
+ # 'add1' didn't get swallowed/the expr udf is still visible in the column definition
910
+ assert 'add1' in describe_output
911
+
912
+ def check(t: pxt.Table) -> None:
913
+ assert_resultset_eq(
914
+ t.select(t.c1 + 1).order_by(t.c1).collect(),
915
+ t.select(t.c2).order_by(t.c1).collect())
916
+ assert_resultset_eq(
917
+ t.select(t.c1 + 1).order_by(t.c1).collect(),
918
+ t.select(t.c3).order_by(t.c1).collect())
919
+
920
+ check(t)
921
+ # test loading from store
922
+ cl = pxt.Client(reload=True)
923
+ t = cl.get_table('test')
924
+ check(t)
925
+
926
+ # make sure we can still insert data and that computed cols are still set correctly
927
+ status = t.insert(rows)
928
+ assert status.num_excs == 0
929
+ check(t)
930
+
931
+ def test_computed_col_exceptions(self, test_client: pxt.Client, test_tbl: catalog.Table) -> None:
932
+ cl = test_client
244
933
 
245
- data_df = read_data_file('imagenette2-160', 'manifest.csv', ['img'])
246
- t.insert_pandas(data_df.loc[0:20, ['img']])
934
+ # exception during insert()
935
+ schema = {'c2': IntType(nullable=False)}
936
+ rows = list(test_tbl.select(test_tbl.c2).collect())
937
+ t = cl.create_table('test_insert', schema)
938
+ status = t.add_column(add1=self.f2(self.f1(t.c2)))
939
+ assert status.num_excs == 0
940
+ status = t.insert(rows, fail_on_exception=False)
941
+ assert status.num_excs == 10
942
+ assert 'test_insert.add1' in status.cols_with_excs
943
+ assert t.where(t.add1.errortype != None).count() == 10
944
+
945
+ # exception during add_column()
946
+ t = cl.create_table('test_add_column', schema)
947
+ status = t.insert(rows)
948
+ assert status.num_rows == 100
949
+ assert status.num_excs == 0
950
+ status = t.add_column(add1=self.f2(self.f1(t.c2)))
951
+ assert status.num_excs == 10
952
+ assert 'test_add_column.add1' in status.cols_with_excs
953
+ assert t.where(t.add1.errortype != None).count() == 10
954
+
955
+ def _test_computed_img_cols(self, t: catalog.Table, stores_img_col: bool) -> None:
956
+ rows = read_data_file('imagenette2-160', 'manifest.csv', ['img'])
957
+ rows = [{'img': r['img']} for r in rows[:20]]
958
+ status = t.insert(rows)
959
+ assert status.num_rows == 20
960
+ _ = t.count()
247
961
  _ = t.show()
248
- assert utils.computed_img_count(tbl_id=t.id) == t.count()
962
+ assert MediaStore.count(t.get_id()) == t.count() * stores_img_col
249
963
 
250
964
  # test loading from store
251
- cl2 = pt.Client()
252
- db2 = cl2.get_db('test')
253
- t2 = db2.get_table('test')
254
- assert len(t.columns) == len(t2.columns)
255
- for i in range(len(t.columns)):
256
- if t.columns[i].value_expr is not None:
257
- assert t.columns[i].value_expr.equals(t2.columns[i].value_expr)
965
+ cl = pxt.Client(reload=True)
966
+ t2 = cl.get_table(t.get_name())
967
+ assert len(t.columns()) == len(t2.columns())
968
+ for i in range(len(t.columns())):
969
+ if t.columns()[i].value_expr is not None:
970
+ assert t.columns()[i].value_expr.equals(t2.columns()[i].value_expr)
258
971
 
259
972
  # make sure we can still insert data and that computed cols are still set correctly
260
- t2.insert_pandas(data_df.loc[0:20, ['img']])
261
- assert utils.computed_img_count(tbl_id=t.id) == t2.count()
973
+ t2.insert(rows)
974
+ assert MediaStore.count(t2.get_id()) == t2.count() * stores_img_col
262
975
  res = t2.show(0)
263
976
  tbl_df = t2.show(0).to_pandas()
264
- print(tbl_df)
265
977
 
266
978
  # revert also removes computed images
267
979
  t2.revert()
268
- assert utils.computed_img_count() == t2.count()
980
+ assert MediaStore.count(t2.get_id()) == t2.count() * stores_img_col
981
+
982
+ def test_computed_img_cols(self, test_client: pxt.Client) -> None:
983
+ cl = test_client
984
+ schema = {'img': ImageType(nullable=False)}
985
+ t = cl.create_table('test', schema)
986
+ t.add_column(c2=t.img.width)
987
+ # c3 is not stored by default
988
+ t.add_column(c3=t.img.rotate(90))
989
+ self._test_computed_img_cols(t, stores_img_col=False)
269
990
 
270
- def test_computed_window_fn(self, test_db: catalog.Db, test_tbl: catalog.Table) -> None:
271
- db = test_db
991
+ t = cl.create_table('test2', schema)
992
+ # c3 is now stored
993
+ t.add_column(c3=t.img.rotate(90), stored=True)
994
+ self._test_computed_img_cols(t, stores_img_col=True)
995
+ _ = t[t.c3.errortype].show(0)
996
+
997
+ # computed img col with exceptions
998
+ t = cl.create_table('test3', schema)
999
+ @pxt.udf(return_type=ImageType(), param_types=[ImageType()])
1000
+ def f(img: PIL.Image.Image) -> PIL.Image.Image:
1001
+ raise RuntimeError
1002
+ t.add_column(c3=f(t.img), stored=True)
1003
+ rows = read_data_file('imagenette2-160', 'manifest.csv', ['img'])
1004
+ rows = [{'img': r['img']} for r in rows[:20]]
1005
+ t.insert(rows, fail_on_exception=False)
1006
+ _ = t[t.c3.errortype].show(0)
1007
+
1008
+ def test_computed_window_fn(self, test_client: pxt.Client, test_tbl: catalog.Table) -> None:
1009
+ cl = test_client
272
1010
  t = test_tbl
273
1011
  # backfill
274
- t.add_column(catalog.Column('c9', computed_with=sum_uda(t.c2).window(partition_by=t.c4, order_by=t.c3)))
275
-
276
- c2 = catalog.Column('c2', IntType(), nullable=False)
277
- c3 = catalog.Column('c3', FloatType(), nullable=False)
278
- c4 = catalog.Column('c4', BoolType(), nullable=False)
279
- new_t = db.create_table('insert_test', [c2, c3, c4])
280
- new_t.add_column(catalog.Column('c5', IntType(), computed_with=lambda c2: c2 * c2))
281
- new_t.add_column(catalog.Column(
282
- 'c6', computed_with=sum_uda(new_t.c5).window(partition_by=new_t.c4, order_by=new_t.c3)))
283
- data_df = t[t.c2, t.c4, t.c3].show(0).to_pandas()
284
- new_t.insert_pandas(data_df)
1012
+ t.add_column(c9=ptf.sum(t.c2, group_by=t.c4, order_by=t.c3))
1013
+
1014
+ schema = {
1015
+ 'c2': IntType(nullable=False),
1016
+ 'c3': FloatType(nullable=False),
1017
+ 'c4': BoolType(nullable=False),
1018
+ }
1019
+ new_t = cl.create_table('insert_test', schema)
1020
+ new_t.add_column(c5=lambda c2: c2 * c2, type=IntType())
1021
+ new_t.add_column(c6=ptf.sum(new_t.c5, group_by=new_t.c4, order_by=new_t.c3))
1022
+ rows = list(t.select(t.c2, t.c4, t.c3).collect())
1023
+ new_t.insert(rows)
285
1024
  _ = new_t.show(0)
286
- print(_)
287
-
288
- @pytest.mark.dependency(depends=['test_insert'])
289
- def test_revert(self, test_db: catalog.Db) -> None:
290
- db = test_db
291
- t1 = make_tbl(db, 'test1', ['c1', 'c2'])
292
- data1 = create_table_data(t1)
293
- t1.insert_pandas(data1)
294
- assert t1.count() == len(data1)
295
- data2 = create_table_data(t1)
296
- t1.insert_pandas(data2)
297
- assert t1.count() == len(data1) + len(data2)
1025
+
1026
+ def test_revert(self, test_client: pxt.Client) -> None:
1027
+ cl = test_client
1028
+ t1 = make_tbl(cl, 'test1', ['c1', 'c2'])
1029
+ assert t1.version() == 0
1030
+ rows1 = create_table_data(t1)
1031
+ t1.insert(rows1)
1032
+ assert t1.count() == len(rows1)
1033
+ assert t1.version() == 1
1034
+ rows2 = create_table_data(t1)
1035
+ t1.insert(rows2)
1036
+ assert t1.count() == len(rows1) + len(rows2)
1037
+ assert t1.version() == 2
298
1038
  t1.revert()
299
- assert t1.count() == len(data1)
300
- t1.insert_pandas(data2)
301
- assert t1.count() == len(data1) + len(data2)
302
-
303
- @pytest.mark.dependency(depends=['test_insert'])
304
- def test_snapshot(self, test_db: catalog.Db) -> None:
305
- db = test_db
306
- db.create_dir('main')
307
- tbl = make_tbl(db, 'main.test1', ['c1', 'c2'])
308
- data1 = create_table_data(tbl)
309
- tbl.insert_pandas(data1)
310
- assert tbl.count() == len(data1)
311
-
312
- db.create_snapshot('snap', ['main.test1'])
313
- snap = db.get_table('snap.test1')
314
- assert snap.count() == len(data1)
315
-
316
- # adding data to a base table doesn't change the snapshot
317
- data2 = create_table_data(tbl)
318
- tbl.insert_pandas(data2)
319
- assert tbl.count() == len(data1) + len(data2)
320
- assert snap.count() == len(data1)
1039
+ assert t1.count() == len(rows1)
1040
+ assert t1.version() == 1
1041
+ t1.insert(rows2)
1042
+ assert t1.count() == len(rows1) + len(rows2)
1043
+ assert t1.version() == 2
321
1044
 
322
- tbl.revert()
323
- # can't revert a version referenced by a snapshot
324
- with pytest.raises(exc.OperationalError):
325
- tbl.revert()
326
-
327
- def test_add_column(self, test_db: catalog.Db) -> None:
328
- db = test_db
329
- t = make_tbl(db, 'test', ['c1', 'c2'])
330
- data1 = create_table_data(t)
331
- t.insert_pandas(data1)
332
- assert t.count() == len(data1)
333
- t.add_column(catalog.Column('c3', computed_with=t.c2 + 10, nullable=False))
1045
+ # can't revert past version 0
1046
+ t1.revert()
1047
+ t1.revert()
1048
+ with pytest.raises(excs.Error) as excinfo:
1049
+ t1.revert()
1050
+ assert 'version 0' in str(excinfo.value)
1051
+
1052
+ def test_add_column(self, test_tbl: catalog.Table) -> None:
1053
+ t = test_tbl
1054
+ num_orig_cols = len(t.columns())
1055
+ t.add_column(add1=pxt.IntType(nullable=True))
1056
+ assert len(t.columns()) == num_orig_cols + 1
1057
+
1058
+ with pytest.raises(excs.Error) as exc_info:
1059
+ _ = t.add_column(add2=pxt.IntType(nullable=False))
1060
+ assert 'cannot add non-nullable' in str(exc_info.value).lower()
1061
+
1062
+ with pytest.raises(excs.Error) as exc_info:
1063
+ _ = t.add_column(add2=pxt.IntType(nullable=False), add3=pxt.StringType())
1064
+ assert 'requires exactly one keyword argument' in str(exc_info.value).lower()
1065
+
1066
+ with pytest.raises(excs.Error) as exc_info:
1067
+ _ = t.add_column(pos=pxt.StringType(nullable=True))
1068
+ assert 'is reserved' in str(exc_info.value).lower()
1069
+
1070
+ with pytest.raises(excs.Error) as exc_info:
1071
+ _ = t.add_column(add2=pxt.IntType(nullable=False), type=pxt.StringType())
1072
+ assert '"type" is redundant' in str(exc_info.value).lower()
1073
+
1074
+ with pytest.raises(excs.Error) as exc_info:
1075
+ _ = t.add_column(add2=[[1.0, 2.0], [3.0, 4.0]], type=pxt.StringType())
1076
+ assert '"type" is redundant' in str(exc_info.value).lower()
1077
+
1078
+ with pytest.raises(excs.Error) as exc_info:
1079
+ _ = t.add_column(add2=pxt.IntType(nullable=False), stored=False)
1080
+ assert 'stored=false only applies' in str(exc_info.value).lower()
1081
+
1082
+ # duplicate name
1083
+ with pytest.raises(excs.Error) as exc_info:
1084
+ _ = t.add_column(c1=pxt.IntType())
1085
+ assert 'duplicate column name' in str(exc_info.value).lower()
1086
+
1087
+ # 'stored' kwarg only applies to computed image columns
1088
+ with pytest.raises(excs.Error):
1089
+ _ = t.add_column(c5=IntType(), stored=False)
1090
+ with pytest.raises(excs.Error):
1091
+ _ = t.add_column(c5=ImageType(), stored=False)
1092
+ with pytest.raises(excs.Error):
1093
+ _ = t.add_column(c5=(t.c2 + t.c3), stored=False)
1094
+
1095
+ # make sure this is still true after reloading the metadata
1096
+ cl = pxt.Client(reload=True)
1097
+ t = cl.get_table(t.get_name())
1098
+ assert len(t.columns()) == num_orig_cols + 1
1099
+
1100
+ # revert() works
1101
+ t.revert()
1102
+ assert len(t.columns()) == num_orig_cols
1103
+
1104
+ # make sure this is still true after reloading the metadata once more
1105
+ cl = pxt.Client(reload=True)
1106
+ t = cl.get_table(t.get_name())
1107
+ assert len(t.columns()) == num_orig_cols
1108
+
1109
def test_add_column_setitem(self, test_tbl: catalog.Table) -> None:
    """Add columns through ``tbl[name] = spec`` and check the rejected specs.

    Two additions are expected to succeed (a typed column and a computed
    column). Every other assignment below must raise ``excs.Error`` with a
    recognizable message. The resulting column count is then re-checked
    after a metadata reload, after reverting both additions, and after a
    second reload.
    """
    tbl = test_tbl
    baseline = len(tbl.columns())

    # a bare column type is accepted
    tbl['add1'] = pxt.IntType(nullable=True)
    assert len(tbl.columns()) == baseline + 1
    # so is an expression over an existing column
    tbl['computed1'] = tbl.c2 + 1
    assert len(tbl.columns()) == baseline + 2

    # reserved column name
    with pytest.raises(excs.Error) as err:
        tbl['pos'] = pxt.StringType()
    message = str(err.value).lower()
    assert 'is reserved' in message

    # non-string column name
    with pytest.raises(excs.Error) as err:
        tbl[2] = pxt.StringType()
    message = str(err.value).lower()
    assert 'must be a string' in message

    # name containing a space
    with pytest.raises(excs.Error) as err:
        tbl['add 2'] = pxt.StringType()
    message = str(err.value).lower()
    assert 'invalid column name' in message

    # dict spec: 'type' given together with an expression value
    with pytest.raises(excs.Error) as err:
        tbl['add2'] = {'value': tbl.c2 + 1, 'type': pxt.StringType()}
    message = str(err.value).lower()
    assert '"type" is redundant' in message

    # dict spec: 'value' holding a column type
    with pytest.raises(excs.Error) as err:
        tbl['add2'] = {'value': pxt.IntType()}
    message = str(err.value).lower()
    assert 'value needs to be either' in message

    # dict spec: stored=False on this computed column
    with pytest.raises(excs.Error) as err:
        tbl['add2'] = {'value': tbl.c2 + 1, 'stored': False}
    message = str(err.value).lower()
    assert 'stored=false only applies' in message

    # duplicate name
    with pytest.raises(excs.Error) as err:
        tbl['c1'] = pxt.IntType()
    message = str(err.value).lower()
    assert 'duplicate column name' in message

    # the column count must be unchanged after reloading the metadata
    tbl = pxt.Client(reload=True).get_table(tbl.get_name())
    assert len(tbl.columns()) == baseline + 2

    # one revert() per successful addition brings the original schema back
    tbl.revert()
    tbl.revert()
    assert len(tbl.columns()) == baseline

    # ... and that, too, must hold after another metadata reload
    tbl = pxt.Client(reload=True).get_table(tbl.get_name())
    assert len(tbl.columns()) == baseline
1160
+
1161
def test_drop_column(self, test_tbl: catalog.Table) -> None:
    """Drop a column and verify the column count across reloads and revert().

    Dropping 'c1' must reduce the column count by one, and dropping a
    column that does not exist must raise ``excs.Error``. The count is
    re-checked after a metadata reload, after revert(), and after a
    second reload.
    """
    tbl = test_tbl
    baseline = len(tbl.columns())

    tbl.drop_column('c1')
    assert len(tbl.columns()) == baseline - 1

    # a name that is not in the schema is rejected
    with pytest.raises(excs.Error):
        tbl.drop_column('unknown')

    # the drop must still be visible after reloading the metadata
    tbl = pxt.Client(reload=True).get_table(tbl.get_name())
    assert len(tbl.columns()) == baseline - 1

    # revert() restores the dropped column
    tbl.revert()
    assert len(tbl.columns()) == baseline

    # ... and the restored schema must survive another metadata reload
    tbl = pxt.Client(reload=True).get_table(tbl.get_name())
    assert len(tbl.columns()) == baseline
1183
+
1184
def test_rename_column(self, test_tbl: catalog.Table) -> None:
    """Rename a column and verify name resolution across reloads and revert().

    After renaming 'c1' to 'c1_renamed', only the new name may resolve.
    Renames with an unknown source, an invalid target name, or an already
    existing target name must raise ``excs.Error``.
    """
    tbl = test_tbl
    baseline = len(tbl.columns())
    tbl.rename_column('c1', 'c1_renamed')
    # renaming must not change the number of columns
    assert len(tbl.columns()) == baseline

    def assert_only_known_resolves(target: pxt.Table, known: str, unknown: str) -> None:
        """Check that `unknown` fails with AttributeError while `known` can be selected."""
        with pytest.raises(AttributeError) as err:
            _ = target.select(target[unknown]).collect()
        assert 'unknown' in str(err.value).lower()
        _ = target.select(target[known]).collect()

    assert_only_known_resolves(tbl, 'c1_renamed', 'c1')

    # unknown column
    with pytest.raises(excs.Error):
        tbl.rename_column('unknown', 'unknown_renamed')
    # bad name
    with pytest.raises(excs.Error):
        tbl.rename_column('c2', 'bad name')
    # existing name
    with pytest.raises(excs.Error):
        tbl.rename_column('c2', 'c3')

    # the rename must still be in effect after reloading the metadata
    tbl = pxt.Client(reload=True).get_table(tbl.get_name())
    assert_only_known_resolves(tbl, 'c1_renamed', 'c1')

    # revert() brings the old name back
    tbl.revert()
    _ = tbl.select(tbl.c1).collect()
    # NOTE: the stricter check (old name resolves AND 'c1_renamed' is gone) was
    # disabled at this point in the original test; it is only run after the
    # reload below.

    # ... and the reverted name must survive another metadata reload
    tbl = pxt.Client(reload=True).get_table(tbl.get_name())
    assert_only_known_resolves(tbl, 'c1', 'c1_renamed')
1222
+
1223
def test_add_computed_column(self, test_tbl: catalog.Table) -> None:
    """Add computed columns and check how evaluation errors are reported.

    Covers a computed column that evaluates cleanly, one whose add_column()
    call raises ``excs.Error``, one that records a single per-row exception,
    and one built from nested function calls that records ten.
    """
    tbl = test_tbl

    outcome = tbl.add_column(add1=tbl.c2 + 10)
    assert outcome.num_excs == 0
    _ = tbl.show()

    # with exception in SQL
    with pytest.raises(excs.Error):
        tbl.add_column(add2=(tbl.c2 - 10) / (tbl.c3 - 10))

    # with exception in Python for c6.f2 == 10
    outcome = tbl.add_column(add2=(tbl.c6.f2 - 10) / (tbl.c6.f2 - 10))
    assert outcome.num_excs == 1
    # `!= None` is applied to a column expression here, not a plain value;
    # presumably the operator is overloaded to build a filter, so it is kept
    # as written instead of `is not None`.
    failed_rows = tbl[tbl.add2.errortype != None]  # noqa: E711
    result = failed_rows[tbl.c6.f2, tbl.add2, tbl.add2.errortype, tbl.add2.errormsg].show()
    assert len(result) == 1

    # test case: exceptions in dependencies prevent execution of dependent exprs
    outcome = tbl.add_column(add3=self.f2(self.f1(tbl.c2)))
    assert outcome.num_excs == 10
    failed_rows = tbl[tbl.add3.errortype != None]  # noqa: E711
    result = failed_rows[tbl.c2, tbl.add3, tbl.add3.errortype, tbl.add3.errormsg].show()
    assert len(result) == 10
1244
+
1245
def test_describe(self, test_tbl: catalog.Table) -> None:
    """Smoke-test describe(), repr() and _repr_html_() on a table.

    The table gets a callable-based computed array column and then a
    comment. Nothing is asserted about the rendered output; the calls only
    have to complete without raising.
    """
    tbl = test_tbl

    # computed column backed by a plain Python callable; the parameter is
    # deliberately named after the existing column c2
    fill_from_c2 = lambda c2: np.full((3, 4), c2)
    array_type = ArrayType((3, 4), dtype=IntType())
    tbl.add_column(computed1=fill_from_c2, type=array_type)
    tbl.describe()

    # describe() once more, now with a table comment set
    tbl.comment = 'This is a comment.'
    tbl.describe()

    # TODO: how do you check the output of these?
    _ = repr(tbl)
    _ = tbl._repr_html_()
1256
+
1257
+ def test_common_col_names(self, test_client: pxt.Client) -> None:
1258
+ """Make sure that commonly used column names don't collide with Table member vars"""
1259
+ cl = test_client
1260
+ schema = {'id': IntType(nullable=False), 'name': StringType(nullable=False)}
1261
+ tbl = cl.create_table('test', schema)
1262
+ status = tbl.insert({'id': id, 'name': str(id)} for id in range(10))
1263
+ assert status.num_rows == 10
1264
+ assert status.num_excs == 0
1265
+ assert tbl.count() == 10
1266
+ # we can create references to those columns via __getattr__
1267
+ _ = tbl.select(tbl.id, tbl.name).collect()