pixeltable 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of pixeltable might be problematic.
- pixeltable/catalog/column.py +26 -49
- pixeltable/catalog/insertable_table.py +7 -4
- pixeltable/catalog/table.py +163 -57
- pixeltable/catalog/table_version.py +416 -140
- pixeltable/catalog/table_version_path.py +2 -2
- pixeltable/client.py +72 -6
- pixeltable/dataframe.py +65 -21
- pixeltable/env.py +52 -53
- pixeltable/exec/cache_prefetch_node.py +1 -1
- pixeltable/exec/in_memory_data_node.py +11 -7
- pixeltable/exprs/comparison.py +3 -3
- pixeltable/exprs/data_row.py +5 -1
- pixeltable/exprs/literal.py +16 -4
- pixeltable/exprs/row_builder.py +8 -40
- pixeltable/ext/__init__.py +5 -0
- pixeltable/ext/functions/yolox.py +92 -0
- pixeltable/func/aggregate_function.py +15 -15
- pixeltable/func/expr_template_function.py +9 -1
- pixeltable/func/globals.py +24 -14
- pixeltable/func/signature.py +18 -12
- pixeltable/func/udf.py +7 -2
- pixeltable/functions/__init__.py +9 -9
- pixeltable/functions/eval.py +7 -8
- pixeltable/functions/fireworks.py +10 -37
- pixeltable/functions/huggingface.py +47 -19
- pixeltable/functions/openai.py +192 -24
- pixeltable/functions/together.py +104 -9
- pixeltable/functions/util.py +11 -0
- pixeltable/index/__init__.py +2 -0
- pixeltable/index/base.py +49 -0
- pixeltable/index/embedding_index.py +95 -0
- pixeltable/metadata/schema.py +45 -22
- pixeltable/plan.py +15 -34
- pixeltable/store.py +38 -41
- pixeltable/tests/conftest.py +8 -14
- pixeltable/tests/ext/test_yolox.py +21 -0
- pixeltable/tests/functions/test_fireworks.py +43 -0
- pixeltable/tests/functions/test_functions.py +60 -0
- pixeltable/tests/{test_functions.py → functions/test_huggingface.py} +7 -143
- pixeltable/tests/functions/test_openai.py +162 -0
- pixeltable/tests/functions/test_together.py +112 -0
- pixeltable/tests/test_component_view.py +14 -5
- pixeltable/tests/test_dataframe.py +23 -22
- pixeltable/tests/test_exprs.py +99 -102
- pixeltable/tests/test_function.py +51 -43
- pixeltable/tests/test_index.py +138 -0
- pixeltable/tests/test_migration.py +2 -1
- pixeltable/tests/test_snapshot.py +24 -1
- pixeltable/tests/test_table.py +205 -26
- pixeltable/tests/test_types.py +30 -0
- pixeltable/tests/test_video.py +16 -16
- pixeltable/tests/test_view.py +5 -0
- pixeltable/tests/utils.py +171 -14
- pixeltable/tool/create_test_db_dump.py +16 -0
- pixeltable/type_system.py +77 -128
- pixeltable/utils/arrow.py +98 -0
- pixeltable/utils/hf_datasets.py +157 -0
- pixeltable/utils/parquet.py +68 -27
- pixeltable/utils/pytorch.py +16 -97
- {pixeltable-0.2.3.dist-info → pixeltable-0.2.5.dist-info}/METADATA +35 -28
- {pixeltable-0.2.3.dist-info → pixeltable-0.2.5.dist-info}/RECORD +63 -50
- {pixeltable-0.2.3.dist-info → pixeltable-0.2.5.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.3.dist-info → pixeltable-0.2.5.dist-info}/WHEEL +0 -0
pixeltable/tests/utils.py
CHANGED
@@ -2,9 +2,11 @@ import datetime
 import glob
 import json
 import os
+from collections import namedtuple
 from pathlib import Path
-from typing import
+from typing import Any, Dict, List, Optional, Set

+import PIL.Image
 import numpy as np
 import pandas as pd
 import pytest
@@ -12,12 +14,22 @@ import pytest
 import pixeltable as pxt
 import pixeltable.type_system as ts
 from pixeltable import catalog
+from pixeltable.catalog.globals import UpdateStatus
 from pixeltable.dataframe import DataFrameResultSet
 from pixeltable.env import Env
-from pixeltable.
-
-
-
+from pixeltable.functions.huggingface import clip_image, clip_text
+from pixeltable.type_system import (
+    ArrayType,
+    BoolType,
+    ColumnType,
+    FloatType,
+    ImageType,
+    IntType,
+    JsonType,
+    StringType,
+    TimestampType,
+    VideoType,
+)


 def make_default_type(t: ColumnType.Type) -> ColumnType:
@@ -33,6 +45,7 @@ def make_default_type(t: ColumnType.Type) -> ColumnType:
         return TimestampType()
     assert False

+
 def make_tbl(cl: pxt.Client, name: str = 'test', col_names: Optional[List[str]] = None) -> catalog.InsertableTable:
     if col_names is None:
         col_names = ['c1']
@@ -41,7 +54,9 @@ def make_tbl(cl: pxt.Client, name: str = 'test', col_names: Optional[List[str]]
         schema[f'{col_name}'] = make_default_type(ColumnType.Type(i % 5))
     return cl.create_table(name, schema)

-
+
+def create_table_data(t: catalog.Table, col_names: Optional[List[str]] = None, num_rows: int = 10) -> List[
+        Dict[str, Any]]:
     if col_names is None:
         col_names = []
     data: Dict[str, Any] = {}
@@ -114,6 +129,7 @@ def create_table_data(t: catalog.Table, col_names: Optional[List[str]] = None, n
     rows = [{col_name: data[col_name][i] for col_name in col_names} for i in range(num_rows)]
     return rows

+
 def create_test_tbl(client: pxt.Client, name: str = 'test_tbl') -> catalog.Table:
     schema = {
         'c1': StringType(nullable=False),
@@ -179,12 +195,25 @@ def create_test_tbl(client: pxt.Client, name: str = 'test_tbl') -> catalog.Table
     t.insert(rows)
     return t

+
+def create_img_tbl(cl: pxt.Client, name: str = 'test_img_tbl') -> catalog.Table:
+    schema = {
+        'img': ImageType(nullable=False),
+        'category': StringType(nullable=False),
+        'split': StringType(nullable=False),
+    }
+    tbl = cl.create_table(name, schema)
+    rows = read_data_file('imagenette2-160', 'manifest.csv', ['img'])
+    tbl.insert(rows)
+    return tbl
+
+
 def create_all_datatypes_tbl(test_client: pxt.Client) -> catalog.Table:
     """ Creates a table with all supported datatypes.
     """
     schema = {
-        'row_id': IntType(nullable=False),
-        'c_array': ArrayType(shape=(10,),
+        'row_id': IntType(nullable=False),  # used for row selection
+        'c_array': ArrayType(shape=(10,), dtype=FloatType(), nullable=True),
         'c_bool': BoolType(nullable=True),
         'c_float': FloatType(nullable=True),
         'c_image': ImageType(nullable=True),
@@ -197,12 +226,13 @@ def create_all_datatypes_tbl(test_client: pxt.Client) -> catalog.Table:
     tbl = test_client.create_table('all_datatype_tbl', schema)
     example_rows = create_table_data(tbl, num_rows=11)

-    for i,r in enumerate(example_rows):
-        r['row_id'] = i
+    for i, r in enumerate(example_rows):
+        r['row_id'] = i  # row_id

     tbl.insert(example_rows)
     return tbl

+
 def read_data_file(dir_name: str, file_name: str, path_col_names: Optional[List[str]] = None) -> List[Dict[str, Any]]:
     """
     Locate dir_name, create df out of file_name.
@@ -213,7 +243,7 @@ def read_data_file(dir_name: str, file_name: str, path_col_names: Optional[List[
     """
     if path_col_names is None:
         path_col_names = []
-    tests_dir = os.path.dirname(__file__)
+    tests_dir = os.path.dirname(__file__)  # search with respect to tests/ dir
     glob_result = glob.glob(f'{tests_dir}/**/{dir_name}', recursive=True)
     assert len(glob_result) == 1, f'Could not find {dir_name}'
     abs_path = Path(glob_result[0])
@@ -225,8 +255,9 @@ def read_data_file(dir_name: str, file_name: str, path_col_names: Optional[List[
         df[col_name] = df.apply(lambda r: str(abs_path / r[col_name]), axis=1)
     return df.to_dict(orient='records')

+
 def get_video_files(include_bad_video: bool = False) -> List[str]:
-    tests_dir = os.path.dirname(__file__)
+    tests_dir = os.path.dirname(__file__)  # search with respect to tests/ dir
     glob_result = glob.glob(f'{tests_dir}/**/videos/*', recursive=True)
     if not include_bad_video:
         glob_result = [f for f in glob_result if 'bad_video' not in f]
@@ -234,18 +265,21 @@ def get_video_files(include_bad_video: bool = False) -> List[str]:
     half_res = [f for f in glob_result if 'half_res' in f or 'bad_video' in f]
     return half_res

+
 def get_test_video_files() -> List[str]:
-    tests_dir = os.path.dirname(__file__)
+    tests_dir = os.path.dirname(__file__)  # search with respect to tests/ dir
     glob_result = glob.glob(f'{tests_dir}/**/test_videos/*', recursive=True)
     return glob_result

+
 def get_image_files(include_bad_image: bool = False) -> List[str]:
-    tests_dir = os.path.dirname(__file__)
+    tests_dir = os.path.dirname(__file__)  # search with respect to tests/ dir
     glob_result = glob.glob(f'{tests_dir}/**/imagenette2-160/*', recursive=True)
     if not include_bad_image:
         glob_result = [f for f in glob_result if 'bad_image' not in f]
     return glob_result

+
 def get_audio_files(include_bad_audio: bool = False) -> List[str]:
     tests_dir = os.path.dirname(__file__)
     glob_result = glob.glob(f'{tests_dir}/**/audio/*', recursive=True)
@@ -253,11 +287,13 @@ def get_audio_files(include_bad_audio: bool = False) -> List[str]:
         glob_result = [f for f in glob_result if 'bad_audio' not in f]
     return glob_result

+
 def get_documents() -> List[str]:
     tests_dir = os.path.dirname(__file__)
     # for now, we can only handle .html and .md
     return [p for p in glob.glob(f'{tests_dir}/**/documents/*', recursive=True) if not p.endswith('.pdf')]

+
 def get_sentences(n: int = 100) -> List[str]:
     tests_dir = os.path.dirname(__file__)
     path = glob.glob(f'{tests_dir}/**/jeopardy.json', recursive=True)[0]
@@ -266,6 +302,7 @@ def get_sentences(n: int = 100) -> List[str]:
     # this dataset contains \' around the questions
     return [q['question'].replace("'", '') for q in questions_list[:n]]

+
 def assert_resultset_eq(r1: DataFrameResultSet, r2: DataFrameResultSet) -> None:
     assert len(r1) == len(r2)
     assert len(r1.column_names()) == len(r2.column_names())  # we don't care about the actual column names
@@ -280,6 +317,126 @@ def assert_resultset_eq(r1: DataFrameResultSet, r2: DataFrameResultSet) -> None:
     else:
         assert s1.equals(s2)

+
 def skip_test_if_not_installed(package) -> None:
     if not Env.get().is_installed_package(package):
         pytest.skip(f'Package `{package}` is not installed.')
+
+
+def validate_update_status(status: UpdateStatus, expected_rows: Optional[int] = None) -> None:
+    assert status.num_excs == 0
+    if expected_rows is not None:
+        assert status.num_rows == expected_rows
+
+
+def make_test_arrow_table(output_path: Path) -> None:
+    import pyarrow as pa
+
+    value_dict = {
+        'c_id': [1, 2, 3, 4, 5],
+        'c_int64': [-10, -20, -30, -40, None],
+        'c_int32': [-1, -2, -3, -4, None],
+        'c_float32': [1.1, 2.2, 3.3, 4.4, None],
+        'c_string': ['aaa', 'bbb', 'ccc', 'ddd', None],
+        'c_boolean': [True, False, True, False, None],
+        'c_timestamp': [
+            datetime.datetime(2012, 1, 1, 12, 0, 0, 25),
+            datetime.datetime(2012, 1, 2, 12, 0, 0, 25),
+            datetime.datetime(2012, 1, 3, 12, 0, 0, 25),
+            datetime.datetime(2012, 1, 4, 12, 0, 0, 25),
+            None,
+        ],
+        # The pyarrow fixed_shape_tensor type does not support NULLs (currently can write them but not read them)
+        # So, no nulls in this column
+        'c_array_float32': [
+            [
+                1.0,
+                2.0,
+            ],
+            [
+                10.0,
+                20.0,
+            ],
+            [
+                100.0,
+                200.0,
+            ],
+            [
+                1000.0,
+                2000.0,
+            ],
+            [10000.0, 20000.0],
+        ],
+    }
+
+    arr_size = len(value_dict['c_array_float32'][0])
+    tensor_type = pa.fixed_shape_tensor(pa.float32(), (arr_size,))
+
+    schema = pa.schema(
+        [
+            ('c_id', pa.int32()),
+            ('c_int64', pa.int64()),
+            ('c_int32', pa.int32()),
+            ('c_float32', pa.float32()),
+            ('c_string', pa.string()),
+            ('c_boolean', pa.bool_()),
+            ('c_timestamp', pa.timestamp('us')),
+            ('c_array_float32', tensor_type),
+        ]
+    )
+
+    test_table = pa.Table.from_pydict(value_dict, schema=schema)
+    pa.parquet.write_table(test_table, str(output_path / 'test.parquet'))
+
+
+def assert_hf_dataset_equal(hf_dataset: 'datasets.Dataset', df: pxt.DataFrame, split_column_name: str) -> None:
+    import datasets
+    assert df.count() == hf_dataset.num_rows
+    assert set(df.get_column_names()) == (set(hf_dataset.features.keys()) | {split_column_name})
+
+    # immutable so we can use it as in a set
+    DatasetTuple = namedtuple('DatasetTuple', ' '.join(hf_dataset.features.keys()))
+    acc_dataset: Set[DatasetTuple] = set()
+    for tup in hf_dataset:
+        immutable_tup = {}
+        for k in tup:
+            if isinstance(tup[k], list):
+                immutable_tup[k] = tuple(tup[k])
+            else:
+                immutable_tup[k] = tup[k]
+
+        acc_dataset.add(DatasetTuple(**immutable_tup))
+
+    for tup in df.collect():
+        assert tup[split_column_name] in hf_dataset.split._name
+
+        encoded_tup = {}
+        for column_name, value in tup.items():
+            if column_name == split_column_name:
+                continue
+            feature_type = hf_dataset.features[column_name]
+            if isinstance(feature_type, datasets.ClassLabel):
+                assert value in feature_type.names
+                # must use the index of the class label as the value to
+                # compare with dataset iteration output.
+                value = feature_type.encode_example(value)
+            elif isinstance(feature_type, datasets.Sequence):
+                assert feature_type.feature.dtype == 'float32', 'may need to add more types'
+                value = tuple([float(x) for x in value])
+
+            encoded_tup[column_name] = value
+
+        check_tup = DatasetTuple(**encoded_tup)
+        assert check_tup in acc_dataset
+
+@pxt.expr_udf
+def img_embed(img: PIL.Image.Image) -> np.ndarray:
+    return clip_image(img, model_id='openai/clip-vit-base-patch32')
+
+@pxt.expr_udf
+def text_embed(txt: str) -> np.ndarray:
+    return clip_text(txt, model_id='openai/clip-vit-base-patch32')
+
+SAMPLE_IMAGE_URL = \
+    'https://raw.githubusercontent.com/pixeltable/pixeltable/master/docs/source/data/images/000000000009.jpg'
+
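Note: the helpers added above (validate_update_status, create_img_tbl, make_test_arrow_table, assert_hf_dataset_equal, and the img_embed/text_embed expr_udfs) are shared across the new 0.2.5 tests. A minimal sketch of how a test might combine them, assuming the pytest fixture name test_client used elsewhere in the suite and that Table.insert() returns the UpdateStatus consumed by validate_update_status; the table name is illustrative, not taken from the diff:

    import pixeltable as pxt
    from pixeltable.tests.utils import create_table_data, make_tbl, validate_update_status

    def test_insert_and_validate(test_client: pxt.Client) -> None:
        # make_tbl and create_table_data are the pre-existing helpers shown above
        t = make_tbl(test_client, 'demo_tbl', ['c1', 'c2'])
        rows = create_table_data(t, num_rows=5)
        status = t.insert(rows)  # assumed to return a catalog UpdateStatus in 0.2.5
        validate_update_status(status, expected_rows=5)  # new helper: no per-row exceptions, expected row count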
pixeltable/tool/create_test_db_dump.py
CHANGED

@@ -136,6 +136,22 @@ class Dumper:
             for i in range(num_rows)
         ]
         t.insert(rows)
+        self.cl.create_dir('views')
+        v = self.cl.create_view('views.sample_view', t, filter=(t.c2 < 50))
+        _ = self.cl.create_view('views.sample_snapshot', t, filter=(t.c2 >= 75), is_snapshot=True)
+        # Computed column using a library function
+        v['str_format'] = pxt.functions.string.str_format('{0} {key}', t.c1, key=t.c1)
+        # Computed column using a bespoke udf
+        v['test_udf'] = test_udf(t.c2)
+        # astype
+        v['astype'] = t.c1.astype(pxt.FloatType())
+        # computed column using a stored function
+        v['stored'] = t.c1.apply(lambda x: f'Hello, {x}', col_type=pxt.StringType())
+
+
+@pxt.udf
+def test_udf(n: int) -> int:
+    return n + 1


 def main() -> None: