pixeltable 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff compares publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.

Potentially problematic release.

This version of pixeltable might be problematic.
Files changed (63)
  1. pixeltable/catalog/column.py +26 -49
  2. pixeltable/catalog/insertable_table.py +7 -4
  3. pixeltable/catalog/table.py +163 -57
  4. pixeltable/catalog/table_version.py +416 -140
  5. pixeltable/catalog/table_version_path.py +2 -2
  6. pixeltable/client.py +72 -6
  7. pixeltable/dataframe.py +65 -21
  8. pixeltable/env.py +52 -53
  9. pixeltable/exec/cache_prefetch_node.py +1 -1
  10. pixeltable/exec/in_memory_data_node.py +11 -7
  11. pixeltable/exprs/comparison.py +3 -3
  12. pixeltable/exprs/data_row.py +5 -1
  13. pixeltable/exprs/literal.py +16 -4
  14. pixeltable/exprs/row_builder.py +8 -40
  15. pixeltable/ext/__init__.py +5 -0
  16. pixeltable/ext/functions/yolox.py +92 -0
  17. pixeltable/func/aggregate_function.py +15 -15
  18. pixeltable/func/expr_template_function.py +9 -1
  19. pixeltable/func/globals.py +24 -14
  20. pixeltable/func/signature.py +18 -12
  21. pixeltable/func/udf.py +7 -2
  22. pixeltable/functions/__init__.py +9 -9
  23. pixeltable/functions/eval.py +7 -8
  24. pixeltable/functions/fireworks.py +10 -37
  25. pixeltable/functions/huggingface.py +47 -19
  26. pixeltable/functions/openai.py +192 -24
  27. pixeltable/functions/together.py +104 -9
  28. pixeltable/functions/util.py +11 -0
  29. pixeltable/index/__init__.py +2 -0
  30. pixeltable/index/base.py +49 -0
  31. pixeltable/index/embedding_index.py +95 -0
  32. pixeltable/metadata/schema.py +45 -22
  33. pixeltable/plan.py +15 -34
  34. pixeltable/store.py +38 -41
  35. pixeltable/tests/conftest.py +8 -14
  36. pixeltable/tests/ext/test_yolox.py +21 -0
  37. pixeltable/tests/functions/test_fireworks.py +43 -0
  38. pixeltable/tests/functions/test_functions.py +60 -0
  39. pixeltable/tests/{test_functions.py → functions/test_huggingface.py} +7 -143
  40. pixeltable/tests/functions/test_openai.py +162 -0
  41. pixeltable/tests/functions/test_together.py +112 -0
  42. pixeltable/tests/test_component_view.py +14 -5
  43. pixeltable/tests/test_dataframe.py +23 -22
  44. pixeltable/tests/test_exprs.py +99 -102
  45. pixeltable/tests/test_function.py +51 -43
  46. pixeltable/tests/test_index.py +138 -0
  47. pixeltable/tests/test_migration.py +2 -1
  48. pixeltable/tests/test_snapshot.py +24 -1
  49. pixeltable/tests/test_table.py +205 -26
  50. pixeltable/tests/test_types.py +30 -0
  51. pixeltable/tests/test_video.py +16 -16
  52. pixeltable/tests/test_view.py +5 -0
  53. pixeltable/tests/utils.py +171 -14
  54. pixeltable/tool/create_test_db_dump.py +16 -0
  55. pixeltable/type_system.py +77 -128
  56. pixeltable/utils/arrow.py +98 -0
  57. pixeltable/utils/hf_datasets.py +157 -0
  58. pixeltable/utils/parquet.py +68 -27
  59. pixeltable/utils/pytorch.py +16 -97
  60. {pixeltable-0.2.3.dist-info → pixeltable-0.2.5.dist-info}/METADATA +35 -28
  61. {pixeltable-0.2.3.dist-info → pixeltable-0.2.5.dist-info}/RECORD +63 -50
  62. {pixeltable-0.2.3.dist-info → pixeltable-0.2.5.dist-info}/LICENSE +0 -0
  63. {pixeltable-0.2.3.dist-info → pixeltable-0.2.5.dist-info}/WHEEL +0 -0
pixeltable/tests/utils.py CHANGED
@@ -2,9 +2,11 @@ import datetime
 import glob
 import json
 import os
+from collections import namedtuple
 from pathlib import Path
-from typing import Dict, Any, List, Optional
+from typing import Any, Dict, List, Optional, Set
 
+import PIL.Image
 import numpy as np
 import pandas as pd
 import pytest
@@ -12,12 +14,22 @@ import pytest
 import pixeltable as pxt
 import pixeltable.type_system as ts
 from pixeltable import catalog
+from pixeltable.catalog.globals import UpdateStatus
 from pixeltable.dataframe import DataFrameResultSet
 from pixeltable.env import Env
-from pixeltable.type_system import \
-    ColumnType, StringType, IntType, FloatType, ArrayType, BoolType, TimestampType, JsonType, ImageType, VideoType
-
-
+from pixeltable.functions.huggingface import clip_image, clip_text
+from pixeltable.type_system import (
+    ArrayType,
+    BoolType,
+    ColumnType,
+    FloatType,
+    ImageType,
+    IntType,
+    JsonType,
+    StringType,
+    TimestampType,
+    VideoType,
+)
 
 
 def make_default_type(t: ColumnType.Type) -> ColumnType:
@@ -33,6 +45,7 @@ def make_default_type(t: ColumnType.Type) -> ColumnType:
         return TimestampType()
     assert False
 
+
 def make_tbl(cl: pxt.Client, name: str = 'test', col_names: Optional[List[str]] = None) -> catalog.InsertableTable:
     if col_names is None:
         col_names = ['c1']
@@ -41,7 +54,9 @@ def make_tbl(cl: pxt.Client, name: str = 'test', col_names: Optional[List[str]]
         schema[f'{col_name}'] = make_default_type(ColumnType.Type(i % 5))
     return cl.create_table(name, schema)
 
-def create_table_data(t: catalog.Table, col_names: Optional[List[str]] = None, num_rows: int = 10) -> List[Dict[str, Any]]:
+
+def create_table_data(t: catalog.Table, col_names: Optional[List[str]] = None, num_rows: int = 10) -> List[
+    Dict[str, Any]]:
     if col_names is None:
         col_names = []
     data: Dict[str, Any] = {}
@@ -114,6 +129,7 @@ def create_table_data(t: catalog.Table, col_names: Optional[List[str]] = None, n
     rows = [{col_name: data[col_name][i] for col_name in col_names} for i in range(num_rows)]
     return rows
 
+
 def create_test_tbl(client: pxt.Client, name: str = 'test_tbl') -> catalog.Table:
     schema = {
         'c1': StringType(nullable=False),
@@ -179,12 +195,25 @@ def create_test_tbl(client: pxt.Client, name: str = 'test_tbl') -> catalog.Table
     t.insert(rows)
     return t
 
+
+def create_img_tbl(cl: pxt.Client, name: str = 'test_img_tbl') -> catalog.Table:
+    schema = {
+        'img': ImageType(nullable=False),
+        'category': StringType(nullable=False),
+        'split': StringType(nullable=False),
+    }
+    tbl = cl.create_table(name, schema)
+    rows = read_data_file('imagenette2-160', 'manifest.csv', ['img'])
+    tbl.insert(rows)
+    return tbl
+
+
 def create_all_datatypes_tbl(test_client: pxt.Client) -> catalog.Table:
     """ Creates a table with all supported datatypes.
     """
     schema = {
-        'row_id': IntType(nullable=False), # used for row selection
-        'c_array': ArrayType(shape=(10,), dtype=FloatType(), nullable=True),
+        'row_id': IntType(nullable=False),  # used for row selection
+        'c_array': ArrayType(shape=(10,), dtype=FloatType(), nullable=True),
         'c_bool': BoolType(nullable=True),
         'c_float': FloatType(nullable=True),
         'c_image': ImageType(nullable=True),
@@ -197,12 +226,13 @@ def create_all_datatypes_tbl(test_client: pxt.Client) -> catalog.Table:
     tbl = test_client.create_table('all_datatype_tbl', schema)
     example_rows = create_table_data(tbl, num_rows=11)
 
-    for i,r in enumerate(example_rows):
-        r['row_id'] = i # row_id
+    for i, r in enumerate(example_rows):
+        r['row_id'] = i  # row_id
 
     tbl.insert(example_rows)
     return tbl
 
+
 def read_data_file(dir_name: str, file_name: str, path_col_names: Optional[List[str]] = None) -> List[Dict[str, Any]]:
     """
     Locate dir_name, create df out of file_name.
@@ -213,7 +243,7 @@ def read_data_file(dir_name: str, file_name: str, path_col_names: Optional[List[
     """
     if path_col_names is None:
         path_col_names = []
-    tests_dir = os.path.dirname(__file__) # search with respect to tests/ dir
+    tests_dir = os.path.dirname(__file__)  # search with respect to tests/ dir
     glob_result = glob.glob(f'{tests_dir}/**/{dir_name}', recursive=True)
     assert len(glob_result) == 1, f'Could not find {dir_name}'
     abs_path = Path(glob_result[0])
@@ -225,8 +255,9 @@ def read_data_file(dir_name: str, file_name: str, path_col_names: Optional[List[
         df[col_name] = df.apply(lambda r: str(abs_path / r[col_name]), axis=1)
     return df.to_dict(orient='records')
 
+
 def get_video_files(include_bad_video: bool = False) -> List[str]:
-    tests_dir = os.path.dirname(__file__) # search with respect to tests/ dir
+    tests_dir = os.path.dirname(__file__)  # search with respect to tests/ dir
     glob_result = glob.glob(f'{tests_dir}/**/videos/*', recursive=True)
     if not include_bad_video:
         glob_result = [f for f in glob_result if 'bad_video' not in f]
@@ -234,18 +265,21 @@ def get_video_files(include_bad_video: bool = False) -> List[str]:
     half_res = [f for f in glob_result if 'half_res' in f or 'bad_video' in f]
     return half_res
 
+
 def get_test_video_files() -> List[str]:
-    tests_dir = os.path.dirname(__file__) # search with respect to tests/ dir
+    tests_dir = os.path.dirname(__file__)  # search with respect to tests/ dir
    glob_result = glob.glob(f'{tests_dir}/**/test_videos/*', recursive=True)
     return glob_result
 
+
 def get_image_files(include_bad_image: bool = False) -> List[str]:
-    tests_dir = os.path.dirname(__file__) # search with respect to tests/ dir
+    tests_dir = os.path.dirname(__file__)  # search with respect to tests/ dir
     glob_result = glob.glob(f'{tests_dir}/**/imagenette2-160/*', recursive=True)
     if not include_bad_image:
         glob_result = [f for f in glob_result if 'bad_image' not in f]
     return glob_result
 
+
 def get_audio_files(include_bad_audio: bool = False) -> List[str]:
     tests_dir = os.path.dirname(__file__)
     glob_result = glob.glob(f'{tests_dir}/**/audio/*', recursive=True)
@@ -253,11 +287,13 @@ def get_audio_files(include_bad_audio: bool = False) -> List[str]:
         glob_result = [f for f in glob_result if 'bad_audio' not in f]
     return glob_result
 
+
 def get_documents() -> List[str]:
     tests_dir = os.path.dirname(__file__)
     # for now, we can only handle .html and .md
     return [p for p in glob.glob(f'{tests_dir}/**/documents/*', recursive=True) if not p.endswith('.pdf')]
 
+
 def get_sentences(n: int = 100) -> List[str]:
     tests_dir = os.path.dirname(__file__)
     path = glob.glob(f'{tests_dir}/**/jeopardy.json', recursive=True)[0]
@@ -266,6 +302,7 @@ def get_sentences(n: int = 100) -> List[str]:
     # this dataset contains \' around the questions
     return [q['question'].replace("'", '') for q in questions_list[:n]]
 
+
 def assert_resultset_eq(r1: DataFrameResultSet, r2: DataFrameResultSet) -> None:
     assert len(r1) == len(r2)
     assert len(r1.column_names()) == len(r2.column_names())  # we don't care about the actual column names
@@ -280,6 +317,126 @@ def assert_resultset_eq(r1: DataFrameResultSet, r2: DataFrameResultSet) -> None:
         else:
             assert s1.equals(s2)
 
+
 def skip_test_if_not_installed(package) -> None:
     if not Env.get().is_installed_package(package):
         pytest.skip(f'Package `{package}` is not installed.')
+
+
+def validate_update_status(status: UpdateStatus, expected_rows: Optional[int] = None) -> None:
+    assert status.num_excs == 0
+    if expected_rows is not None:
+        assert status.num_rows == expected_rows
+
+
+def make_test_arrow_table(output_path: Path) -> None:
+    import pyarrow as pa
+
+    value_dict = {
+        'c_id': [1, 2, 3, 4, 5],
+        'c_int64': [-10, -20, -30, -40, None],
+        'c_int32': [-1, -2, -3, -4, None],
+        'c_float32': [1.1, 2.2, 3.3, 4.4, None],
+        'c_string': ['aaa', 'bbb', 'ccc', 'ddd', None],
+        'c_boolean': [True, False, True, False, None],
+        'c_timestamp': [
+            datetime.datetime(2012, 1, 1, 12, 0, 0, 25),
+            datetime.datetime(2012, 1, 2, 12, 0, 0, 25),
+            datetime.datetime(2012, 1, 3, 12, 0, 0, 25),
+            datetime.datetime(2012, 1, 4, 12, 0, 0, 25),
+            None,
+        ],
+        # The pyarrow fixed_shape_tensor type does not support NULLs (currently can write them but not read them)
+        # So, no nulls in this column
+        'c_array_float32': [
+            [
+                1.0,
+                2.0,
+            ],
+            [
+                10.0,
+                20.0,
+            ],
+            [
+                100.0,
+                200.0,
+            ],
+            [
+                1000.0,
+                2000.0,
+            ],
+            [10000.0, 20000.0],
+        ],
+    }
+
+    arr_size = len(value_dict['c_array_float32'][0])
+    tensor_type = pa.fixed_shape_tensor(pa.float32(), (arr_size,))
+
+    schema = pa.schema(
+        [
+            ('c_id', pa.int32()),
+            ('c_int64', pa.int64()),
+            ('c_int32', pa.int32()),
+            ('c_float32', pa.float32()),
+            ('c_string', pa.string()),
+            ('c_boolean', pa.bool_()),
+            ('c_timestamp', pa.timestamp('us')),
+            ('c_array_float32', tensor_type),
+        ]
+    )
+
+    test_table = pa.Table.from_pydict(value_dict, schema=schema)
+    pa.parquet.write_table(test_table, str(output_path / 'test.parquet'))
+
+
+def assert_hf_dataset_equal(hf_dataset: 'datasets.Dataset', df: pxt.DataFrame, split_column_name: str) -> None:
+    import datasets
+    assert df.count() == hf_dataset.num_rows
+    assert set(df.get_column_names()) == (set(hf_dataset.features.keys()) | {split_column_name})
+
+    # immutable so we can use it as in a set
+    DatasetTuple = namedtuple('DatasetTuple', ' '.join(hf_dataset.features.keys()))
+    acc_dataset: Set[DatasetTuple] = set()
+    for tup in hf_dataset:
+        immutable_tup = {}
+        for k in tup:
+            if isinstance(tup[k], list):
+                immutable_tup[k] = tuple(tup[k])
+            else:
+                immutable_tup[k] = tup[k]
+
+        acc_dataset.add(DatasetTuple(**immutable_tup))
+
+    for tup in df.collect():
+        assert tup[split_column_name] in hf_dataset.split._name
+
+        encoded_tup = {}
+        for column_name, value in tup.items():
+            if column_name == split_column_name:
+                continue
+            feature_type = hf_dataset.features[column_name]
+            if isinstance(feature_type, datasets.ClassLabel):
+                assert value in feature_type.names
+                # must use the index of the class label as the value to
+                # compare with dataset iteration output.
+                value = feature_type.encode_example(value)
+            elif isinstance(feature_type, datasets.Sequence):
+                assert feature_type.feature.dtype == 'float32', 'may need to add more types'
+                value = tuple([float(x) for x in value])
+
+            encoded_tup[column_name] = value
+
+        check_tup = DatasetTuple(**encoded_tup)
+        assert check_tup in acc_dataset
+
+
+@pxt.expr_udf
+def img_embed(img: PIL.Image.Image) -> np.ndarray:
+    return clip_image(img, model_id='openai/clip-vit-base-patch32')
+
+
+@pxt.expr_udf
+def text_embed(txt: str) -> np.ndarray:
+    return clip_text(txt, model_id='openai/clip-vit-base-patch32')
+
+
+SAMPLE_IMAGE_URL = \
+    'https://raw.githubusercontent.com/pixeltable/pixeltable/master/docs/source/data/images/000000000009.jpg'
+
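Note on the new helpers: `validate_update_status` assumes that mutating table operations return an `UpdateStatus` (imported above from `pixeltable.catalog.globals`) carrying `num_rows` and `num_excs`. A minimal sketch of how a test might use it, under that assumption; the table name and schema below are illustrative only, not taken from the package:

    import pixeltable as pxt
    from pixeltable.tests.utils import validate_update_status
    from pixeltable.type_system import IntType, StringType

    cl = pxt.Client()
    # hypothetical demo table; create_table(name, schema_dict) matches the calls in utils.py above
    t = cl.create_table('demo_tbl', {'name': StringType(nullable=False), 'n': IntType(nullable=False)})

    # insert() is assumed to return the UpdateStatus consumed by the helper
    status = t.insert([{'name': 'a', 'n': 1}, {'name': 'b', 'n': 2}])
    validate_update_status(status, expected_rows=2)

The `img_embed`/`text_embed` expr UDFs wrap the CLIP functions from `pixeltable.functions.huggingface`; they presumably back the new embedding-index code and tests added in this release (`pixeltable/index/embedding_index.py`, `pixeltable/tests/test_index.py`).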
pixeltable/tool/create_test_db_dump.py CHANGED
@@ -136,6 +136,22 @@ class Dumper:
             for i in range(num_rows)
         ]
         t.insert(rows)
+        self.cl.create_dir('views')
+        v = self.cl.create_view('views.sample_view', t, filter=(t.c2 < 50))
+        _ = self.cl.create_view('views.sample_snapshot', t, filter=(t.c2 >= 75), is_snapshot=True)
+        # Computed column using a library function
+        v['str_format'] = pxt.functions.string.str_format('{0} {key}', t.c1, key=t.c1)
+        # Computed column using a bespoke udf
+        v['test_udf'] = test_udf(t.c2)
+        # astype
+        v['astype'] = t.c1.astype(pxt.FloatType())
+        # computed column using a stored function
+        v['stored'] = t.c1.apply(lambda x: f'Hello, {x}', col_type=pxt.StringType())
+
+
+@pxt.udf
+def test_udf(n: int) -> int:
+    return n + 1
 
 
 def main() -> None:
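For reference, a minimal sketch of the computed-column patterns the dump tool now exercises, written against the same calls the added lines use; the table, view, and column names below are illustrative only:

    import pixeltable as pxt
    from pixeltable.type_system import IntType, StringType

    cl = pxt.Client()
    t = cl.create_table('demo', {'c1': StringType(nullable=False), 'c2': IntType(nullable=False)})
    # view defined by a filter predicate over the base table
    v = cl.create_view('demo_view', t, filter=(t.c2 < 50))

    # computed column from a library function
    v['str_format'] = pxt.functions.string.str_format('{0} {key}', t.c1, key=t.c1)
    # type cast, and a Python callable applied row-wise (stored function)
    v['astype'] = t.c1.astype(pxt.FloatType())
    v['stored'] = t.c1.apply(lambda x: f'Hello, {x}', col_type=pxt.StringType())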