pixeltable 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff compares publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.

Potentially problematic release.

This version of pixeltable might be problematic.
Files changed (63)
  1. pixeltable/catalog/column.py +26 -49
  2. pixeltable/catalog/insertable_table.py +7 -4
  3. pixeltable/catalog/table.py +163 -57
  4. pixeltable/catalog/table_version.py +416 -140
  5. pixeltable/catalog/table_version_path.py +2 -2
  6. pixeltable/client.py +72 -6
  7. pixeltable/dataframe.py +65 -21
  8. pixeltable/env.py +52 -53
  9. pixeltable/exec/cache_prefetch_node.py +1 -1
  10. pixeltable/exec/in_memory_data_node.py +11 -7
  11. pixeltable/exprs/comparison.py +3 -3
  12. pixeltable/exprs/data_row.py +5 -1
  13. pixeltable/exprs/literal.py +16 -4
  14. pixeltable/exprs/row_builder.py +8 -40
  15. pixeltable/ext/__init__.py +5 -0
  16. pixeltable/ext/functions/yolox.py +92 -0
  17. pixeltable/func/aggregate_function.py +15 -15
  18. pixeltable/func/expr_template_function.py +9 -1
  19. pixeltable/func/globals.py +24 -14
  20. pixeltable/func/signature.py +18 -12
  21. pixeltable/func/udf.py +7 -2
  22. pixeltable/functions/__init__.py +9 -9
  23. pixeltable/functions/eval.py +7 -8
  24. pixeltable/functions/fireworks.py +10 -37
  25. pixeltable/functions/huggingface.py +47 -19
  26. pixeltable/functions/openai.py +192 -24
  27. pixeltable/functions/together.py +104 -9
  28. pixeltable/functions/util.py +11 -0
  29. pixeltable/index/__init__.py +2 -0
  30. pixeltable/index/base.py +49 -0
  31. pixeltable/index/embedding_index.py +95 -0
  32. pixeltable/metadata/schema.py +45 -22
  33. pixeltable/plan.py +15 -34
  34. pixeltable/store.py +38 -41
  35. pixeltable/tests/conftest.py +8 -14
  36. pixeltable/tests/ext/test_yolox.py +21 -0
  37. pixeltable/tests/functions/test_fireworks.py +43 -0
  38. pixeltable/tests/functions/test_functions.py +60 -0
  39. pixeltable/tests/{test_functions.py → functions/test_huggingface.py} +7 -143
  40. pixeltable/tests/functions/test_openai.py +162 -0
  41. pixeltable/tests/functions/test_together.py +112 -0
  42. pixeltable/tests/test_component_view.py +14 -5
  43. pixeltable/tests/test_dataframe.py +23 -22
  44. pixeltable/tests/test_exprs.py +99 -102
  45. pixeltable/tests/test_function.py +51 -43
  46. pixeltable/tests/test_index.py +138 -0
  47. pixeltable/tests/test_migration.py +2 -1
  48. pixeltable/tests/test_snapshot.py +24 -1
  49. pixeltable/tests/test_table.py +205 -26
  50. pixeltable/tests/test_types.py +30 -0
  51. pixeltable/tests/test_video.py +16 -16
  52. pixeltable/tests/test_view.py +5 -0
  53. pixeltable/tests/utils.py +171 -14
  54. pixeltable/tool/create_test_db_dump.py +16 -0
  55. pixeltable/type_system.py +77 -128
  56. pixeltable/utils/arrow.py +98 -0
  57. pixeltable/utils/hf_datasets.py +157 -0
  58. pixeltable/utils/parquet.py +68 -27
  59. pixeltable/utils/pytorch.py +16 -97
  60. {pixeltable-0.2.3.dist-info → pixeltable-0.2.5.dist-info}/METADATA +35 -28
  61. {pixeltable-0.2.3.dist-info → pixeltable-0.2.5.dist-info}/RECORD +63 -50
  62. {pixeltable-0.2.3.dist-info → pixeltable-0.2.5.dist-info}/LICENSE +0 -0
  63. {pixeltable-0.2.3.dist-info → pixeltable-0.2.5.dist-info}/WHEEL +0 -0
pixeltable/tests/utils.py CHANGED
@@ -2,9 +2,11 @@ import datetime
 import glob
 import json
 import os
+from collections import namedtuple
 from pathlib import Path
-from typing import Dict, Any, List, Optional
+from typing import Any, Dict, List, Optional, Set
 
+import PIL.Image
 import numpy as np
 import pandas as pd
 import pytest
@@ -12,12 +14,22 @@ import pytest
 import pixeltable as pxt
 import pixeltable.type_system as ts
 from pixeltable import catalog
+from pixeltable.catalog.globals import UpdateStatus
 from pixeltable.dataframe import DataFrameResultSet
 from pixeltable.env import Env
-from pixeltable.type_system import \
-    ColumnType, StringType, IntType, FloatType, ArrayType, BoolType, TimestampType, JsonType, ImageType, VideoType
-
-
+from pixeltable.functions.huggingface import clip_image, clip_text
+from pixeltable.type_system import (
+    ArrayType,
+    BoolType,
+    ColumnType,
+    FloatType,
+    ImageType,
+    IntType,
+    JsonType,
+    StringType,
+    TimestampType,
+    VideoType,
+)
 
 
 def make_default_type(t: ColumnType.Type) -> ColumnType:
@@ -33,6 +45,7 @@ def make_default_type(t: ColumnType.Type) -> ColumnType:
         return TimestampType()
     assert False
 
+
 def make_tbl(cl: pxt.Client, name: str = 'test', col_names: Optional[List[str]] = None) -> catalog.InsertableTable:
     if col_names is None:
         col_names = ['c1']
@@ -41,7 +54,9 @@ def make_tbl(cl: pxt.Client, name: str = 'test', col_names: Optional[List[str]]
         schema[f'{col_name}'] = make_default_type(ColumnType.Type(i % 5))
     return cl.create_table(name, schema)
 
-def create_table_data(t: catalog.Table, col_names: Optional[List[str]] = None, num_rows: int = 10) -> List[Dict[str, Any]]:
+
+def create_table_data(t: catalog.Table, col_names: Optional[List[str]] = None, num_rows: int = 10) -> List[
+    Dict[str, Any]]:
     if col_names is None:
         col_names = []
     data: Dict[str, Any] = {}
@@ -114,6 +129,7 @@ def create_table_data(t: catalog.Table, col_names: Optional[List[str]] = None, n
     rows = [{col_name: data[col_name][i] for col_name in col_names} for i in range(num_rows)]
     return rows
 
+
 def create_test_tbl(client: pxt.Client, name: str = 'test_tbl') -> catalog.Table:
     schema = {
         'c1': StringType(nullable=False),
@@ -179,12 +195,25 @@ def create_test_tbl(client: pxt.Client, name: str = 'test_tbl') -> catalog.Table
     t.insert(rows)
     return t
 
+
+def create_img_tbl(cl: pxt.Client, name: str = 'test_img_tbl') -> catalog.Table:
+    schema = {
+        'img': ImageType(nullable=False),
+        'category': StringType(nullable=False),
+        'split': StringType(nullable=False),
+    }
+    tbl = cl.create_table(name, schema)
+    rows = read_data_file('imagenette2-160', 'manifest.csv', ['img'])
+    tbl.insert(rows)
+    return tbl
+
+
 def create_all_datatypes_tbl(test_client: pxt.Client) -> catalog.Table:
     """ Creates a table with all supported datatypes.
     """
     schema = {
-        'row_id': IntType(nullable=False), # used for row selection
-        'c_array': ArrayType(shape=(10,), dtype=FloatType(), nullable=True),
+        'row_id': IntType(nullable=False),  # used for row selection
+        'c_array': ArrayType(shape=(10,), dtype=FloatType(), nullable=True),
         'c_bool': BoolType(nullable=True),
         'c_float': FloatType(nullable=True),
         'c_image': ImageType(nullable=True),
@@ -197,12 +226,13 @@ def create_all_datatypes_tbl(test_client: pxt.Client) -> catalog.Table:
     tbl = test_client.create_table('all_datatype_tbl', schema)
     example_rows = create_table_data(tbl, num_rows=11)
 
-    for i,r in enumerate(example_rows):
-        r['row_id'] = i # row_id
+    for i, r in enumerate(example_rows):
+        r['row_id'] = i  # row_id
 
     tbl.insert(example_rows)
     return tbl
 
+
 def read_data_file(dir_name: str, file_name: str, path_col_names: Optional[List[str]] = None) -> List[Dict[str, Any]]:
     """
     Locate dir_name, create df out of file_name.
@@ -213,7 +243,7 @@ def read_data_file(dir_name: str, file_name: str, path_col_names: Optional[List[
     """
     if path_col_names is None:
         path_col_names = []
-    tests_dir = os.path.dirname(__file__) # search with respect to tests/ dir
+    tests_dir = os.path.dirname(__file__)  # search with respect to tests/ dir
     glob_result = glob.glob(f'{tests_dir}/**/{dir_name}', recursive=True)
     assert len(glob_result) == 1, f'Could not find {dir_name}'
     abs_path = Path(glob_result[0])
@@ -225,8 +255,9 @@ def read_data_file(dir_name: str, file_name: str, path_col_names: Optional[List[
         df[col_name] = df.apply(lambda r: str(abs_path / r[col_name]), axis=1)
     return df.to_dict(orient='records')
 
+
 def get_video_files(include_bad_video: bool = False) -> List[str]:
-    tests_dir = os.path.dirname(__file__) # search with respect to tests/ dir
+    tests_dir = os.path.dirname(__file__)  # search with respect to tests/ dir
     glob_result = glob.glob(f'{tests_dir}/**/videos/*', recursive=True)
     if not include_bad_video:
         glob_result = [f for f in glob_result if 'bad_video' not in f]
@@ -234,18 +265,21 @@ def get_video_files(include_bad_video: bool = False) -> List[str]:
     half_res = [f for f in glob_result if 'half_res' in f or 'bad_video' in f]
     return half_res
 
+
 def get_test_video_files() -> List[str]:
-    tests_dir = os.path.dirname(__file__) # search with respect to tests/ dir
+    tests_dir = os.path.dirname(__file__)  # search with respect to tests/ dir
    glob_result = glob.glob(f'{tests_dir}/**/test_videos/*', recursive=True)
     return glob_result
 
+
 def get_image_files(include_bad_image: bool = False) -> List[str]:
-    tests_dir = os.path.dirname(__file__) # search with respect to tests/ dir
+    tests_dir = os.path.dirname(__file__)  # search with respect to tests/ dir
     glob_result = glob.glob(f'{tests_dir}/**/imagenette2-160/*', recursive=True)
     if not include_bad_image:
         glob_result = [f for f in glob_result if 'bad_image' not in f]
     return glob_result
 
+
 def get_audio_files(include_bad_audio: bool = False) -> List[str]:
     tests_dir = os.path.dirname(__file__)
     glob_result = glob.glob(f'{tests_dir}/**/audio/*', recursive=True)
@@ -253,11 +287,13 @@ def get_audio_files(include_bad_audio: bool = False) -> List[str]:
         glob_result = [f for f in glob_result if 'bad_audio' not in f]
     return glob_result
 
+
 def get_documents() -> List[str]:
     tests_dir = os.path.dirname(__file__)
     # for now, we can only handle .html and .md
     return [p for p in glob.glob(f'{tests_dir}/**/documents/*', recursive=True) if not p.endswith('.pdf')]
 
+
 def get_sentences(n: int = 100) -> List[str]:
     tests_dir = os.path.dirname(__file__)
     path = glob.glob(f'{tests_dir}/**/jeopardy.json', recursive=True)[0]
@@ -266,6 +302,7 @@ def get_sentences(n: int = 100) -> List[str]:
     # this dataset contains \' around the questions
     return [q['question'].replace("'", '') for q in questions_list[:n]]
 
+
 def assert_resultset_eq(r1: DataFrameResultSet, r2: DataFrameResultSet) -> None:
     assert len(r1) == len(r2)
     assert len(r1.column_names()) == len(r2.column_names())  # we don't care about the actual column names
@@ -280,6 +317,126 @@ def assert_resultset_eq(r1: DataFrameResultSet, r2: DataFrameResultSet) -> None:
         else:
             assert s1.equals(s2)
 
+
 def skip_test_if_not_installed(package) -> None:
     if not Env.get().is_installed_package(package):
         pytest.skip(f'Package `{package}` is not installed.')
+
+
+def validate_update_status(status: UpdateStatus, expected_rows: Optional[int] = None) -> None:
+    assert status.num_excs == 0
+    if expected_rows is not None:
+        assert status.num_rows == expected_rows
+
+
+def make_test_arrow_table(output_path: Path) -> None:
+    import pyarrow as pa
+
+    value_dict = {
+        'c_id': [1, 2, 3, 4, 5],
+        'c_int64': [-10, -20, -30, -40, None],
+        'c_int32': [-1, -2, -3, -4, None],
+        'c_float32': [1.1, 2.2, 3.3, 4.4, None],
+        'c_string': ['aaa', 'bbb', 'ccc', 'ddd', None],
+        'c_boolean': [True, False, True, False, None],
+        'c_timestamp': [
+            datetime.datetime(2012, 1, 1, 12, 0, 0, 25),
+            datetime.datetime(2012, 1, 2, 12, 0, 0, 25),
+            datetime.datetime(2012, 1, 3, 12, 0, 0, 25),
+            datetime.datetime(2012, 1, 4, 12, 0, 0, 25),
+            None,
+        ],
+        # The pyarrow fixed_shape_tensor type does not support NULLs (currently can write them but not read them)
+        # So, no nulls in this column
+        'c_array_float32': [
+            [
+                1.0,
+                2.0,
+            ],
+            [
+                10.0,
+                20.0,
+            ],
+            [
+                100.0,
+                200.0,
+            ],
+            [
+                1000.0,
+                2000.0,
+            ],
+            [10000.0, 20000.0],
+        ],
+    }
+
+    arr_size = len(value_dict['c_array_float32'][0])
+    tensor_type = pa.fixed_shape_tensor(pa.float32(), (arr_size,))
+
+    schema = pa.schema(
+        [
+            ('c_id', pa.int32()),
+            ('c_int64', pa.int64()),
+            ('c_int32', pa.int32()),
+            ('c_float32', pa.float32()),
+            ('c_string', pa.string()),
+            ('c_boolean', pa.bool_()),
+            ('c_timestamp', pa.timestamp('us')),
+            ('c_array_float32', tensor_type),
+        ]
+    )
+
+    test_table = pa.Table.from_pydict(value_dict, schema=schema)
+    pa.parquet.write_table(test_table, str(output_path / 'test.parquet'))
+
+
+def assert_hf_dataset_equal(hf_dataset: 'datasets.Dataset', df: pxt.DataFrame, split_column_name: str) -> None:
+    import datasets
+    assert df.count() == hf_dataset.num_rows
+    assert set(df.get_column_names()) == (set(hf_dataset.features.keys()) | {split_column_name})
+
+    # immutable so we can use it as in a set
+    DatasetTuple = namedtuple('DatasetTuple', ' '.join(hf_dataset.features.keys()))
+    acc_dataset: Set[DatasetTuple] = set()
+    for tup in hf_dataset:
+        immutable_tup = {}
+        for k in tup:
+            if isinstance(tup[k], list):
+                immutable_tup[k] = tuple(tup[k])
+            else:
+                immutable_tup[k] = tup[k]
+
+        acc_dataset.add(DatasetTuple(**immutable_tup))
+
+    for tup in df.collect():
+        assert tup[split_column_name] in hf_dataset.split._name
+
+        encoded_tup = {}
+        for column_name, value in tup.items():
+            if column_name == split_column_name:
+                continue
+            feature_type = hf_dataset.features[column_name]
+            if isinstance(feature_type, datasets.ClassLabel):
+                assert value in feature_type.names
+                # must use the index of the class label as the value to
+                # compare with dataset iteration output.
+                value = feature_type.encode_example(value)
+            elif isinstance(feature_type, datasets.Sequence):
+                assert feature_type.feature.dtype == 'float32', 'may need to add more types'
+                value = tuple([float(x) for x in value])
+
+            encoded_tup[column_name] = value
+
+        check_tup = DatasetTuple(**encoded_tup)
+        assert check_tup in acc_dataset
+
+
+@pxt.expr_udf
+def img_embed(img: PIL.Image.Image) -> np.ndarray:
+    return clip_image(img, model_id='openai/clip-vit-base-patch32')
+
+
+@pxt.expr_udf
+def text_embed(txt: str) -> np.ndarray:
+    return clip_text(txt, model_id='openai/clip-vit-base-patch32')
+
+
+SAMPLE_IMAGE_URL = \
+    'https://raw.githubusercontent.com/pixeltable/pixeltable/master/docs/source/data/images/000000000009.jpg'
+
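Note on the new helpers: `validate_update_status` assumes that mutating table operations return an `UpdateStatus` (imported above from `pixeltable.catalog.globals`) carrying `num_rows` and `num_excs`. A minimal sketch of how a test might use it, under that assumption; the table name and schema below are illustrative only, not taken from the package:

    import pixeltable as pxt
    from pixeltable.tests.utils import validate_update_status
    from pixeltable.type_system import IntType, StringType

    cl = pxt.Client()
    # hypothetical demo table; create_table(name, schema_dict) matches the calls in utils.py above
    t = cl.create_table('demo_tbl', {'name': StringType(nullable=False), 'n': IntType(nullable=False)})

    # insert() is assumed to return the UpdateStatus consumed by the helper
    status = t.insert([{'name': 'a', 'n': 1}, {'name': 'b', 'n': 2}])
    validate_update_status(status, expected_rows=2)

The `img_embed`/`text_embed` expr UDFs wrap the CLIP functions from `pixeltable.functions.huggingface`; they presumably back the new embedding-index code and tests added in this release (`pixeltable/index/embedding_index.py`, `pixeltable/tests/test_index.py`).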
pixeltable/tool/create_test_db_dump.py CHANGED
@@ -136,6 +136,22 @@ class Dumper:
             for i in range(num_rows)
         ]
         t.insert(rows)
+        self.cl.create_dir('views')
+        v = self.cl.create_view('views.sample_view', t, filter=(t.c2 < 50))
+        _ = self.cl.create_view('views.sample_snapshot', t, filter=(t.c2 >= 75), is_snapshot=True)
+        # Computed column using a library function
+        v['str_format'] = pxt.functions.string.str_format('{0} {key}', t.c1, key=t.c1)
+        # Computed column using a bespoke udf
+        v['test_udf'] = test_udf(t.c2)
+        # astype
+        v['astype'] = t.c1.astype(pxt.FloatType())
+        # computed column using a stored function
+        v['stored'] = t.c1.apply(lambda x: f'Hello, {x}', col_type=pxt.StringType())
+
+
+@pxt.udf
+def test_udf(n: int) -> int:
+    return n + 1
 
 
 def main() -> None:
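For reference, a minimal sketch of the computed-column patterns the dump tool now exercises, written against the same calls the added lines use; the table, view, and column names below are illustrative only:

    import pixeltable as pxt
    from pixeltable.type_system import IntType, StringType

    cl = pxt.Client()
    t = cl.create_table('demo', {'c1': StringType(nullable=False), 'c2': IntType(nullable=False)})
    # view defined by a filter predicate over the base table
    v = cl.create_view('demo_view', t, filter=(t.c2 < 50))

    # computed column from a library function
    v['str_format'] = pxt.functions.string.str_format('{0} {key}', t.c1, key=t.c1)
    # type cast, and a Python callable applied row-wise (stored function)
    v['astype'] = t.c1.astype(pxt.FloatType())
    v['stored'] = t.c1.apply(lambda x: f'Hello, {x}', col_type=pxt.StringType())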