pixeltable 0.2.3__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic.
- pixeltable/catalog/column.py +1 -1
- pixeltable/client.py +72 -2
- pixeltable/env.py +36 -52
- pixeltable/functions/__init__.py +1 -1
- pixeltable/functions/fireworks.py +10 -37
- pixeltable/functions/openai.py +192 -24
- pixeltable/functions/together.py +104 -9
- pixeltable/tests/conftest.py +4 -4
- pixeltable/tests/functions/test_fireworks.py +42 -0
- pixeltable/tests/functions/test_functions.py +60 -0
- pixeltable/tests/{test_functions.py → functions/test_huggingface.py} +5 -141
- pixeltable/tests/functions/test_openai.py +152 -0
- pixeltable/tests/functions/test_together.py +111 -0
- pixeltable/tests/test_dataframe.py +4 -4
- pixeltable/tests/test_table.py +105 -2
- pixeltable/tests/utils.py +128 -5
- pixeltable/type_system.py +41 -84
- pixeltable/utils/arrow.py +98 -0
- pixeltable/utils/hf_datasets.py +157 -0
- pixeltable/utils/parquet.py +68 -27
- pixeltable/utils/pytorch.py +16 -97
- {pixeltable-0.2.3.dist-info → pixeltable-0.2.4.dist-info}/METADATA +33 -27
- {pixeltable-0.2.3.dist-info → pixeltable-0.2.4.dist-info}/RECORD +25 -19
- {pixeltable-0.2.3.dist-info → pixeltable-0.2.4.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.3.dist-info → pixeltable-0.2.4.dist-info}/WHEEL +0 -0
pixeltable/tests/functions/test_together.py
ADDED
@@ -0,0 +1,111 @@
+import pytest
+
+import pixeltable as pxt
+import pixeltable.exceptions as excs
+from pixeltable.tests.utils import skip_test_if_not_installed, validate_update_status
+
+
+class TestTogether:
+
+    def test_completions(self, test_client: pxt.Client) -> None:
+        skip_test_if_not_installed('together')
+        TestTogether.skip_test_if_no_together_client()
+        cl = test_client
+        t = cl.create_table('test_tbl', {'input': pxt.StringType()})
+        from pixeltable.functions.together import completions
+        t.add_column(output=completions(prompt=t.input, model='mistralai/Mixtral-8x7B-v0.1', stop=['\n']))
+        t.add_column(output_2=completions(
+            prompt=t.input,
+            model='mistralai/Mixtral-8x7B-v0.1',
+            max_tokens=300,
+            stop=['\n'],
+            temperature=0.7,
+            top_p=0.9,
+            top_k=40,
+            repetition_penalty=1.1,
+            logprobs=1,
+            echo=True,
+            n=3,
+            safety_model='Meta-Llama/Llama-Guard-7b'
+        ))
+        validate_update_status(t.insert(input='I am going to the '), 1)
+        result = t.collect()
+        assert len(result['output'][0]['choices'][0]['text']) > 0
+        assert len(result['output_2'][0]['choices'][0]['text']) > 0
+
+    def test_chat_completions(self, test_client: pxt.Client) -> None:
+        skip_test_if_not_installed('together')
+        TestTogether.skip_test_if_no_together_client()
+        cl = test_client
+        t = cl.create_table('test_tbl', {'input': pxt.StringType()})
+        messages = [{'role': 'user', 'content': t.input}]
+        from pixeltable.functions.together import chat_completions
+        t.add_column(output=chat_completions(messages=messages, model='mistralai/Mixtral-8x7B-v0.1', stop=['\n']))
+        t.add_column(output_2=chat_completions(
+            messages=messages,
+            model='mistralai/Mixtral-8x7B-Instruct-v0.1',
+            max_tokens=300,
+            stop=['\n'],
+            temperature=0.7,
+            top_p=0.9,
+            top_k=40,
+            repetition_penalty=1.1,
+            logprobs=1,
+            echo=True,
+            n=3,
+            safety_model='Meta-Llama/Llama-Guard-7b',
+            response_format={'type': 'json_object'}
+        ))
+        validate_update_status(t.insert(input='Give me a typical example of a JSON structure.'), 1)
+        result = t.collect()
+        assert len(result['output'][0]['choices'][0]['message']) > 0
+        assert len(result['output_2'][0]['choices'][0]['message']) > 0
+
+    def test_embeddings(self, test_client: pxt.Client) -> None:
+        skip_test_if_not_installed('together')
+        TestTogether.skip_test_if_no_together_client()
+        cl = test_client
+        t = cl.create_table('test_tbl', {'input': pxt.StringType()})
+        from pixeltable.functions.together import embeddings
+        t.add_column(embed=embeddings(input=t.input, model='togethercomputer/m2-bert-80M-8k-retrieval'))
+        validate_update_status(t.insert(input='Together AI provides a variety of embeddings models.'), 1)
+        assert len(t.collect()['embed'][0]) > 0
+
+    def test_image_generations(self, test_client: pxt.Client) -> None:
+        skip_test_if_not_installed('together')
+        TestTogether.skip_test_if_no_together_client()
+        cl = test_client
+        t = cl.create_table(
+            'test_tbl',
+            {'input': pxt.StringType(), 'negative_prompt': pxt.StringType(nullable=True)}
+        )
+        from pixeltable.functions.together import image_generations
+        t.add_column(img=image_generations(t.input, model='runwayml/stable-diffusion-v1-5'))
+        t.add_column(img_2=image_generations(
+            t.input,
+            model='stabilityai/stable-diffusion-2-1',
+            steps=30,
+            seed=4178780,
+            height=768,
+            width=512,
+            negative_prompt=t.negative_prompt
+        ))
+        validate_update_status(t.insert([
+            {'input': 'A friendly dinosaur playing tennis in a cornfield'},
+            {'input': 'A friendly dinosaur playing tennis in a cornfield',
+             'negative_prompt': 'tennis court'}
+        ]), 2)
+        assert t.collect()['img'][0].size == (512, 512)
+        assert t.collect()['img_2'][0].size == (512, 768)
+        assert t.collect()['img'][1].size == (512, 512)
+        assert t.collect()['img_2'][1].size == (512, 768)
+
+    # This ensures that the test will be skipped, rather than returning an error, when no API key is
+    # available (for example, when a PR runs in CI).
+    @staticmethod
+    def skip_test_if_no_together_client() -> None:
+        try:
+            import pixeltable.functions.together
+            _ = pixeltable.functions.together.together_client()
+        except excs.Error as exc:
+            pytest.skip(str(exc))
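The new test file above also serves as a usage reference for the Together integration added in this release. Below is a minimal sketch of the same pattern outside the test fixtures (assumptions: the `together` package is installed, a Together API key is configured, and the 0.2.x `pxt.Client()` entry point is used; the table and column names are illustrative, not from the package):

import pixeltable as pxt
from pixeltable.functions.together import chat_completions, embeddings

cl = pxt.Client()
t = cl.create_table('together_demo', {'prompt': pxt.StringType()})

# computed columns: each inserted row triggers a Together API call
t.add_column(response=chat_completions(
    messages=[{'role': 'user', 'content': t.prompt}],
    model='mistralai/Mixtral-8x7B-Instruct-v0.1',
))
t.add_column(embedding=embeddings(input=t.prompt, model='togethercomputer/m2-bert-80M-8k-retrieval'))

t.insert(prompt='What does Pixeltable do?')
# response payloads follow the same shape the assertions above check
print(t.collect()['response'][0]['choices'][0]['message'])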
pixeltable/tests/test_dataframe.py
CHANGED
@@ -33,7 +33,7 @@ class TestDataFrame:
         assert res1 == res4
 
         _ = t.where(t.c2 < 10).select(t.c2, t.c2).show(0)  # repeated name no error
-
+
         # duplicate select list
         with pytest.raises(excs.Error) as exc_info:
             _ = t.select(t.c1).select(t.c2).show(0)
@@ -220,7 +220,7 @@ class TestDataFrame:
         for tup in ds:
             for col in df.get_column_names():
                 assert col in tup
-
+
             arrval = tup['c_array']
             assert isinstance(arrval, np.ndarray)
             col_type = type_dict['c_array']
@@ -304,7 +304,7 @@ class TestDataFrame:
         def restrict_json_for_default_collate(obj):
             keys = ['id', 'label', 'iscrowd', 'bounding_box']
             return {k: obj[k] for k in keys}
-
+
         t = all_datatypes_tbl
         df = t.select(
             t.row_id,
@@ -370,7 +370,7 @@ class TestDataFrame:
         # check result cached
         ds1 = t.to_pytorch_dataset(image_format='pt')
         ds1_mtimes = _get_mtimes(ds1.path)
-
+
         ds2 = t.to_pytorch_dataset(image_format='pt')
         ds2_mtimes = _get_mtimes(ds2.path)
         assert ds2.path == ds1.path, 'result should be cached'
pixeltable/tests/test_table.py
CHANGED
@@ -8,6 +8,7 @@ import PIL
 import cv2
 import numpy as np
 import pandas as pd
+import pathlib
 import pytest
 
 import pixeltable as pxt
@@ -17,7 +18,7 @@ from pixeltable import exceptions as excs
 from pixeltable.iterators import FrameIterator
 from pixeltable.tests.utils import \
     make_tbl, create_table_data, read_data_file, get_video_files, get_audio_files, get_image_files, get_documents, \
-    assert_resultset_eq
+    assert_resultset_eq, assert_hf_dataset_equal, make_test_arrow_table
 from pixeltable.tests.utils import skip_test_if_not_installed
 from pixeltable.type_system import \
     StringType, IntType, FloatType, TimestampType, ImageType, VideoType, JsonType, BoolType, ArrayType, AudioType, \
@@ -25,7 +26,6 @@ from pixeltable.type_system import \
 from pixeltable.utils.filecache import FileCache
 from pixeltable.utils.media_store import MediaStore
 
-
 class TestTable:
     # exc for a % 10 == 0
     @pxt.udf(return_type=FloatType(), param_types=[IntType()])
@@ -116,6 +116,100 @@ class TestTable:
         tbl.revert()
         assert tbl.num_retained_versions == num_retained_versions
 
+    def test_import_parquet(self, test_client: pxt.Client, tmp_path: pathlib.Path) -> None:
+        skip_test_if_not_installed('pyarrow')
+        import pyarrow as pa
+        from pixeltable.utils.arrow import iter_tuples
+
+        parquet_dir = tmp_path / 'test_data'
+        parquet_dir.mkdir()
+        make_test_arrow_table(parquet_dir)
+
+        tab = test_client.import_parquet('test_parquet', parquet_path=str(parquet_dir))
+        assert 'test_parquet' in test_client.list_tables()
+        assert tab is not None
+        num_elts = tab.count()
+        arrow_tab: pa.Table = pa.parquet.read_table(str(parquet_dir))
+        assert num_elts == arrow_tab.num_rows
+        assert set(tab.column_names()) == set(arrow_tab.column_names)
+
+        result_set = tab.order_by(tab.c_id).collect()
+        column_types = tab.column_types()
+
+        for tup, arrow_tup in zip(result_set, iter_tuples(arrow_tab)):
+            assert tup['c_id'] == arrow_tup['c_id']
+            for col, val in tup.items():
+                if val is None:
+                    assert arrow_tup[col] is None
+                    continue
+
+                if column_types[col].is_array_type():
+                    assert (val == arrow_tup[col]).all()
+                else:
+                    assert val == arrow_tup[col]
+
+    def test_import_huggingface_dataset(self, test_client: pxt.Client, tmp_path: pathlib.Path) -> None:
+        skip_test_if_not_installed('datasets')
+        import datasets
+
+        test_cases = [
+            # {  # includes a timestamp; ~20MB for this specific slice
+            #     # Disabled this test case because the download is failing, and it's not critical.
+            #     'dataset_name': 'c4',
+            #     # see https://huggingface.co/datasets/allenai/c4/blob/main/realnewslike/c4-train.00000-of-00512.json.gz
+            #     'dataset': datasets.load_dataset(
+            #         "allenai/c4",
+            #         data_dir="realnewslike",
+            #         data_files="c4-train.00000-of-00512.json.gz",
+            #         split='train[:1000]',
+            #         cache_dir=tmp_path
+            #     ),
+            # },
+            {  # includes an embedding (array type), common in a few RAG datasets.
+                'dataset_name': 'cohere_wikipedia',
+                'dataset': datasets.load_dataset("Cohere/wikipedia-2023-11-embed-multilingual-v3",
+                                                 data_dir='cr').select_columns(['url', 'title', 'text', 'emb']),
+                # a column named `_id` is not currently allowed by pixeltable rules,
+                # so filter out that column.
+                # the cr subdir has a small number of rows, to avoid running out of space in the CI runner;
+                # see https://huggingface.co/datasets/Cohere/wikipedia-2023-11-embed-multilingual-v3/tree/main/cr
+                'schema_override': {'emb': ArrayType((1024,), dtype=FloatType(), nullable=False)}
+            },
+            # example of a dataset dictionary with multiple splits
+            {
+                'dataset_name': 'rotten_tomatoes',
+                'dataset': datasets.load_dataset("rotten_tomatoes"),
+            },
+        ]
+
+        # test a column name for splits other than the default of 'split'
+        split_column_name = 'my_split_col'
+        for rec in test_cases:
+            dataset_name = rec['dataset_name']
+            hf_dataset = rec['dataset']
+
+            tab = test_client.import_huggingface_dataset(
+                dataset_name,
+                hf_dataset,
+                column_name_for_split=split_column_name,
+                schema_override=rec.get('schema_override', None),
+            )
+            if isinstance(hf_dataset, datasets.Dataset):
+                assert_hf_dataset_equal(hf_dataset, tab.df(), split_column_name)
+            elif isinstance(hf_dataset, datasets.DatasetDict):
+                assert tab.count() == sum(hf_dataset.num_rows.values())
+                assert split_column_name in tab.column_names()
+
+                for dataset_name in hf_dataset:
+                    df = tab.where(tab.my_split_col == dataset_name)
+                    assert_hf_dataset_equal(hf_dataset[dataset_name], df, split_column_name)
+            else:
+                assert False
+
+        with pytest.raises(excs.Error) as exc_info:
+            test_client.import_huggingface_dataset('test', {})
+        assert 'type(dataset)' in str(exc_info.value)
+
     def test_image_table(self, test_client: pxt.Client) -> None:
         n_sample_rows = 20
         cl = test_client
@@ -533,6 +627,15 @@ class TestTable:
             t.insert(c5=np.ndarray((3, 2)))
         assert 'expected ndarray((2, 3)' in str(exc_info.value)
 
+    def test_insert_string_with_null(self, test_client: pxt.Client) -> None:
+        cl = test_client
+        t = cl.create_table('test', {'c1': StringType()})
+
+        t.insert([{'c1': 'this is a python\x00string'}])
+        assert t.count() == 1
+        for tup in t.df().collect():
+            assert tup['c1'] == 'this is a python string'
+
     def test_query(self, test_client: pxt.Client) -> None:
         skip_test_if_not_installed('boto3')
         cl = test_client
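The test_import_parquet and test_import_huggingface_dataset additions above exercise the new client-level import entry points in this release. Below is a minimal sketch of the same calls outside the test harness (assumptions: the 0.2.x `pxt.Client()` entry point, with `pyarrow` and `datasets` installed; the table names, parquet directory, and dataset choice are illustrative):

import pixeltable as pxt

cl = pxt.Client()

# Parquet import: point at a directory containing parquet files
tab = cl.import_parquet('parquet_demo', parquet_path='/data/my_parquet_dir')
print(tab.count(), tab.column_names())

# Hugging Face import: accepts a datasets.Dataset or datasets.DatasetDict;
# for a DatasetDict, the split name lands in an extra column ('split' by default)
import datasets
ds = datasets.load_dataset('rotten_tomatoes')
tab2 = cl.import_huggingface_dataset('rotten_tomatoes', ds, column_name_for_split='split')
print(tab2.where(tab2.split == 'train').count())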
pixeltable/tests/utils.py
CHANGED
@@ -2,8 +2,9 @@ import datetime
 import glob
 import json
 import os
+from collections import namedtuple
 from pathlib import Path
-from typing import
+from typing import Any, Dict, List, Optional, Set
 
 import numpy as np
 import pandas as pd
@@ -12,12 +13,21 @@ import pytest
 import pixeltable as pxt
 import pixeltable.type_system as ts
 from pixeltable import catalog
+from pixeltable.catalog.globals import UpdateStatus
 from pixeltable.dataframe import DataFrameResultSet
 from pixeltable.env import Env
-from pixeltable.type_system import
-
-
-
+from pixeltable.type_system import (
+    ArrayType,
+    BoolType,
+    ColumnType,
+    FloatType,
+    ImageType,
+    IntType,
+    JsonType,
+    StringType,
+    TimestampType,
+    VideoType,
+)
 
 
 def make_default_type(t: ColumnType.Type) -> ColumnType:
@@ -266,6 +276,7 @@ def get_sentences(n: int = 100) -> List[str]:
     # this dataset contains \' around the questions
     return [q['question'].replace("'", '') for q in questions_list[:n]]
 
+
 def assert_resultset_eq(r1: DataFrameResultSet, r2: DataFrameResultSet) -> None:
     assert len(r1) == len(r2)
     assert len(r1.column_names()) == len(r2.column_names())  # we don't care about the actual column names
@@ -280,6 +291,118 @@ def assert_resultset_eq(r1: DataFrameResultSet, r2: DataFrameResultSet) -> None:
     else:
         assert s1.equals(s2)
 
+
 def skip_test_if_not_installed(package) -> None:
     if not Env.get().is_installed_package(package):
         pytest.skip(f'Package `{package}` is not installed.')
+
+
+def validate_update_status(status: UpdateStatus, expected_rows: Optional[int] = None) -> None:
+    assert status.num_excs == 0
+    if expected_rows is not None:
+        assert status.num_rows == expected_rows
+
+
+def make_test_arrow_table(output_path: Path) -> None:
+    import pyarrow as pa
+
+    value_dict = {
+        'c_id': [1, 2, 3, 4, 5],
+        'c_int64': [-10, -20, -30, -40, None],
+        'c_int32': [-1, -2, -3, -4, None],
+        'c_float32': [1.1, 2.2, 3.3, 4.4, None],
+        'c_string': ['aaa', 'bbb', 'ccc', 'ddd', None],
+        'c_boolean': [True, False, True, False, None],
+        'c_timestamp': [
+            datetime.datetime(2012, 1, 1, 12, 0, 0, 25),
+            datetime.datetime(2012, 1, 2, 12, 0, 0, 25),
+            datetime.datetime(2012, 1, 3, 12, 0, 0, 25),
+            datetime.datetime(2012, 1, 4, 12, 0, 0, 25),
+            None,
+        ],
+        # The pyarrow fixed_shape_tensor type does not support NULLs (currently they can be written but not read),
+        # so there are no nulls in this column.
+        'c_array_float32': [
+            [
+                1.0,
+                2.0,
+            ],
+            [
+                10.0,
+                20.0,
+            ],
+            [
+                100.0,
+                200.0,
+            ],
+            [
+                1000.0,
+                2000.0,
+            ],
+            [10000.0, 20000.0],
+        ],
+    }
+
+    arr_size = len(value_dict['c_array_float32'][0])
+    tensor_type = pa.fixed_shape_tensor(pa.float32(), (arr_size,))
+
+    schema = pa.schema(
+        [
+            ('c_id', pa.int32()),
+            ('c_int64', pa.int64()),
+            ('c_int32', pa.int32()),
+            ('c_float32', pa.float32()),
+            ('c_string', pa.string()),
+            ('c_boolean', pa.bool_()),
+            ('c_timestamp', pa.timestamp('us')),
+            ('c_array_float32', tensor_type),
+        ]
+    )
+
+    test_table = pa.Table.from_pydict(value_dict, schema=schema)
+    pa.parquet.write_table(test_table, str(output_path / 'test.parquet'))
+
+
+def assert_hf_dataset_equal(hf_dataset: 'datasets.Dataset', df: pxt.DataFrame, split_column_name: str) -> None:
+    import datasets
+    assert df.count() == hf_dataset.num_rows
+    assert set(df.get_column_names()) == (set(hf_dataset.features.keys()) | {split_column_name})
+
+    # immutable, so we can use it in a set
+    DatasetTuple = namedtuple('DatasetTuple', ' '.join(hf_dataset.features.keys()))
+    acc_dataset: Set[DatasetTuple] = set()
+    for tup in hf_dataset:
+        immutable_tup = {}
+        for k in tup:
+            if isinstance(tup[k], list):
+                immutable_tup[k] = tuple(tup[k])
+            else:
+                immutable_tup[k] = tup[k]
+
+        acc_dataset.add(DatasetTuple(**immutable_tup))
+
+    for tup in df.collect():
+        assert tup[split_column_name] in hf_dataset.split._name
+
+        encoded_tup = {}
+        for column_name, value in tup.items():
+            if column_name == split_column_name:
+                continue
+            feature_type = hf_dataset.features[column_name]
+            if isinstance(feature_type, datasets.ClassLabel):
+                assert value in feature_type.names
+                # must use the index of the class label as the value to
+                # compare with dataset iteration output.
+                value = feature_type.encode_example(value)
+            elif isinstance(feature_type, datasets.Sequence):
+                assert feature_type.feature.dtype == 'float32', 'may need to add more types'
+                value = tuple([float(x) for x in value])
+
+            encoded_tup[column_name] = value
+
+        check_tup = DatasetTuple(**encoded_tup)
+        assert check_tup in acc_dataset
+
+
+SAMPLE_IMAGE_URL = \
+    'https://raw.githubusercontent.com/pixeltable/pixeltable/master/docs/source/data/images/000000000009.jpg'