pixeltable 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +20 -9
- pixeltable/__version__.py +3 -0
- pixeltable/catalog/column.py +23 -7
- pixeltable/catalog/insertable_table.py +32 -19
- pixeltable/catalog/table.py +210 -20
- pixeltable/catalog/table_version.py +272 -111
- pixeltable/catalog/table_version_path.py +6 -1
- pixeltable/dataframe.py +184 -110
- pixeltable/datatransfer/__init__.py +1 -0
- pixeltable/datatransfer/label_studio.py +526 -0
- pixeltable/datatransfer/remote.py +113 -0
- pixeltable/env.py +213 -79
- pixeltable/exec/__init__.py +2 -1
- pixeltable/exec/data_row_batch.py +6 -7
- pixeltable/exec/expr_eval_node.py +28 -28
- pixeltable/exec/sql_scan_node.py +7 -6
- pixeltable/exprs/__init__.py +4 -3
- pixeltable/exprs/column_ref.py +11 -2
- pixeltable/exprs/comparison.py +39 -1
- pixeltable/exprs/data_row.py +7 -0
- pixeltable/exprs/expr.py +26 -19
- pixeltable/exprs/function_call.py +17 -18
- pixeltable/exprs/globals.py +14 -2
- pixeltable/exprs/image_member_access.py +9 -28
- pixeltable/exprs/in_predicate.py +96 -0
- pixeltable/exprs/inline_array.py +13 -11
- pixeltable/exprs/inline_dict.py +15 -13
- pixeltable/exprs/row_builder.py +7 -1
- pixeltable/exprs/similarity_expr.py +67 -0
- pixeltable/ext/functions/whisperx.py +30 -0
- pixeltable/ext/functions/yolox.py +16 -0
- pixeltable/func/__init__.py +0 -2
- pixeltable/func/aggregate_function.py +5 -2
- pixeltable/func/callable_function.py +57 -13
- pixeltable/func/expr_template_function.py +14 -3
- pixeltable/func/function.py +35 -4
- pixeltable/func/signature.py +5 -15
- pixeltable/func/udf.py +8 -12
- pixeltable/functions/fireworks.py +9 -4
- pixeltable/functions/huggingface.py +48 -5
- pixeltable/functions/openai.py +49 -11
- pixeltable/functions/pil/image.py +61 -64
- pixeltable/functions/together.py +32 -6
- pixeltable/functions/util.py +0 -43
- pixeltable/functions/video.py +46 -8
- pixeltable/globals.py +443 -0
- pixeltable/index/__init__.py +1 -0
- pixeltable/index/base.py +9 -2
- pixeltable/index/btree.py +54 -0
- pixeltable/index/embedding_index.py +91 -15
- pixeltable/io/__init__.py +4 -0
- pixeltable/io/globals.py +59 -0
- pixeltable/{utils → io}/hf_datasets.py +48 -17
- pixeltable/io/pandas.py +148 -0
- pixeltable/{utils → io}/parquet.py +58 -33
- pixeltable/iterators/__init__.py +1 -1
- pixeltable/iterators/base.py +8 -4
- pixeltable/iterators/document.py +225 -93
- pixeltable/iterators/video.py +16 -9
- pixeltable/metadata/__init__.py +8 -4
- pixeltable/metadata/converters/convert_12.py +3 -0
- pixeltable/metadata/converters/convert_13.py +41 -0
- pixeltable/metadata/converters/convert_14.py +13 -0
- pixeltable/metadata/converters/convert_15.py +29 -0
- pixeltable/metadata/converters/util.py +63 -0
- pixeltable/metadata/schema.py +12 -6
- pixeltable/plan.py +11 -24
- pixeltable/store.py +16 -23
- pixeltable/tool/create_test_db_dump.py +49 -14
- pixeltable/type_system.py +27 -58
- pixeltable/utils/coco.py +94 -0
- pixeltable/utils/documents.py +42 -12
- pixeltable/utils/http_server.py +70 -0
- pixeltable-0.2.7.dist-info/METADATA +137 -0
- pixeltable-0.2.7.dist-info/RECORD +126 -0
- {pixeltable-0.2.5.dist-info → pixeltable-0.2.7.dist-info}/WHEEL +1 -1
- pixeltable/client.py +0 -600
- pixeltable/exprs/image_similarity_predicate.py +0 -58
- pixeltable/func/batched_function.py +0 -53
- pixeltable/func/nos_function.py +0 -202
- pixeltable/tests/conftest.py +0 -171
- pixeltable/tests/ext/test_yolox.py +0 -21
- pixeltable/tests/functions/test_fireworks.py +0 -43
- pixeltable/tests/functions/test_functions.py +0 -60
- pixeltable/tests/functions/test_huggingface.py +0 -158
- pixeltable/tests/functions/test_openai.py +0 -162
- pixeltable/tests/functions/test_together.py +0 -112
- pixeltable/tests/test_audio.py +0 -65
- pixeltable/tests/test_catalog.py +0 -27
- pixeltable/tests/test_client.py +0 -21
- pixeltable/tests/test_component_view.py +0 -379
- pixeltable/tests/test_dataframe.py +0 -440
- pixeltable/tests/test_dirs.py +0 -107
- pixeltable/tests/test_document.py +0 -120
- pixeltable/tests/test_exprs.py +0 -802
- pixeltable/tests/test_function.py +0 -332
- pixeltable/tests/test_index.py +0 -138
- pixeltable/tests/test_migration.py +0 -44
- pixeltable/tests/test_nos.py +0 -54
- pixeltable/tests/test_snapshot.py +0 -231
- pixeltable/tests/test_table.py +0 -1343
- pixeltable/tests/test_transactional_directory.py +0 -42
- pixeltable/tests/test_types.py +0 -52
- pixeltable/tests/test_video.py +0 -159
- pixeltable/tests/test_view.py +0 -535
- pixeltable/tests/utils.py +0 -442
- pixeltable/utils/clip.py +0 -18
- pixeltable-0.2.5.dist-info/METADATA +0 -128
- pixeltable-0.2.5.dist-info/RECORD +0 -139
- {pixeltable-0.2.5.dist-info → pixeltable-0.2.7.dist-info}/LICENSE +0 -0
pixeltable/tests/utils.py
DELETED
|
@@ -1,442 +0,0 @@
|
|
|
1
|
-
import datetime
|
|
2
|
-
import glob
|
|
3
|
-
import json
|
|
4
|
-
import os
|
|
5
|
-
from collections import namedtuple
|
|
6
|
-
from pathlib import Path
|
|
7
|
-
from typing import Any, Dict, List, Optional, Set
|
|
8
|
-
|
|
9
|
-
import PIL.Image
|
|
10
|
-
import numpy as np
|
|
11
|
-
import pandas as pd
|
|
12
|
-
import pytest
|
|
13
|
-
|
|
14
|
-
import pixeltable as pxt
|
|
15
|
-
import pixeltable.type_system as ts
|
|
16
|
-
from pixeltable import catalog
|
|
17
|
-
from pixeltable.catalog.globals import UpdateStatus
|
|
18
|
-
from pixeltable.dataframe import DataFrameResultSet
|
|
19
|
-
from pixeltable.env import Env
|
|
20
|
-
from pixeltable.functions.huggingface import clip_image, clip_text
|
|
21
|
-
from pixeltable.type_system import (
|
|
22
|
-
ArrayType,
|
|
23
|
-
BoolType,
|
|
24
|
-
ColumnType,
|
|
25
|
-
FloatType,
|
|
26
|
-
ImageType,
|
|
27
|
-
IntType,
|
|
28
|
-
JsonType,
|
|
29
|
-
StringType,
|
|
30
|
-
TimestampType,
|
|
31
|
-
VideoType,
|
|
32
|
-
)
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
def make_default_type(t: ColumnType.Type) -> ColumnType:
|
|
36
|
-
if t == ColumnType.Type.STRING:
|
|
37
|
-
return StringType()
|
|
38
|
-
if t == ColumnType.Type.INT:
|
|
39
|
-
return IntType()
|
|
40
|
-
if t == ColumnType.Type.FLOAT:
|
|
41
|
-
return FloatType()
|
|
42
|
-
if t == ColumnType.Type.BOOL:
|
|
43
|
-
return BoolType()
|
|
44
|
-
if t == ColumnType.Type.TIMESTAMP:
|
|
45
|
-
return TimestampType()
|
|
46
|
-
assert False
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
def make_tbl(cl: pxt.Client, name: str = 'test', col_names: Optional[List[str]] = None) -> catalog.InsertableTable:
|
|
50
|
-
if col_names is None:
|
|
51
|
-
col_names = ['c1']
|
|
52
|
-
schema: Dict[str, ts.ColumnType] = {}
|
|
53
|
-
for i, col_name in enumerate(col_names):
|
|
54
|
-
schema[f'{col_name}'] = make_default_type(ColumnType.Type(i % 5))
|
|
55
|
-
return cl.create_table(name, schema)
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
def create_table_data(t: catalog.Table, col_names: Optional[List[str]] = None, num_rows: int = 10) -> List[
|
|
59
|
-
Dict[str, Any]]:
|
|
60
|
-
if col_names is None:
|
|
61
|
-
col_names = []
|
|
62
|
-
data: Dict[str, Any] = {}
|
|
63
|
-
|
|
64
|
-
sample_dict = {
|
|
65
|
-
'detections': [{
|
|
66
|
-
'id': '637e8e073b28441a453564cf',
|
|
67
|
-
'attributes': {},
|
|
68
|
-
'tags': [],
|
|
69
|
-
'label': 'potted plant',
|
|
70
|
-
'bounding_box': [
|
|
71
|
-
0.37028125,
|
|
72
|
-
0.3345305164319249,
|
|
73
|
-
0.038593749999999996,
|
|
74
|
-
0.16314553990610328,
|
|
75
|
-
],
|
|
76
|
-
'mask': None,
|
|
77
|
-
'confidence': None,
|
|
78
|
-
'index': None,
|
|
79
|
-
'supercategory': 'furniture',
|
|
80
|
-
'iscrowd': 0,
|
|
81
|
-
}, {
|
|
82
|
-
'id': '637e8e073b28441a453564cf',
|
|
83
|
-
'attributes': {},
|
|
84
|
-
'tags': [],
|
|
85
|
-
'label': 'potted plant',
|
|
86
|
-
'bounding_box': [
|
|
87
|
-
0.37028125,
|
|
88
|
-
0.3345305164319249,
|
|
89
|
-
0.038593749999999996,
|
|
90
|
-
0.16314553990610328,
|
|
91
|
-
],
|
|
92
|
-
'mask': None,
|
|
93
|
-
'confidence': None,
|
|
94
|
-
'index': None,
|
|
95
|
-
'supercategory': 'furniture',
|
|
96
|
-
'iscrowd': 0,
|
|
97
|
-
}]
|
|
98
|
-
}
|
|
99
|
-
|
|
100
|
-
if len(col_names) == 0:
|
|
101
|
-
col_names = [c.name for c in t.columns() if not c.is_computed]
|
|
102
|
-
|
|
103
|
-
col_types = t.column_types()
|
|
104
|
-
for col_name in col_names:
|
|
105
|
-
col_type = col_types[col_name]
|
|
106
|
-
col_data: Any = None
|
|
107
|
-
if col_type.is_string_type():
|
|
108
|
-
col_data = ['test string'] * num_rows
|
|
109
|
-
if col_type.is_int_type():
|
|
110
|
-
col_data = np.random.randint(0, 100, size=num_rows).tolist()
|
|
111
|
-
if col_type.is_float_type():
|
|
112
|
-
col_data = (np.random.random(size=num_rows) * 100).tolist()
|
|
113
|
-
if col_type.is_bool_type():
|
|
114
|
-
col_data = np.random.randint(0, 2, size=num_rows)
|
|
115
|
-
col_data = [False if i == 0 else True for i in col_data]
|
|
116
|
-
if col_type.is_timestamp_type():
|
|
117
|
-
col_data = [datetime.datetime.now()] * num_rows
|
|
118
|
-
if col_type.is_json_type():
|
|
119
|
-
col_data = [sample_dict] * num_rows
|
|
120
|
-
if col_type.is_array_type():
|
|
121
|
-
col_data = [np.ones(col_type.shape, dtype=col_type.numpy_dtype()) for i in range(num_rows)]
|
|
122
|
-
if col_type.is_image_type():
|
|
123
|
-
image_path = get_image_files()[0]
|
|
124
|
-
col_data = [image_path for i in range(num_rows)]
|
|
125
|
-
if col_type.is_video_type():
|
|
126
|
-
video_path = get_video_files()[0]
|
|
127
|
-
col_data = [video_path for i in range(num_rows)]
|
|
128
|
-
data[col_name] = col_data
|
|
129
|
-
rows = [{col_name: data[col_name][i] for col_name in col_names} for i in range(num_rows)]
|
|
130
|
-
return rows
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
def create_test_tbl(client: pxt.Client, name: str = 'test_tbl') -> catalog.Table:
|
|
134
|
-
schema = {
|
|
135
|
-
'c1': StringType(nullable=False),
|
|
136
|
-
'c1n': StringType(nullable=True),
|
|
137
|
-
'c2': IntType(nullable=False),
|
|
138
|
-
'c3': FloatType(nullable=False),
|
|
139
|
-
'c4': BoolType(nullable=False),
|
|
140
|
-
'c5': TimestampType(nullable=False),
|
|
141
|
-
'c6': JsonType(nullable=False),
|
|
142
|
-
'c7': JsonType(nullable=False),
|
|
143
|
-
}
|
|
144
|
-
t = client.create_table(name, schema, primary_key='c2')
|
|
145
|
-
t.add_column(c8=[[1, 2, 3], [4, 5, 6]])
|
|
146
|
-
|
|
147
|
-
num_rows = 100
|
|
148
|
-
d1 = {
|
|
149
|
-
'f1': 'test string 1',
|
|
150
|
-
'f2': 1,
|
|
151
|
-
'f3': 1.0,
|
|
152
|
-
'f4': True,
|
|
153
|
-
'f5': [1.0, 2.0, 3.0, 4.0],
|
|
154
|
-
'f6': {
|
|
155
|
-
'f7': 'test string 2',
|
|
156
|
-
'f8': [1.0, 2.0, 3.0, 4.0],
|
|
157
|
-
},
|
|
158
|
-
}
|
|
159
|
-
d2 = [d1, d1]
|
|
160
|
-
|
|
161
|
-
c1_data = [f'test string {i}' for i in range(num_rows)]
|
|
162
|
-
c2_data = [i for i in range(num_rows)]
|
|
163
|
-
c3_data = [float(i) for i in range(num_rows)]
|
|
164
|
-
c4_data = [bool(i % 2) for i in range(num_rows)]
|
|
165
|
-
c5_data = [datetime.datetime.now()] * num_rows
|
|
166
|
-
c6_data = []
|
|
167
|
-
for i in range(num_rows):
|
|
168
|
-
d = {
|
|
169
|
-
'f1': f'test string {i}',
|
|
170
|
-
'f2': i,
|
|
171
|
-
'f3': float(i),
|
|
172
|
-
'f4': bool(i % 2),
|
|
173
|
-
'f5': [1.0, 2.0, 3.0, 4.0],
|
|
174
|
-
'f6': {
|
|
175
|
-
'f7': 'test string 2',
|
|
176
|
-
'f8': [1.0, 2.0, 3.0, 4.0],
|
|
177
|
-
},
|
|
178
|
-
}
|
|
179
|
-
c6_data.append(d)
|
|
180
|
-
|
|
181
|
-
c7_data = [d2] * num_rows
|
|
182
|
-
rows = [
|
|
183
|
-
{
|
|
184
|
-
'c1': c1_data[i],
|
|
185
|
-
'c1n': c1_data[i] if i % 10 != 0 else None,
|
|
186
|
-
'c2': c2_data[i],
|
|
187
|
-
'c3': c3_data[i],
|
|
188
|
-
'c4': c4_data[i],
|
|
189
|
-
'c5': c5_data[i],
|
|
190
|
-
'c6': c6_data[i],
|
|
191
|
-
'c7': c7_data[i],
|
|
192
|
-
}
|
|
193
|
-
for i in range(num_rows)
|
|
194
|
-
]
|
|
195
|
-
t.insert(rows)
|
|
196
|
-
return t
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
def create_img_tbl(cl: pxt.Client, name: str = 'test_img_tbl') -> catalog.Table:
|
|
200
|
-
schema = {
|
|
201
|
-
'img': ImageType(nullable=False),
|
|
202
|
-
'category': StringType(nullable=False),
|
|
203
|
-
'split': StringType(nullable=False),
|
|
204
|
-
}
|
|
205
|
-
tbl = cl.create_table(name, schema)
|
|
206
|
-
rows = read_data_file('imagenette2-160', 'manifest.csv', ['img'])
|
|
207
|
-
tbl.insert(rows)
|
|
208
|
-
return tbl
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
def create_all_datatypes_tbl(test_client: pxt.Client) -> catalog.Table:
|
|
212
|
-
""" Creates a table with all supported datatypes.
|
|
213
|
-
"""
|
|
214
|
-
schema = {
|
|
215
|
-
'row_id': IntType(nullable=False), # used for row selection
|
|
216
|
-
'c_array': ArrayType(shape=(10,), dtype=FloatType(), nullable=True),
|
|
217
|
-
'c_bool': BoolType(nullable=True),
|
|
218
|
-
'c_float': FloatType(nullable=True),
|
|
219
|
-
'c_image': ImageType(nullable=True),
|
|
220
|
-
'c_int': IntType(nullable=True),
|
|
221
|
-
'c_json': JsonType(nullable=True),
|
|
222
|
-
'c_string': StringType(nullable=True),
|
|
223
|
-
'c_timestamp': TimestampType(nullable=True),
|
|
224
|
-
'c_video': VideoType(nullable=True),
|
|
225
|
-
}
|
|
226
|
-
tbl = test_client.create_table('all_datatype_tbl', schema)
|
|
227
|
-
example_rows = create_table_data(tbl, num_rows=11)
|
|
228
|
-
|
|
229
|
-
for i, r in enumerate(example_rows):
|
|
230
|
-
r['row_id'] = i # row_id
|
|
231
|
-
|
|
232
|
-
tbl.insert(example_rows)
|
|
233
|
-
return tbl
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
def read_data_file(dir_name: str, file_name: str, path_col_names: Optional[List[str]] = None) -> List[Dict[str, Any]]:
|
|
237
|
-
"""
|
|
238
|
-
Locate dir_name, create df out of file_name.
|
|
239
|
-
path_col_names: col names in csv file that contain file names; those will be converted to absolute paths
|
|
240
|
-
by adding the path to 'file_name' as a prefix.
|
|
241
|
-
Returns:
|
|
242
|
-
tuple of (list of rows, list of column names)
|
|
243
|
-
"""
|
|
244
|
-
if path_col_names is None:
|
|
245
|
-
path_col_names = []
|
|
246
|
-
tests_dir = os.path.dirname(__file__) # search with respect to tests/ dir
|
|
247
|
-
glob_result = glob.glob(f'{tests_dir}/**/{dir_name}', recursive=True)
|
|
248
|
-
assert len(glob_result) == 1, f'Could not find {dir_name}'
|
|
249
|
-
abs_path = Path(glob_result[0])
|
|
250
|
-
data_file_path = abs_path / file_name
|
|
251
|
-
assert data_file_path.is_file(), f'Not a file: {str(data_file_path)}'
|
|
252
|
-
df = pd.read_csv(str(data_file_path))
|
|
253
|
-
for col_name in path_col_names:
|
|
254
|
-
assert col_name in df.columns
|
|
255
|
-
df[col_name] = df.apply(lambda r: str(abs_path / r[col_name]), axis=1)
|
|
256
|
-
return df.to_dict(orient='records')
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
def get_video_files(include_bad_video: bool = False) -> List[str]:
|
|
260
|
-
tests_dir = os.path.dirname(__file__) # search with respect to tests/ dir
|
|
261
|
-
glob_result = glob.glob(f'{tests_dir}/**/videos/*', recursive=True)
|
|
262
|
-
if not include_bad_video:
|
|
263
|
-
glob_result = [f for f in glob_result if 'bad_video' not in f]
|
|
264
|
-
|
|
265
|
-
half_res = [f for f in glob_result if 'half_res' in f or 'bad_video' in f]
|
|
266
|
-
return half_res
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
def get_test_video_files() -> List[str]:
|
|
270
|
-
tests_dir = os.path.dirname(__file__) # search with respect to tests/ dir
|
|
271
|
-
glob_result = glob.glob(f'{tests_dir}/**/test_videos/*', recursive=True)
|
|
272
|
-
return glob_result
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
def get_image_files(include_bad_image: bool = False) -> List[str]:
|
|
276
|
-
tests_dir = os.path.dirname(__file__) # search with respect to tests/ dir
|
|
277
|
-
glob_result = glob.glob(f'{tests_dir}/**/imagenette2-160/*', recursive=True)
|
|
278
|
-
if not include_bad_image:
|
|
279
|
-
glob_result = [f for f in glob_result if 'bad_image' not in f]
|
|
280
|
-
return glob_result
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
def get_audio_files(include_bad_audio: bool = False) -> List[str]:
|
|
284
|
-
tests_dir = os.path.dirname(__file__)
|
|
285
|
-
glob_result = glob.glob(f'{tests_dir}/**/audio/*', recursive=True)
|
|
286
|
-
if not include_bad_audio:
|
|
287
|
-
glob_result = [f for f in glob_result if 'bad_audio' not in f]
|
|
288
|
-
return glob_result
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
def get_documents() -> List[str]:
|
|
292
|
-
tests_dir = os.path.dirname(__file__)
|
|
293
|
-
# for now, we can only handle .html and .md
|
|
294
|
-
return [p for p in glob.glob(f'{tests_dir}/**/documents/*', recursive=True) if not p.endswith('.pdf')]
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
def get_sentences(n: int = 100) -> List[str]:
|
|
298
|
-
tests_dir = os.path.dirname(__file__)
|
|
299
|
-
path = glob.glob(f'{tests_dir}/**/jeopardy.json', recursive=True)[0]
|
|
300
|
-
with open(path, 'r', encoding='utf8') as f:
|
|
301
|
-
questions_list = json.load(f)
|
|
302
|
-
# this dataset contains \' around the questions
|
|
303
|
-
return [q['question'].replace("'", '') for q in questions_list[:n]]
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
def assert_resultset_eq(r1: DataFrameResultSet, r2: DataFrameResultSet) -> None:
|
|
307
|
-
assert len(r1) == len(r2)
|
|
308
|
-
assert len(r1.column_names()) == len(r2.column_names()) # we don't care about the actual column names
|
|
309
|
-
r1_pd = r1.to_pandas()
|
|
310
|
-
r2_pd = r2.to_pandas()
|
|
311
|
-
for i in range(len(r1.column_names())):
|
|
312
|
-
# only compare column values
|
|
313
|
-
s1 = r1_pd.iloc[:, i]
|
|
314
|
-
s2 = r2_pd.iloc[:, i]
|
|
315
|
-
if s1.dtype == np.float64:
|
|
316
|
-
assert np.allclose(s1, s2)
|
|
317
|
-
else:
|
|
318
|
-
assert s1.equals(s2)
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
def skip_test_if_not_installed(package) -> None:
|
|
322
|
-
if not Env.get().is_installed_package(package):
|
|
323
|
-
pytest.skip(f'Package `{package}` is not installed.')
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
def validate_update_status(status: UpdateStatus, expected_rows: Optional[int] = None) -> None:
|
|
327
|
-
assert status.num_excs == 0
|
|
328
|
-
if expected_rows is not None:
|
|
329
|
-
assert status.num_rows == expected_rows
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
def make_test_arrow_table(output_path: Path) -> None:
|
|
333
|
-
import pyarrow as pa
|
|
334
|
-
|
|
335
|
-
value_dict = {
|
|
336
|
-
'c_id': [1, 2, 3, 4, 5],
|
|
337
|
-
'c_int64': [-10, -20, -30, -40, None],
|
|
338
|
-
'c_int32': [-1, -2, -3, -4, None],
|
|
339
|
-
'c_float32': [1.1, 2.2, 3.3, 4.4, None],
|
|
340
|
-
'c_string': ['aaa', 'bbb', 'ccc', 'ddd', None],
|
|
341
|
-
'c_boolean': [True, False, True, False, None],
|
|
342
|
-
'c_timestamp': [
|
|
343
|
-
datetime.datetime(2012, 1, 1, 12, 0, 0, 25),
|
|
344
|
-
datetime.datetime(2012, 1, 2, 12, 0, 0, 25),
|
|
345
|
-
datetime.datetime(2012, 1, 3, 12, 0, 0, 25),
|
|
346
|
-
datetime.datetime(2012, 1, 4, 12, 0, 0, 25),
|
|
347
|
-
None,
|
|
348
|
-
],
|
|
349
|
-
# The pyarrow fixed_shape_tensor type does not support NULLs (currently can write them but not read them)
|
|
350
|
-
# So, no nulls in this column
|
|
351
|
-
'c_array_float32': [
|
|
352
|
-
[
|
|
353
|
-
1.0,
|
|
354
|
-
2.0,
|
|
355
|
-
],
|
|
356
|
-
[
|
|
357
|
-
10.0,
|
|
358
|
-
20.0,
|
|
359
|
-
],
|
|
360
|
-
[
|
|
361
|
-
100.0,
|
|
362
|
-
200.0,
|
|
363
|
-
],
|
|
364
|
-
[
|
|
365
|
-
1000.0,
|
|
366
|
-
2000.0,
|
|
367
|
-
],
|
|
368
|
-
[10000.0, 20000.0],
|
|
369
|
-
],
|
|
370
|
-
}
|
|
371
|
-
|
|
372
|
-
arr_size = len(value_dict['c_array_float32'][0])
|
|
373
|
-
tensor_type = pa.fixed_shape_tensor(pa.float32(), (arr_size,))
|
|
374
|
-
|
|
375
|
-
schema = pa.schema(
|
|
376
|
-
[
|
|
377
|
-
('c_id', pa.int32()),
|
|
378
|
-
('c_int64', pa.int64()),
|
|
379
|
-
('c_int32', pa.int32()),
|
|
380
|
-
('c_float32', pa.float32()),
|
|
381
|
-
('c_string', pa.string()),
|
|
382
|
-
('c_boolean', pa.bool_()),
|
|
383
|
-
('c_timestamp', pa.timestamp('us')),
|
|
384
|
-
('c_array_float32', tensor_type),
|
|
385
|
-
]
|
|
386
|
-
)
|
|
387
|
-
|
|
388
|
-
test_table = pa.Table.from_pydict(value_dict, schema=schema)
|
|
389
|
-
pa.parquet.write_table(test_table, str(output_path / 'test.parquet'))
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
def assert_hf_dataset_equal(hf_dataset: 'datasets.Dataset', df: pxt.DataFrame, split_column_name: str) -> None:
|
|
393
|
-
import datasets
|
|
394
|
-
assert df.count() == hf_dataset.num_rows
|
|
395
|
-
assert set(df.get_column_names()) == (set(hf_dataset.features.keys()) | {split_column_name})
|
|
396
|
-
|
|
397
|
-
# immutable so we can use it as in a set
|
|
398
|
-
DatasetTuple = namedtuple('DatasetTuple', ' '.join(hf_dataset.features.keys()))
|
|
399
|
-
acc_dataset: Set[DatasetTuple] = set()
|
|
400
|
-
for tup in hf_dataset:
|
|
401
|
-
immutable_tup = {}
|
|
402
|
-
for k in tup:
|
|
403
|
-
if isinstance(tup[k], list):
|
|
404
|
-
immutable_tup[k] = tuple(tup[k])
|
|
405
|
-
else:
|
|
406
|
-
immutable_tup[k] = tup[k]
|
|
407
|
-
|
|
408
|
-
acc_dataset.add(DatasetTuple(**immutable_tup))
|
|
409
|
-
|
|
410
|
-
for tup in df.collect():
|
|
411
|
-
assert tup[split_column_name] in hf_dataset.split._name
|
|
412
|
-
|
|
413
|
-
encoded_tup = {}
|
|
414
|
-
for column_name, value in tup.items():
|
|
415
|
-
if column_name == split_column_name:
|
|
416
|
-
continue
|
|
417
|
-
feature_type = hf_dataset.features[column_name]
|
|
418
|
-
if isinstance(feature_type, datasets.ClassLabel):
|
|
419
|
-
assert value in feature_type.names
|
|
420
|
-
# must use the index of the class label as the value to
|
|
421
|
-
# compare with dataset iteration output.
|
|
422
|
-
value = feature_type.encode_example(value)
|
|
423
|
-
elif isinstance(feature_type, datasets.Sequence):
|
|
424
|
-
assert feature_type.feature.dtype == 'float32', 'may need to add more types'
|
|
425
|
-
value = tuple([float(x) for x in value])
|
|
426
|
-
|
|
427
|
-
encoded_tup[column_name] = value
|
|
428
|
-
|
|
429
|
-
check_tup = DatasetTuple(**encoded_tup)
|
|
430
|
-
assert check_tup in acc_dataset
|
|
431
|
-
|
|
432
|
-
@pxt.expr_udf
|
|
433
|
-
def img_embed(img: PIL.Image.Image) -> np.ndarray:
|
|
434
|
-
return clip_image(img, model_id='openai/clip-vit-base-patch32')
|
|
435
|
-
|
|
436
|
-
@pxt.expr_udf
|
|
437
|
-
def text_embed(txt: str) -> np.ndarray:
|
|
438
|
-
return clip_text(txt, model_id='openai/clip-vit-base-patch32')
|
|
439
|
-
|
|
440
|
-
SAMPLE_IMAGE_URL = \
|
|
441
|
-
'https://raw.githubusercontent.com/pixeltable/pixeltable/master/docs/source/data/images/000000000009.jpg'
|
|
442
|
-
|
pixeltable/utils/clip.py
DELETED
|
@@ -1,18 +0,0 @@
|
|
|
1
|
-
import numpy as np
|
|
2
|
-
import PIL.Image
|
|
3
|
-
|
|
4
|
-
import pixeltable.func as func
|
|
5
|
-
from pixeltable.env import Env
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
def embed_image(img: PIL.Image.Image) -> np.ndarray:
|
|
9
|
-
from pixeltable.functions.nos.image_embedding import openai_clip
|
|
10
|
-
model_info = openai_clip.model_spec
|
|
11
|
-
result = Env.get().nos_client.Run(task=model_info.task, model_name=model_info.name, images=[img.resize((224, 224))])
|
|
12
|
-
return result['embedding'].squeeze(0)
|
|
13
|
-
|
|
14
|
-
def embed_text(text: str) -> np.ndarray:
|
|
15
|
-
from pixeltable.functions.nos.text_embedding import openai_clip
|
|
16
|
-
model_info = openai_clip.model_spec
|
|
17
|
-
result = Env.get().nos_client.Run(task=model_info.task, model_name=model_info.name, texts=[text])
|
|
18
|
-
return result['embedding'].squeeze(0)
|
|
@@ -1,128 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.1
|
|
2
|
-
Name: pixeltable
|
|
3
|
-
Version: 0.2.5
|
|
4
|
-
Summary: Pixeltable: The Multimodal AI Data Plane
|
|
5
|
-
Author: Marcel Kornacker
|
|
6
|
-
Author-email: marcelk@gmail.com
|
|
7
|
-
Requires-Python: >=3.9,<4.0
|
|
8
|
-
Classifier: Programming Language :: Python :: 3
|
|
9
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
10
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
11
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
-
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
-
Requires-Dist: av (>=10.0.0)
|
|
14
|
-
Requires-Dist: beautifulsoup4 (>=4.0.0,<5.0.0)
|
|
15
|
-
Requires-Dist: cloudpickle (>=2.2.1,<3.0.0)
|
|
16
|
-
Requires-Dist: jinja2 (>=3.1.3,<4.0.0)
|
|
17
|
-
Requires-Dist: jmespath (>=1.0.1,<2.0.0)
|
|
18
|
-
Requires-Dist: numpy (>=1.26)
|
|
19
|
-
Requires-Dist: opencv-python-headless (>=4.7.0.68,<5.0.0.0)
|
|
20
|
-
Requires-Dist: pandas (>=2.0,<3.0)
|
|
21
|
-
Requires-Dist: pgserver (==0.1.2)
|
|
22
|
-
Requires-Dist: pgvector (>=0.2.1,<0.3.0)
|
|
23
|
-
Requires-Dist: pillow (>=10.0)
|
|
24
|
-
Requires-Dist: psutil (>=5.9.5,<6.0.0)
|
|
25
|
-
Requires-Dist: psycopg2-binary (>=2.9.5,<3.0.0)
|
|
26
|
-
Requires-Dist: pyyaml (>=6.0.1,<7.0.0)
|
|
27
|
-
Requires-Dist: regex (>=2022.10.31,<2023.0.0)
|
|
28
|
-
Requires-Dist: requests (>=2.31.0,<3.0.0)
|
|
29
|
-
Requires-Dist: sqlalchemy-utils (>=0.41.1,<0.42.0)
|
|
30
|
-
Requires-Dist: sqlalchemy[mypy] (>=2.0.23,<3.0.0)
|
|
31
|
-
Requires-Dist: tenacity (>=8.2,<9.0)
|
|
32
|
-
Requires-Dist: tqdm (>=4.64.1,<5.0.0)
|
|
33
|
-
Description-Content-Type: text/markdown
|
|
34
|
-
|
|
35
|
-
<div align="center">
|
|
36
|
-
<img src="docs/pixeltable-banner.png" width="45%"/>
|
|
37
|
-
|
|
38
|
-
# Unifying Data, Models, and Orchestration for AI Products
|
|
39
|
-
|
|
40
|
-
[License: Apache 2.0](https://opensource.org/licenses/Apache-2.0)
|
|
41
|
-
|
|
42
|
-

|
|
43
|
-
|
|
44
|
-
[Installation](https://pixeltable.github.io/pixeltable/getting-started/) | [Documentation](https://pixeltable.github.io/pixeltable/)
|
|
45
|
-
</div>
|
|
46
|
-
|
|
47
|
-
Pixeltable is a Python library that lets AI engineers and data scientists focus on exploration, modeling, and app development without dealing with the customary data plumbing.
|
|
48
|
-
|
|
49
|
-
## What problems does Pixeltable solve?
|
|
50
|
-
|
|
51
|
-
Today’s solutions for AI app development require extensive custom coding and infrastructure
|
|
52
|
-
plumbing. Tracking lineage and versions between and across data transformations, models, and
|
|
53
|
-
deployment is cumbersome. Pixeltable is a replacement for traditional data plumbing, providing
|
|
54
|
-
a unified plane for data, models, and orchestration. It removes the data plumbing overhead in
|
|
55
|
-
building and productionizing AI applications.
|
|
56
|
-
|
|
57
|
-
## ⚡ Quick Start
|
|
58
|
-
Learn the basics of Pixeltable through interactive examples. View the notebooks on Google Colab or Kaggle, for free.
|
|
59
|
-
|
|
60
|
-
### Pixeltable Basics
|
|
61
|
-
In this tutorial, we'll survey how to create tables, populate them with data, and enhance them with built-in and user-defined transformations and AI operations.
|
|
62
|
-
|
|
63
|
-
[Open in Kaggle](https://kaggle.com/kernels/welcome?src=https://github.com/pixeltable/pixeltable/blob/master/docs/tutorials/pixeltable-basics.ipynb)
|
|
64
|
-
<a target="_blank" href="https://colab.research.google.com/github/pixeltable/pixeltable/blob/master/docs/tutorials/pixeltable-basics.ipynb"> <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/> </a>
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
## 💾 Installation
|
|
68
|
-
Pixeltable works with Python 3.9, 3.10, 3.11, or 3.12 running on Linux, macOS, or Windows.
|
|
69
|
-
|
|
70
|
-
```
|
|
71
|
-
pip install pixeltable
|
|
72
|
-
```
|
|
73
|
-
|
|
74
|
-
To verify that it's working:
|
|
75
|
-
|
|
76
|
-
```
|
|
77
|
-
import pixeltable as pxt
|
|
78
|
-
cl = pxt.Client()
|
|
79
|
-
```
|
|
80
|
-
|
|
81
|
-
For more detailed installation instructions, see the
|
|
82
|
-
[Getting Started with Pixeltable](https://pixeltable.github.io/pixeltable/getting-started/)
|
|
83
|
-
guide. Then, check out the
|
|
84
|
-
[Pixeltable Basics](https://pixeltable.github.io/pixeltable/tutorials/pixeltable-basics/)
|
|
85
|
-
tutorial for a tour of its most important features.
|
|
86
|
-
|
|
87
|
-
## Why should you use Pixeltable?
|
|
88
|
-
|
|
89
|
-
- It gives you transparency and reproducibility
|
|
90
|
-
- All generated data is automatically recorded and versioned
|
|
91
|
-
- You will never need to re-run a workload because you lost track of the input data
|
|
92
|
-
- It saves you money
|
|
93
|
-
- All data changes are automatically incremental
|
|
94
|
-
- You never need to re-run pipelines from scratch because you’re adding data
|
|
95
|
-
- It integrates with any existing Python code or libraries
|
|
96
|
-
- Bring your ever-changing code and workloads
|
|
97
|
-
- You choose the models, tools, and AI practices (e.g., your embedding model for a vector index); Pixeltable orchestrates the data
|
|
98
|
-
|
|
99
|
-
## Example Use Cases
|
|
100
|
-
|
|
101
|
-
* Interact with video data at the frame level without having to think about frame extraction,
|
|
102
|
-
intermediate file storage, or storage space explosion.
|
|
103
|
-
* Augment your data incrementally and interactively with built-in functions and UDFs, such as
|
|
104
|
-
image transformations, model inference, and visualizations, without having to think about data pipelines,
|
|
105
|
-
incremental updates, or capturing function output.
|
|
106
|
-
* Interact with all the data relevant to your AI application (video, images, documents, audio, structured data, JSON) through
|
|
107
|
-
a simple dataframe-style API directly in Python. This includes:
|
|
108
|
-
* similarity search on embeddings, supported by high-dimensional vector indexing
|
|
109
|
-
* path expressions and transformations on JSON data
|
|
110
|
-
* PIL and OpenCV image operations
|
|
111
|
-
* assembling frames into videos
|
|
112
|
-
* Perform keyword and image similarity search at the video frame level without having to worry about frame
|
|
113
|
-
storage.
|
|
114
|
-
* Access all Pixeltable-resident data directly as a PyTorch dataset in your training scripts.
|
|
115
|
-
* Understand the compute and storage costs of your data at the granularity of individual augmentations and
|
|
116
|
-
get cost projections before adding new data and new augmentations.
|
|
117
|
-
* Rely on Pixeltable's automatic versioning and snapshot functionality to protect against regressions
|
|
118
|
-
and to ensure reproducibility.
|
|
119
|
-
|
|
120
|
-
## Contributions & Feedback
|
|
121
|
-
|
|
122
|
-
Are you experiencing issues or bugs with Pixeltable? File an [Issue](https://github.com/pixeltable/pixeltable/issues).
|
|
123
|
-
<br/>Do you want to contribute? Feel free to open a [PR](https://github.com/pixeltable/pixeltable/pulls).
|
|
124
|
-
|
|
125
|
-
## :classical_building: License
|
|
126
|
-
|
|
127
|
-
This library is licensed under the Apache 2.0 License.
|
|
128
|
-
|