pixeltable 0.1.0__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +34 -6
- pixeltable/catalog/__init__.py +13 -0
- pixeltable/catalog/catalog.py +159 -0
- pixeltable/catalog/column.py +200 -0
- pixeltable/catalog/dir.py +32 -0
- pixeltable/catalog/globals.py +33 -0
- pixeltable/catalog/insertable_table.py +191 -0
- pixeltable/catalog/named_function.py +36 -0
- pixeltable/catalog/path.py +58 -0
- pixeltable/catalog/path_dict.py +139 -0
- pixeltable/catalog/schema_object.py +39 -0
- pixeltable/catalog/table.py +581 -0
- pixeltable/catalog/table_version.py +749 -0
- pixeltable/catalog/table_version_path.py +133 -0
- pixeltable/catalog/view.py +203 -0
- pixeltable/client.py +590 -30
- pixeltable/dataframe.py +540 -349
- pixeltable/env.py +359 -45
- pixeltable/exceptions.py +12 -21
- pixeltable/exec/__init__.py +9 -0
- pixeltable/exec/aggregation_node.py +78 -0
- pixeltable/exec/cache_prefetch_node.py +116 -0
- pixeltable/exec/component_iteration_node.py +79 -0
- pixeltable/exec/data_row_batch.py +95 -0
- pixeltable/exec/exec_context.py +22 -0
- pixeltable/exec/exec_node.py +61 -0
- pixeltable/exec/expr_eval_node.py +217 -0
- pixeltable/exec/in_memory_data_node.py +69 -0
- pixeltable/exec/media_validation_node.py +43 -0
- pixeltable/exec/sql_scan_node.py +225 -0
- pixeltable/exprs/__init__.py +24 -0
- pixeltable/exprs/arithmetic_expr.py +102 -0
- pixeltable/exprs/array_slice.py +71 -0
- pixeltable/exprs/column_property_ref.py +77 -0
- pixeltable/exprs/column_ref.py +105 -0
- pixeltable/exprs/comparison.py +77 -0
- pixeltable/exprs/compound_predicate.py +98 -0
- pixeltable/exprs/data_row.py +195 -0
- pixeltable/exprs/expr.py +586 -0
- pixeltable/exprs/expr_set.py +39 -0
- pixeltable/exprs/function_call.py +380 -0
- pixeltable/exprs/globals.py +69 -0
- pixeltable/exprs/image_member_access.py +115 -0
- pixeltable/exprs/image_similarity_predicate.py +58 -0
- pixeltable/exprs/inline_array.py +107 -0
- pixeltable/exprs/inline_dict.py +101 -0
- pixeltable/exprs/is_null.py +38 -0
- pixeltable/exprs/json_mapper.py +121 -0
- pixeltable/exprs/json_path.py +159 -0
- pixeltable/exprs/literal.py +54 -0
- pixeltable/exprs/object_ref.py +41 -0
- pixeltable/exprs/predicate.py +44 -0
- pixeltable/exprs/row_builder.py +355 -0
- pixeltable/exprs/rowid_ref.py +94 -0
- pixeltable/exprs/type_cast.py +53 -0
- pixeltable/exprs/variable.py +45 -0
- pixeltable/func/__init__.py +9 -0
- pixeltable/func/aggregate_function.py +194 -0
- pixeltable/func/batched_function.py +53 -0
- pixeltable/func/callable_function.py +69 -0
- pixeltable/func/expr_template_function.py +82 -0
- pixeltable/func/function.py +110 -0
- pixeltable/func/function_registry.py +227 -0
- pixeltable/func/globals.py +36 -0
- pixeltable/func/nos_function.py +202 -0
- pixeltable/func/signature.py +166 -0
- pixeltable/func/udf.py +163 -0
- pixeltable/functions/__init__.py +52 -103
- pixeltable/functions/eval.py +216 -0
- pixeltable/functions/fireworks.py +34 -0
- pixeltable/functions/huggingface.py +120 -0
- pixeltable/functions/image.py +16 -0
- pixeltable/functions/openai.py +256 -0
- pixeltable/functions/pil/image.py +148 -7
- pixeltable/functions/string.py +13 -0
- pixeltable/functions/together.py +122 -0
- pixeltable/functions/util.py +41 -0
- pixeltable/functions/video.py +62 -0
- pixeltable/iterators/__init__.py +3 -0
- pixeltable/iterators/base.py +48 -0
- pixeltable/iterators/document.py +311 -0
- pixeltable/iterators/video.py +89 -0
- pixeltable/metadata/__init__.py +54 -0
- pixeltable/metadata/converters/convert_10.py +18 -0
- pixeltable/metadata/schema.py +211 -0
- pixeltable/plan.py +656 -0
- pixeltable/store.py +418 -182
- pixeltable/tests/conftest.py +146 -88
- pixeltable/tests/functions/test_fireworks.py +42 -0
- pixeltable/tests/functions/test_functions.py +60 -0
- pixeltable/tests/functions/test_huggingface.py +158 -0
- pixeltable/tests/functions/test_openai.py +152 -0
- pixeltable/tests/functions/test_together.py +111 -0
- pixeltable/tests/test_audio.py +65 -0
- pixeltable/tests/test_catalog.py +27 -0
- pixeltable/tests/test_client.py +14 -14
- pixeltable/tests/test_component_view.py +370 -0
- pixeltable/tests/test_dataframe.py +439 -0
- pixeltable/tests/test_dirs.py +78 -62
- pixeltable/tests/test_document.py +120 -0
- pixeltable/tests/test_exprs.py +592 -135
- pixeltable/tests/test_function.py +297 -67
- pixeltable/tests/test_migration.py +43 -0
- pixeltable/tests/test_nos.py +54 -0
- pixeltable/tests/test_snapshot.py +208 -0
- pixeltable/tests/test_table.py +1195 -263
- pixeltable/tests/test_transactional_directory.py +42 -0
- pixeltable/tests/test_types.py +5 -11
- pixeltable/tests/test_video.py +151 -34
- pixeltable/tests/test_view.py +530 -0
- pixeltable/tests/utils.py +320 -45
- pixeltable/tool/create_test_db_dump.py +149 -0
- pixeltable/tool/create_test_video.py +81 -0
- pixeltable/type_system.py +445 -124
- pixeltable/utils/__init__.py +17 -46
- pixeltable/utils/arrow.py +98 -0
- pixeltable/utils/clip.py +12 -15
- pixeltable/utils/coco.py +136 -0
- pixeltable/utils/documents.py +39 -0
- pixeltable/utils/filecache.py +195 -0
- pixeltable/utils/help.py +11 -0
- pixeltable/utils/hf_datasets.py +157 -0
- pixeltable/utils/media_store.py +76 -0
- pixeltable/utils/parquet.py +167 -0
- pixeltable/utils/pytorch.py +91 -0
- pixeltable/utils/s3.py +13 -0
- pixeltable/utils/sql.py +17 -0
- pixeltable/utils/transactional_directory.py +35 -0
- pixeltable-0.2.4.dist-info/LICENSE +18 -0
- pixeltable-0.2.4.dist-info/METADATA +127 -0
- pixeltable-0.2.4.dist-info/RECORD +132 -0
- {pixeltable-0.1.0.dist-info → pixeltable-0.2.4.dist-info}/WHEEL +1 -1
- pixeltable/catalog.py +0 -1421
- pixeltable/exprs.py +0 -1745
- pixeltable/function.py +0 -269
- pixeltable/functions/clip.py +0 -10
- pixeltable/functions/pil/__init__.py +0 -23
- pixeltable/functions/tf.py +0 -21
- pixeltable/index.py +0 -57
- pixeltable/tests/test_dict.py +0 -24
- pixeltable/tests/test_functions.py +0 -11
- pixeltable/tests/test_tf.py +0 -69
- pixeltable/tf.py +0 -33
- pixeltable/utils/tf.py +0 -33
- pixeltable/utils/video.py +0 -32
- pixeltable-0.1.0.dist-info/METADATA +0 -34
- pixeltable-0.1.0.dist-info/RECORD +0 -36
pixeltable/tests/utils.py
CHANGED
|
@@ -1,16 +1,34 @@
|
|
|
1
1
|
import datetime
|
|
2
2
|
import glob
|
|
3
|
+
import json
|
|
3
4
|
import os
|
|
5
|
+
from collections import namedtuple
|
|
4
6
|
from pathlib import Path
|
|
5
|
-
from typing import Dict,
|
|
7
|
+
from typing import Any, Dict, List, Optional, Set
|
|
6
8
|
|
|
7
9
|
import numpy as np
|
|
8
10
|
import pandas as pd
|
|
11
|
+
import pytest
|
|
9
12
|
|
|
10
|
-
import pixeltable as
|
|
13
|
+
import pixeltable as pxt
|
|
14
|
+
import pixeltable.type_system as ts
|
|
11
15
|
from pixeltable import catalog
|
|
12
|
-
from pixeltable.
|
|
13
|
-
from pixeltable.
|
|
16
|
+
from pixeltable.catalog.globals import UpdateStatus
|
|
17
|
+
from pixeltable.dataframe import DataFrameResultSet
|
|
18
|
+
from pixeltable.env import Env
|
|
19
|
+
from pixeltable.type_system import (
|
|
20
|
+
ArrayType,
|
|
21
|
+
BoolType,
|
|
22
|
+
ColumnType,
|
|
23
|
+
FloatType,
|
|
24
|
+
ImageType,
|
|
25
|
+
IntType,
|
|
26
|
+
JsonType,
|
|
27
|
+
StringType,
|
|
28
|
+
TimestampType,
|
|
29
|
+
VideoType,
|
|
30
|
+
)
|
|
31
|
+
|
|
14
32
|
|
|
15
33
|
def make_default_type(t: ColumnType.Type) -> ColumnType:
|
|
16
34
|
if t == ColumnType.Type.STRING:
|
|
@@ -25,14 +43,19 @@ def make_default_type(t: ColumnType.Type) -> ColumnType:
|
|
|
25
43
|
return TimestampType()
|
|
26
44
|
assert False
|
|
27
45
|
|
|
28
|
-
def make_tbl(
|
|
29
|
-
|
|
46
|
+
def make_tbl(cl: pxt.Client, name: str = 'test', col_names: Optional[List[str]] = None) -> catalog.InsertableTable:
    """Create table `name` with one column per entry of `col_names` (default: the single column 'c1').

    The column at position `idx` gets the type make_default_type(ColumnType.Type(idx % 5)).
    """
    names = ['c1'] if col_names is None else col_names
    schema: Dict[str, ts.ColumnType] = {
        f'{col}': make_default_type(ColumnType.Type(idx % 5)) for idx, col in enumerate(names)
    }
    return cl.create_table(name, schema)
|
|
33
53
|
|
|
34
|
-
def create_table_data(t: catalog.Table, col_names: List[str] =
|
|
54
|
+
def create_table_data(t: catalog.Table, col_names: Optional[List[str]] = None, num_rows: int = 10) -> List[Dict[str, Any]]:
|
|
55
|
+
if col_names is None:
|
|
56
|
+
col_names = []
|
|
35
57
|
data: Dict[str, Any] = {}
|
|
58
|
+
|
|
36
59
|
sample_dict = {
|
|
37
60
|
'detections': [{
|
|
38
61
|
'id': '637e8e073b28441a453564cf',
|
|
@@ -70,38 +93,138 @@ def create_table_data(t: catalog.Table, col_names: List[str] = [], num_rows: int
|
|
|
70
93
|
}
|
|
71
94
|
|
|
72
95
|
if len(col_names) == 0:
|
|
73
|
-
col_names = [c.name for c in t.columns]
|
|
96
|
+
col_names = [c.name for c in t.columns() if not c.is_computed]
|
|
74
97
|
|
|
98
|
+
col_types = t.column_types()
|
|
75
99
|
for col_name in col_names:
|
|
76
|
-
|
|
100
|
+
col_type = col_types[col_name]
|
|
77
101
|
col_data: Any = None
|
|
78
|
-
if
|
|
102
|
+
if col_type.is_string_type():
|
|
79
103
|
col_data = ['test string'] * num_rows
|
|
80
|
-
if
|
|
81
|
-
col_data = np.random.randint(0, 100, size=num_rows)
|
|
82
|
-
if
|
|
83
|
-
col_data = np.random.random(size=num_rows) * 100
|
|
84
|
-
if
|
|
104
|
+
if col_type.is_int_type():
|
|
105
|
+
col_data = np.random.randint(0, 100, size=num_rows).tolist()
|
|
106
|
+
if col_type.is_float_type():
|
|
107
|
+
col_data = (np.random.random(size=num_rows) * 100).tolist()
|
|
108
|
+
if col_type.is_bool_type():
|
|
85
109
|
col_data = np.random.randint(0, 2, size=num_rows)
|
|
86
110
|
col_data = [False if i == 0 else True for i in col_data]
|
|
87
|
-
if
|
|
88
|
-
col_data = datetime.datetime.now()
|
|
89
|
-
if
|
|
111
|
+
if col_type.is_timestamp_type():
|
|
112
|
+
col_data = [datetime.datetime.now()] * num_rows
|
|
113
|
+
if col_type.is_json_type():
|
|
90
114
|
col_data = [sample_dict] * num_rows
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
115
|
+
if col_type.is_array_type():
|
|
116
|
+
col_data = [np.ones(col_type.shape, dtype=col_type.numpy_dtype()) for i in range(num_rows)]
|
|
117
|
+
if col_type.is_image_type():
|
|
118
|
+
image_path = get_image_files()[0]
|
|
119
|
+
col_data = [image_path for i in range(num_rows)]
|
|
120
|
+
if col_type.is_video_type():
|
|
121
|
+
video_path = get_video_files()[0]
|
|
122
|
+
col_data = [video_path for i in range(num_rows)]
|
|
123
|
+
data[col_name] = col_data
|
|
124
|
+
rows = [{col_name: data[col_name][i] for col_name in col_names} for i in range(num_rows)]
|
|
125
|
+
return rows
|
|
126
|
+
|
|
127
|
+
def create_test_tbl(client: pxt.Client, name: str = 'test_tbl') -> catalog.Table:
    """Create table `name` (primary key 'c2'), add column c8 and insert 100 rows of synthetic data.

    Columns c1..c7 come from the schema; c8 is added via add_column() with the value [[1, 2, 3], [4, 5, 6]].
    'c1n' is None in every 10th row and equal to 'c1' otherwise.
    """
    t = client.create_table(
        name,
        {
            'c1': StringType(nullable=False),
            'c1n': StringType(nullable=True),
            'c2': IntType(nullable=False),
            'c3': FloatType(nullable=False),
            'c4': BoolType(nullable=False),
            'c5': TimestampType(nullable=False),
            'c6': JsonType(nullable=False),
            'c7': JsonType(nullable=False),
        },
        primary_key='c2',
    )
    t.add_column(c8=[[1, 2, 3], [4, 5, 6]])

    num_rows = 100
    # c7 holds the same two-element list of this dict in every row
    nested_sample = {
        'f1': 'test string 1',
        'f2': 1,
        'f3': 1.0,
        'f4': True,
        'f5': [1.0, 2.0, 3.0, 4.0],
        'f6': {
            'f7': 'test string 2',
            'f8': [1.0, 2.0, 3.0, 4.0],
        },
    }
    sample_pair = [nested_sample, nested_sample]
    # a single timestamp, shared by all rows
    timestamp = datetime.datetime.now()

    rows = []
    for idx in range(num_rows):
        text = f'test string {idx}'
        rows.append(
            {
                'c1': text,
                'c1n': None if idx % 10 == 0 else text,
                'c2': idx,
                'c3': float(idx),
                'c4': bool(idx % 2),
                'c5': timestamp,
                'c6': {
                    'f1': f'test string {idx}',
                    'f2': idx,
                    'f3': float(idx),
                    'f4': bool(idx % 2),
                    'f5': [1.0, 2.0, 3.0, 4.0],
                    'f6': {
                        'f7': 'test string 2',
                        'f8': [1.0, 2.0, 3.0, 4.0],
                    },
                },
                'c7': sample_pair,
            }
        )
    t.insert(rows)
    return t
|
|
191
|
+
|
|
192
|
+
def create_all_datatypes_tbl(test_client: pxt.Client) -> catalog.Table:
    """Create the table 'all_datatype_tbl' with one column per supported datatype and insert 11 rows.

    The rows are generated by create_table_data(); 'row_id' is then set to each row's position.
    """
    column_types = {
        'row_id': IntType(nullable=False),  # used for row selection
        'c_array': ArrayType(shape=(10,), dtype=FloatType(), nullable=True),
        'c_bool': BoolType(nullable=True),
        'c_float': FloatType(nullable=True),
        'c_image': ImageType(nullable=True),
        'c_int': IntType(nullable=True),
        'c_json': JsonType(nullable=True),
        'c_string': StringType(nullable=True),
        'c_timestamp': TimestampType(nullable=True),
        'c_video': VideoType(nullable=True),
    }
    tbl = test_client.create_table('all_datatype_tbl', column_types)
    example_rows = create_table_data(tbl, num_rows=11)

    for position, row in enumerate(example_rows):
        row.update(row_id=position)

    tbl.insert(example_rows)
    return tbl
|
|
215
|
+
|
|
216
|
+
def read_data_file(dir_name: str, file_name: str, path_col_names: Optional[List[str]] = None) -> List[Dict[str, Any]]:
|
|
98
217
|
"""
|
|
99
218
|
Locate dir_name, create df out of file_name.
|
|
100
|
-
transform columns 'file_name' to column 'file_path' with absolute paths
|
|
101
219
|
path_col_names: col names in csv file that contain file names; those will be converted to absolute paths
|
|
102
220
|
by adding the path to 'file_name' as a prefix.
|
|
221
|
+
Returns:
|
|
222
|
+
list of rows, one dict per row mapping column name to value
|
|
103
223
|
"""
|
|
104
|
-
|
|
224
|
+
if path_col_names is None:
|
|
225
|
+
path_col_names = []
|
|
226
|
+
tests_dir = os.path.dirname(__file__) # search with respect to tests/ dir
|
|
227
|
+
glob_result = glob.glob(f'{tests_dir}/**/{dir_name}', recursive=True)
|
|
105
228
|
assert len(glob_result) == 1, f'Could not find {dir_name}'
|
|
106
229
|
abs_path = Path(glob_result[0])
|
|
107
230
|
data_file_path = abs_path / file_name
|
|
@@ -110,24 +233,176 @@ def read_data_file(dir_name: str, file_name: str, path_col_names: List[str] = []
|
|
|
110
233
|
for col_name in path_col_names:
|
|
111
234
|
assert col_name in df.columns
|
|
112
235
|
df[col_name] = df.apply(lambda r: str(abs_path / r[col_name]), axis=1)
|
|
113
|
-
|
|
236
|
+
return df.to_dict(orient='records')
|
|
237
|
+
|
|
238
|
+
def get_video_files(include_bad_video: bool = False) -> List[str]:
    """Return the sorted paths of the sample videos in any `videos/` directory below the tests dir.

    Only files whose path contains 'half_res' are returned. Files whose path contains 'bad_video' are
    excluded unless `include_bad_video` is True, in which case they are returned as well.

    The result is sorted because glob yields paths in arbitrary, filesystem-dependent order and callers
    pick fixtures by index (e.g. `get_video_files()[0]`).
    """
    tests_dir = os.path.dirname(__file__)  # search with respect to tests/ dir
    glob_result = glob.glob(f'{tests_dir}/**/videos/*', recursive=True)
    if not include_bad_video:
        glob_result = [f for f in glob_result if 'bad_video' not in f]

    return sorted(f for f in glob_result if 'half_res' in f or 'bad_video' in f)
|
|
114
246
|
|
|
115
|
-
def
|
|
116
|
-
|
|
247
|
+
def get_test_video_files() -> List[str]:
    """Return the sorted paths of all files in any `test_videos/` directory below the tests dir.

    Sorted because glob yields paths in arbitrary, filesystem-dependent order.
    """
    tests_dir = os.path.dirname(__file__)  # search with respect to tests/ dir
    return sorted(glob.glob(f'{tests_dir}/**/test_videos/*', recursive=True))
|
|
118
251
|
|
|
252
|
+
def get_image_files(include_bad_image: bool = False) -> List[str]:
    """Return the sorted paths of all files in any `imagenette2-160/` directory below the tests dir.

    Files whose path contains 'bad_image' are excluded unless `include_bad_image` is True.

    The result is sorted because glob yields paths in arbitrary, filesystem-dependent order and callers
    pick fixtures by index (e.g. `get_image_files()[0]`).
    """
    tests_dir = os.path.dirname(__file__)  # search with respect to tests/ dir
    glob_result = glob.glob(f'{tests_dir}/**/imagenette2-160/*', recursive=True)
    if not include_bad_image:
        glob_result = [f for f in glob_result if 'bad_image' not in f]
    return sorted(glob_result)
|
|
258
|
+
|
|
259
|
+
def get_audio_files(include_bad_audio: bool = False) -> List[str]:
    """Return the sorted paths of all files in any `audio/` directory below the tests dir.

    Files whose path contains 'bad_audio' are excluded unless `include_bad_audio` is True.
    Sorted because glob yields paths in arbitrary, filesystem-dependent order.
    """
    tests_dir = os.path.dirname(__file__)  # search with respect to tests/ dir
    glob_result = glob.glob(f'{tests_dir}/**/audio/*', recursive=True)
    if not include_bad_audio:
        glob_result = [f for f in glob_result if 'bad_audio' not in f]
    return sorted(glob_result)
|
|
265
|
+
|
|
266
|
+
def get_documents() -> List[str]:
    """Return the sorted paths of all non-.pdf files in any `documents/` directory below the tests dir.

    Sorted because glob yields paths in arbitrary, filesystem-dependent order.
    """
    tests_dir = os.path.dirname(__file__)  # search with respect to tests/ dir
    # for now, we can only handle .html and .md
    candidates = glob.glob(f'{tests_dir}/**/documents/*', recursive=True)
    return sorted(p for p in candidates if not p.endswith('.pdf'))
|
|
270
|
+
|
|
271
|
+
def get_sentences(n: int = 100) -> List[str]:
    """Return the 'question' strings of the first `n` entries of jeopardy.json, with all apostrophes removed.

    jeopardy.json is searched for recursively below the tests dir. If several files match, the first one in
    sorted order is used (glob itself yields paths in arbitrary order).

    Raises:
        AssertionError: if no jeopardy.json is found below the tests dir.
    """
    tests_dir = os.path.dirname(__file__)  # search with respect to tests/ dir
    matches = sorted(glob.glob(f'{tests_dir}/**/jeopardy.json', recursive=True))
    # fail with a readable message instead of a bare IndexError on matches[0]
    assert len(matches) > 0, 'Could not find jeopardy.json'
    with open(matches[0], 'r', encoding='utf8') as f:
        questions_list = json.load(f)
    # this dataset contains \' around the questions
    return [q['question'].replace("'", '') for q in questions_list[:n]]
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def assert_resultset_eq(r1: 'DataFrameResultSet', r2: 'DataFrameResultSet') -> None:
    """Assert that two result sets have the same number of rows and columns and the same column values.

    Columns are compared by position; column names are ignored. float64 columns are compared with
    np.allclose(), everything else with Series.equals().

    NaNs at the same position count as equal in float64 columns (equal_nan=True). Series.equals() already
    behaves that way, and without it two identical result sets containing null floats would compare unequal.
    """
    assert len(r1) == len(r2)
    assert len(r1.column_names()) == len(r2.column_names())  # we don't care about the actual column names
    r1_pd = r1.to_pandas()
    r2_pd = r2.to_pandas()
    for i in range(len(r1.column_names())):
        # only compare column values
        s1 = r1_pd.iloc[:, i]
        s2 = r2_pd.iloc[:, i]
        if s1.dtype == np.float64:
            assert np.allclose(s1, s2, equal_nan=True)
        else:
            assert s1.equals(s2)
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
def skip_test_if_not_installed(package) -> None:
    """Skip the calling test via pytest.skip() unless Env reports `package` as installed."""
    if Env.get().is_installed_package(package):
        return
    pytest.skip(f'Package `{package}` is not installed.')
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
def validate_update_status(status: UpdateStatus, expected_rows: Optional[int] = None) -> None:
    """Assert that `status` reports no exceptions and, if `expected_rows` is given, that many rows."""
    assert status.num_excs == 0
    if expected_rows is None:
        return
    assert status.num_rows == expected_rows
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
def make_test_arrow_table(output_path: Path) -> None:
    """Write a 5-row parquet file named 'test.parquet' into the directory `output_path`.

    The table has one column per tested arrow type. The last row is None in every column except 'c_id'
    and 'c_array_float32'.
    """
    import pyarrow as pa
    # pyarrow.parquet is a submodule that `import pyarrow` does not load; import it explicitly instead of
    # relying on some other module having imported it already
    import pyarrow.parquet as pq

    value_dict = {
        'c_id': [1, 2, 3, 4, 5],
        'c_int64': [-10, -20, -30, -40, None],
        'c_int32': [-1, -2, -3, -4, None],
        'c_float32': [1.1, 2.2, 3.3, 4.4, None],
        'c_string': ['aaa', 'bbb', 'ccc', 'ddd', None],
        'c_boolean': [True, False, True, False, None],
        'c_timestamp': [
            datetime.datetime(2012, 1, 1, 12, 0, 0, 25),
            datetime.datetime(2012, 1, 2, 12, 0, 0, 25),
            datetime.datetime(2012, 1, 3, 12, 0, 0, 25),
            datetime.datetime(2012, 1, 4, 12, 0, 0, 25),
            None,
        ],
        # The pyarrow fixed_shape_tensor type does not support NULLs (currently can write them but not read them)
        # So, no nulls in this column
        'c_array_float32': [
            [1.0, 2.0],
            [10.0, 20.0],
            [100.0, 200.0],
            [1000.0, 2000.0],
            [10000.0, 20000.0],
        ],
    }

    # the tensor shape is taken from the first array value
    arr_size = len(value_dict['c_array_float32'][0])
    tensor_type = pa.fixed_shape_tensor(pa.float32(), (arr_size,))

    schema = pa.schema(
        [
            ('c_id', pa.int32()),
            ('c_int64', pa.int64()),
            ('c_int32', pa.int32()),
            ('c_float32', pa.float32()),
            ('c_string', pa.string()),
            ('c_boolean', pa.bool_()),
            ('c_timestamp', pa.timestamp('us')),
            ('c_array_float32', tensor_type),
        ]
    )

    test_table = pa.Table.from_pydict(value_dict, schema=schema)
    pq.write_table(test_table, str(output_path / 'test.parquet'))
|
|
364
|
+
|
|
365
|
+
|
|
366
|
+
def assert_hf_dataset_equal(hf_dataset: 'datasets.Dataset', df: pxt.DataFrame, split_column_name: str) -> None:
    """Assert that `df` holds the rows of `hf_dataset` plus a split column.

    Checks the row count, the set of column names (dataset features plus `split_column_name`), that each
    row's split value is contained in the dataset's split name, and that each row of `df` (minus the split
    column) occurs in the dataset. ClassLabel values are encoded to their index and Sequence values are
    converted to tuples of floats before the comparison.
    """
    import datasets

    assert df.count() == hf_dataset.num_rows
    assert set(df.get_column_names()) == (set(hf_dataset.features.keys()) | {split_column_name})

    # namedtuples are hashable, so dataset rows can be collected in a set
    DatasetTuple = namedtuple('DatasetTuple', ' '.join(hf_dataset.features.keys()))
    expected_rows: Set[DatasetTuple] = set()
    for example in hf_dataset:
        # lists are not hashable: turn them into tuples
        hashable = {key: tuple(example[key]) if isinstance(example[key], list) else example[key] for key in example}
        expected_rows.add(DatasetTuple(**hashable))

    for row in df.collect():
        assert row[split_column_name] in hf_dataset.split._name

        encoded = {}
        for column_name, value in row.items():
            if column_name != split_column_name:
                feature_type = hf_dataset.features[column_name]
                if isinstance(feature_type, datasets.ClassLabel):
                    assert value in feature_type.names
                    # must use the index of the class label as the value to
                    # compare with dataset iteration output.
                    value = feature_type.encode_example(value)
                elif isinstance(feature_type, datasets.Sequence):
                    assert feature_type.feature.dtype == 'float32', 'may need to add more types'
                    value = tuple(float(x) for x in value)

                encoded[column_name] = value

        assert DatasetTuple(**encoded) in expected_rows
|
|
405
|
+
|
|
119
406
|
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
self.sum = 0
|
|
123
|
-
@classmethod
|
|
124
|
-
def make_aggregator(cls) -> 'SumAggregator':
|
|
125
|
-
return cls()
|
|
126
|
-
def update(self, val: int) -> None:
|
|
127
|
-
self.sum += val
|
|
128
|
-
def value(self) -> int:
|
|
129
|
-
return self.sum
|
|
130
|
-
|
|
131
|
-
sum_uda = Function(
|
|
132
|
-
IntType(), [IntType()],
|
|
133
|
-
init_fn=SumAggregator.make_aggregator, update_fn=SumAggregator.update, value_fn=SumAggregator.value)
|
|
407
|
+
# URL of a sample image in the pixeltable GitHub repository (served via raw.githubusercontent.com);
# wrapped in parentheses rather than continued with a backslash
SAMPLE_IMAGE_URL = (
    'https://raw.githubusercontent.com/pixeltable/pixeltable/master/docs/source/data/images/000000000009.jpg'
)
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
import datetime
|
|
2
|
+
import json
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
import pathlib
|
|
6
|
+
import subprocess
|
|
7
|
+
|
|
8
|
+
import pgserver
|
|
9
|
+
import toml
|
|
10
|
+
|
|
11
|
+
import pixeltable as pxt
|
|
12
|
+
import pixeltable.metadata as metadata
|
|
13
|
+
from pixeltable.env import Env
|
|
14
|
+
from pixeltable.type_system import \
|
|
15
|
+
StringType, IntType, FloatType, BoolType, TimestampType, JsonType
|
|
16
|
+
|
|
17
|
+
_logger = logging.getLogger('pixeltable')
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class Dumper:
    """Creates a sample pixeltable database and dumps it to disk as a test artifact.

    Typical use: construct, call create_tables(), then dump_db().
    """

    def __init__(self, output_dir: str = 'target', db_name: str = 'pxtdump') -> None:
        """Point the pixeltable environment at a mock home dir and (re)initialize the database.

        Args:
            output_dir: directory that receives the dump files; the mock home dir is `<output_dir>/.pixeltable`
            db_name: value assigned to the PIXELTABLE_DB environment variable

        Note: this overwrites the process-wide environment variables PIXELTABLE_HOME, PIXELTABLE_CONFIG,
        PIXELTABLE_DB and PIXELTABLE_PGDATA.
        """
        self.output_dir = pathlib.Path(output_dir)
        # config and pgdata keep pointing at the pre-existing home; only PIXELTABLE_HOME is redirected
        shared_home = pathlib.Path(os.environ.get('PIXELTABLE_HOME', '~/.pixeltable')).expanduser()
        mock_home_dir = self.output_dir / '.pixeltable'
        mock_home_dir.mkdir(parents=True, exist_ok=True)
        os.environ['PIXELTABLE_HOME'] = str(mock_home_dir)
        os.environ['PIXELTABLE_CONFIG'] = str(shared_home / 'config.yaml')
        os.environ['PIXELTABLE_DB'] = db_name
        os.environ['PIXELTABLE_PGDATA'] = str(shared_home / 'pgdata')

        Env.get().set_up(reinit_db=True)
        self.cl = pxt.Client()
        self.cl.logging(level=logging.DEBUG, to_stdout=True)

    def dump_db(self) -> None:
        """Write a gzip-compressed pg_dump of the database plus a .toml info file into `output_dir`.

        The file names contain the zero-padded metadata version. The info file records the metadata version,
        the current git sha, the current UTC time and the user name.

        Raises:
            subprocess.CalledProcessError: if pg_dump, gzip or `git rev-parse` exits with a non-zero status
        """
        md_version = metadata.VERSION
        dump_file = self.output_dir / f'pixeltable-v{md_version:03d}-test.dump.gz'
        _logger.info(f'Creating database dump at: {dump_file}')
        # use the pg_dump binary that ships inside the pgserver package
        pg_package_dir = os.path.dirname(pgserver.__file__)
        pg_dump_binary = f'{pg_package_dir}/pginstall/bin/pg_dump'
        _logger.info(f'Using pg_dump binary at: {pg_dump_binary}')
        with open(dump_file, 'wb') as dump:
            # equivalent of `pg_dump ... | gzip -9 > dump_file`
            pg_dump_process = subprocess.Popen(
                [pg_dump_binary, Env.get().db_url, '-U', 'postgres', '-Fc'],
                stdout=subprocess.PIPE
            )
            try:
                subprocess.run(
                    ["gzip", "-9"],
                    stdin=pg_dump_process.stdout,
                    stdout=dump,
                    check=True
                )
            finally:
                # always release the pipe and reap pg_dump, even if gzip failed
                pg_dump_process.stdout.close()
                pg_dump_returncode = pg_dump_process.wait()
        # gzip succeeds on empty input, so a failed pg_dump would otherwise go unnoticed
        if pg_dump_returncode != 0:
            raise subprocess.CalledProcessError(pg_dump_returncode, pg_dump_process.args)

        info_file = self.output_dir / f'pixeltable-v{md_version:03d}-test-info.toml'
        git_sha = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode('ascii').strip()
        user = os.environ.get('USER', os.environ.get('USERNAME'))
        info_dict = {'pixeltable-dump': {
            'metadata-version': md_version,
            'git-sha': git_sha,
            'datetime': datetime.datetime.utcnow(),
            'user': user
        }}
        with open(info_file, 'w') as info:
            toml.dump(info_dict, info)

    # TODO: Add additional features to the test DB dump (ideally it should exercise
    # every major pixeltable DB feature)
    def create_tables(self) -> None:
        """Create 'sample_table' (primary key 'c2'), add its extra columns and insert 100 rows."""
        schema = {
            'c1': StringType(nullable=False),
            'c1n': StringType(nullable=True),
            'c2': IntType(nullable=False),
            'c3': FloatType(nullable=False),
            'c4': BoolType(nullable=False),
            'c5': TimestampType(nullable=False),
            'c6': JsonType(nullable=False),
            'c7': JsonType(nullable=False),
        }
        t = self.cl.create_table('sample_table', schema, primary_key='c2')
        t.add_column(c8=[[1, 2, 3], [4, 5, 6]])

        # Add columns for .astype converters to ensure they're persisted properly
        t.add_column(c2_as_float=t.c2.astype(FloatType()))

        # Add columns for .apply
        t.add_column(c2_to_string=t.c2.apply(str))
        t.add_column(c6_to_string=t.c6.apply(json.dumps))
        t.add_column(c6_back_to_json=t.c6_to_string.apply(json.loads))

        num_rows = 100
        d1 = {
            'f1': 'test string 1',
            'f2': 1,
            'f3': 1.0,
            'f4': True,
            'f5': [1.0, 2.0, 3.0, 4.0],
            'f6': {
                'f7': 'test string 2',
                'f8': [1.0, 2.0, 3.0, 4.0],
            },
        }
        d2 = [d1, d1]

        c1_data = [f'test string {i}' for i in range(num_rows)]
        c2_data = list(range(num_rows))
        c3_data = [float(i) for i in range(num_rows)]
        c4_data = [bool(i % 2) for i in range(num_rows)]
        c5_data = [datetime.datetime.now()] * num_rows  # one timestamp, shared by all rows
        c6_data = [
            {
                'f1': f'test string {i}',
                'f2': i,
                'f3': float(i),
                'f4': bool(i % 2),
                'f5': [1.0, 2.0, 3.0, 4.0],
                'f6': {
                    'f7': 'test string 2',
                    'f8': [1.0, 2.0, 3.0, 4.0],
                },
            }
            for i in range(num_rows)
        ]

        c7_data = [d2] * num_rows
        rows = [
            {
                'c1': c1_data[i],
                'c1n': c1_data[i] if i % 10 != 0 else None,  # every 10th row is null
                'c2': c2_data[i],
                'c3': c3_data[i],
                'c4': c4_data[i],
                'c5': c5_data[i],
                'c6': c6_data[i],
                'c7': c7_data[i],
            }
            for i in range(num_rows)
        ]
        t.insert(rows)
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def main() -> None:
    """Create the sample tables and dump the database, using a Dumper with default settings."""
    _logger.info("Creating pixeltable test artifact.")
    artifact_dumper = Dumper()
    artifact_dumper.create_tables()
    artifact_dumper.dump_db()


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
import av
|
|
2
|
+
import PIL.Image
|
|
3
|
+
import PIL.ImageDraw
|
|
4
|
+
import PIL.ImageFont
|
|
5
|
+
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Optional
|
|
8
|
+
import tempfile
|
|
9
|
+
import math
|
|
10
|
+
|
|
11
|
+
def create_test_video(
    frame_count: int,
    frame_rate: float = 1.0,
    frame_width: int = 224,
    aspect_ratio: str = '16:9',
    frame_height: Optional[int] = None,
    output_path: Optional[Path] = None,
    font_file: str = '/Library/Fonts/Arial Unicode.ttf',
) -> Path:
    """
    Creates an .mp4 video file such as the ones in /tests/data/test_videos
    The video contains a frame number in each frame (for visual sanity check).

    Args:
        frame_count: Number of frames to create
        frame_rate: Frame rate of the video
        frame_width (int): Width in pixels of the video frame. Note: cost of decoding increases dramatically
            with frame width * frame height.
        aspect_ratio: Aspect ratio (width/height) of the video frames string of form 'width:height'
        frame_height: Height of the video frame, if given, aspect_ratio is ignored
        output_path: Path to save the video file; if None, a new temporary .mp4 file is created
        font_file: Path to the font file used for text.

    Returns:
        Path of the video file that was written.
    """

    if output_path is None:
        # only the name is needed (av.open() writes the file itself), so close the handle right away
        with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as tmp_file:
            output_path = Path(tmp_file.name)

    parts = [int(p) for p in aspect_ratio.split(':')]
    assert len(parts) == 2
    # kept in its own variable instead of rebinding the str parameter `aspect_ratio` to a float
    width_over_height = parts[0] / parts[1]

    if frame_height is None:
        frame_height = math.ceil(frame_width / width_over_height)

    frame_size = (frame_width, frame_height)

    font_size = min(frame_height, frame_width) // 4
    font = PIL.ImageFont.truetype(font=font_file, size=font_size)
    font_fill = 0xFFFFFF  # white
    frame_color = 0xFFFFFF - font_fill  # black
    # Create a video container
    container = av.open(str(output_path), mode='w')
    try:
        # Add a video stream
        stream = container.add_stream('h264', rate=frame_rate)
        stream.width, stream.height = frame_size
        stream.pix_fmt = 'yuv420p'

        for frame_number in range(frame_count):
            # Create an image with the frame number centered in it
            image = PIL.Image.new('RGB', frame_size, color=frame_color)
            draw = PIL.ImageDraw.Draw(image)
            text = str(frame_number)
            # the bbox is anchored at (0, 0), so its right/bottom coordinates are used as the text extent
            _, _, text_width, text_height = draw.textbbox((0, 0), text, font=font)
            text_position = ((frame_size[0] - text_width) // 2, (frame_size[1] - text_height) // 2)
            draw.text(text_position, text, font=font, fill=font_fill)

            # Convert the PIL image to an AVFrame
            frame = av.VideoFrame.from_image(image)

            # Encode and write the frame
            for packet in stream.encode(frame):
                container.mux(packet)

        # Flush the encoder
        for packet in stream.encode():
            container.mux(packet)
    finally:
        # close the container even if encoding fails, so the output file handle is not leaked
        container.close()
    return output_path
|