pixeltable 0.1.0__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (147) hide show
  1. pixeltable/__init__.py +34 -6
  2. pixeltable/catalog/__init__.py +13 -0
  3. pixeltable/catalog/catalog.py +159 -0
  4. pixeltable/catalog/column.py +200 -0
  5. pixeltable/catalog/dir.py +32 -0
  6. pixeltable/catalog/globals.py +33 -0
  7. pixeltable/catalog/insertable_table.py +191 -0
  8. pixeltable/catalog/named_function.py +36 -0
  9. pixeltable/catalog/path.py +58 -0
  10. pixeltable/catalog/path_dict.py +139 -0
  11. pixeltable/catalog/schema_object.py +39 -0
  12. pixeltable/catalog/table.py +581 -0
  13. pixeltable/catalog/table_version.py +749 -0
  14. pixeltable/catalog/table_version_path.py +133 -0
  15. pixeltable/catalog/view.py +203 -0
  16. pixeltable/client.py +590 -30
  17. pixeltable/dataframe.py +540 -349
  18. pixeltable/env.py +359 -45
  19. pixeltable/exceptions.py +12 -21
  20. pixeltable/exec/__init__.py +9 -0
  21. pixeltable/exec/aggregation_node.py +78 -0
  22. pixeltable/exec/cache_prefetch_node.py +116 -0
  23. pixeltable/exec/component_iteration_node.py +79 -0
  24. pixeltable/exec/data_row_batch.py +95 -0
  25. pixeltable/exec/exec_context.py +22 -0
  26. pixeltable/exec/exec_node.py +61 -0
  27. pixeltable/exec/expr_eval_node.py +217 -0
  28. pixeltable/exec/in_memory_data_node.py +69 -0
  29. pixeltable/exec/media_validation_node.py +43 -0
  30. pixeltable/exec/sql_scan_node.py +225 -0
  31. pixeltable/exprs/__init__.py +24 -0
  32. pixeltable/exprs/arithmetic_expr.py +102 -0
  33. pixeltable/exprs/array_slice.py +71 -0
  34. pixeltable/exprs/column_property_ref.py +77 -0
  35. pixeltable/exprs/column_ref.py +105 -0
  36. pixeltable/exprs/comparison.py +77 -0
  37. pixeltable/exprs/compound_predicate.py +98 -0
  38. pixeltable/exprs/data_row.py +195 -0
  39. pixeltable/exprs/expr.py +586 -0
  40. pixeltable/exprs/expr_set.py +39 -0
  41. pixeltable/exprs/function_call.py +380 -0
  42. pixeltable/exprs/globals.py +69 -0
  43. pixeltable/exprs/image_member_access.py +115 -0
  44. pixeltable/exprs/image_similarity_predicate.py +58 -0
  45. pixeltable/exprs/inline_array.py +107 -0
  46. pixeltable/exprs/inline_dict.py +101 -0
  47. pixeltable/exprs/is_null.py +38 -0
  48. pixeltable/exprs/json_mapper.py +121 -0
  49. pixeltable/exprs/json_path.py +159 -0
  50. pixeltable/exprs/literal.py +54 -0
  51. pixeltable/exprs/object_ref.py +41 -0
  52. pixeltable/exprs/predicate.py +44 -0
  53. pixeltable/exprs/row_builder.py +355 -0
  54. pixeltable/exprs/rowid_ref.py +94 -0
  55. pixeltable/exprs/type_cast.py +53 -0
  56. pixeltable/exprs/variable.py +45 -0
  57. pixeltable/func/__init__.py +9 -0
  58. pixeltable/func/aggregate_function.py +194 -0
  59. pixeltable/func/batched_function.py +53 -0
  60. pixeltable/func/callable_function.py +69 -0
  61. pixeltable/func/expr_template_function.py +82 -0
  62. pixeltable/func/function.py +110 -0
  63. pixeltable/func/function_registry.py +227 -0
  64. pixeltable/func/globals.py +36 -0
  65. pixeltable/func/nos_function.py +202 -0
  66. pixeltable/func/signature.py +166 -0
  67. pixeltable/func/udf.py +163 -0
  68. pixeltable/functions/__init__.py +52 -103
  69. pixeltable/functions/eval.py +216 -0
  70. pixeltable/functions/fireworks.py +34 -0
  71. pixeltable/functions/huggingface.py +120 -0
  72. pixeltable/functions/image.py +16 -0
  73. pixeltable/functions/openai.py +256 -0
  74. pixeltable/functions/pil/image.py +148 -7
  75. pixeltable/functions/string.py +13 -0
  76. pixeltable/functions/together.py +122 -0
  77. pixeltable/functions/util.py +41 -0
  78. pixeltable/functions/video.py +62 -0
  79. pixeltable/iterators/__init__.py +3 -0
  80. pixeltable/iterators/base.py +48 -0
  81. pixeltable/iterators/document.py +311 -0
  82. pixeltable/iterators/video.py +89 -0
  83. pixeltable/metadata/__init__.py +54 -0
  84. pixeltable/metadata/converters/convert_10.py +18 -0
  85. pixeltable/metadata/schema.py +211 -0
  86. pixeltable/plan.py +656 -0
  87. pixeltable/store.py +418 -182
  88. pixeltable/tests/conftest.py +146 -88
  89. pixeltable/tests/functions/test_fireworks.py +42 -0
  90. pixeltable/tests/functions/test_functions.py +60 -0
  91. pixeltable/tests/functions/test_huggingface.py +158 -0
  92. pixeltable/tests/functions/test_openai.py +152 -0
  93. pixeltable/tests/functions/test_together.py +111 -0
  94. pixeltable/tests/test_audio.py +65 -0
  95. pixeltable/tests/test_catalog.py +27 -0
  96. pixeltable/tests/test_client.py +14 -14
  97. pixeltable/tests/test_component_view.py +370 -0
  98. pixeltable/tests/test_dataframe.py +439 -0
  99. pixeltable/tests/test_dirs.py +78 -62
  100. pixeltable/tests/test_document.py +120 -0
  101. pixeltable/tests/test_exprs.py +592 -135
  102. pixeltable/tests/test_function.py +297 -67
  103. pixeltable/tests/test_migration.py +43 -0
  104. pixeltable/tests/test_nos.py +54 -0
  105. pixeltable/tests/test_snapshot.py +208 -0
  106. pixeltable/tests/test_table.py +1195 -263
  107. pixeltable/tests/test_transactional_directory.py +42 -0
  108. pixeltable/tests/test_types.py +5 -11
  109. pixeltable/tests/test_video.py +151 -34
  110. pixeltable/tests/test_view.py +530 -0
  111. pixeltable/tests/utils.py +320 -45
  112. pixeltable/tool/create_test_db_dump.py +149 -0
  113. pixeltable/tool/create_test_video.py +81 -0
  114. pixeltable/type_system.py +445 -124
  115. pixeltable/utils/__init__.py +17 -46
  116. pixeltable/utils/arrow.py +98 -0
  117. pixeltable/utils/clip.py +12 -15
  118. pixeltable/utils/coco.py +136 -0
  119. pixeltable/utils/documents.py +39 -0
  120. pixeltable/utils/filecache.py +195 -0
  121. pixeltable/utils/help.py +11 -0
  122. pixeltable/utils/hf_datasets.py +157 -0
  123. pixeltable/utils/media_store.py +76 -0
  124. pixeltable/utils/parquet.py +167 -0
  125. pixeltable/utils/pytorch.py +91 -0
  126. pixeltable/utils/s3.py +13 -0
  127. pixeltable/utils/sql.py +17 -0
  128. pixeltable/utils/transactional_directory.py +35 -0
  129. pixeltable-0.2.4.dist-info/LICENSE +18 -0
  130. pixeltable-0.2.4.dist-info/METADATA +127 -0
  131. pixeltable-0.2.4.dist-info/RECORD +132 -0
  132. {pixeltable-0.1.0.dist-info → pixeltable-0.2.4.dist-info}/WHEEL +1 -1
  133. pixeltable/catalog.py +0 -1421
  134. pixeltable/exprs.py +0 -1745
  135. pixeltable/function.py +0 -269
  136. pixeltable/functions/clip.py +0 -10
  137. pixeltable/functions/pil/__init__.py +0 -23
  138. pixeltable/functions/tf.py +0 -21
  139. pixeltable/index.py +0 -57
  140. pixeltable/tests/test_dict.py +0 -24
  141. pixeltable/tests/test_functions.py +0 -11
  142. pixeltable/tests/test_tf.py +0 -69
  143. pixeltable/tf.py +0 -33
  144. pixeltable/utils/tf.py +0 -33
  145. pixeltable/utils/video.py +0 -32
  146. pixeltable-0.1.0.dist-info/METADATA +0 -34
  147. pixeltable-0.1.0.dist-info/RECORD +0 -36
pixeltable/tests/utils.py CHANGED
@@ -1,16 +1,34 @@
1
1
  import datetime
2
2
  import glob
3
+ import json
3
4
  import os
5
+ from collections import namedtuple
4
6
  from pathlib import Path
5
- from typing import Dict, Any, List
7
+ from typing import Any, Dict, List, Optional, Set
6
8
 
7
9
  import numpy as np
8
10
  import pandas as pd
11
+ import pytest
9
12
 
10
- import pixeltable as pt
13
+ import pixeltable as pxt
14
+ import pixeltable.type_system as ts
11
15
  from pixeltable import catalog
12
- from pixeltable.type_system import ColumnType, StringType, IntType, FloatType, BoolType, TimestampType
13
- from pixeltable.function import Function
16
+ from pixeltable.catalog.globals import UpdateStatus
17
+ from pixeltable.dataframe import DataFrameResultSet
18
+ from pixeltable.env import Env
19
+ from pixeltable.type_system import (
20
+ ArrayType,
21
+ BoolType,
22
+ ColumnType,
23
+ FloatType,
24
+ ImageType,
25
+ IntType,
26
+ JsonType,
27
+ StringType,
28
+ TimestampType,
29
+ VideoType,
30
+ )
31
+
14
32
 
15
33
  def make_default_type(t: ColumnType.Type) -> ColumnType:
16
34
  if t == ColumnType.Type.STRING:
@@ -25,14 +43,19 @@ def make_default_type(t: ColumnType.Type) -> ColumnType:
25
43
  return TimestampType()
26
44
  assert False
27
45
 
28
def make_tbl(cl: pxt.Client, name: str = 'test', col_names: Optional[List[str]] = None) -> catalog.InsertableTable:
    """Create table `name` through `cl` with one column per entry in `col_names`.

    `col_names` defaults to ['c1']. The column at position i gets the type
    make_default_type(ColumnType.Type(i % 5)).
    """
    names = ['c1'] if col_names is None else col_names
    schema: Dict[str, ts.ColumnType] = {
        f'{col_name}': make_default_type(ColumnType.Type(position % 5))
        for position, col_name in enumerate(names)
    }
    return cl.create_table(name, schema)
33
53
 
34
- def create_table_data(t: catalog.Table, col_names: List[str] = [], num_rows: int = 10) -> pd.DataFrame:
54
+ def create_table_data(t: catalog.Table, col_names: Optional[List[str]] = None, num_rows: int = 10) -> List[Dict[str, Any]]:
55
+ if col_names is None:
56
+ col_names = []
35
57
  data: Dict[str, Any] = {}
58
+
36
59
  sample_dict = {
37
60
  'detections': [{
38
61
  'id': '637e8e073b28441a453564cf',
@@ -70,38 +93,138 @@ def create_table_data(t: catalog.Table, col_names: List[str] = [], num_rows: int
70
93
  }
71
94
 
72
95
  if len(col_names) == 0:
73
- col_names = [c.name for c in t.columns]
96
+ col_names = [c.name for c in t.columns() if not c.is_computed]
74
97
 
98
+ col_types = t.column_types()
75
99
  for col_name in col_names:
76
- col = t.cols_by_name[col_name]
100
+ col_type = col_types[col_name]
77
101
  col_data: Any = None
78
- if col.col_type.is_string_type():
102
+ if col_type.is_string_type():
79
103
  col_data = ['test string'] * num_rows
80
- if col.col_type.is_int_type():
81
- col_data = np.random.randint(0, 100, size=num_rows)
82
- if col.col_type.is_float_type():
83
- col_data = np.random.random(size=num_rows) * 100
84
- if col.col_type.is_bool_type():
104
+ if col_type.is_int_type():
105
+ col_data = np.random.randint(0, 100, size=num_rows).tolist()
106
+ if col_type.is_float_type():
107
+ col_data = (np.random.random(size=num_rows) * 100).tolist()
108
+ if col_type.is_bool_type():
85
109
  col_data = np.random.randint(0, 2, size=num_rows)
86
110
  col_data = [False if i == 0 else True for i in col_data]
87
- if col.col_type.is_timestamp_type():
88
- col_data = datetime.datetime.now()
89
- if col.col_type.is_json_type():
111
+ if col_type.is_timestamp_type():
112
+ col_data = [datetime.datetime.now()] * num_rows
113
+ if col_type.is_json_type():
90
114
  col_data = [sample_dict] * num_rows
91
- # TODO: implement this
92
- assert not col.col_type.is_image_type()
93
- assert not col.col_type.is_array_type()
94
- data[col.name] = col_data
95
- return pd.DataFrame(data=data)
115
+ if col_type.is_array_type():
116
+ col_data = [np.ones(col_type.shape, dtype=col_type.numpy_dtype()) for i in range(num_rows)]
117
+ if col_type.is_image_type():
118
+ image_path = get_image_files()[0]
119
+ col_data = [image_path for i in range(num_rows)]
120
+ if col_type.is_video_type():
121
+ video_path = get_video_files()[0]
122
+ col_data = [video_path for i in range(num_rows)]
123
+ data[col_name] = col_data
124
+ rows = [{col_name: data[col_name][i] for col_name in col_names} for i in range(num_rows)]
125
+ return rows
126
+
127
def create_test_tbl(client: pxt.Client, name: str = 'test_tbl') -> catalog.Table:
    """Create table `name` (columns c1..c7 plus c8), insert 100 generated rows and return the table.

    'c2' is declared as the primary key and holds the row index; 'c1n' is None for every 10th row.
    """
    schema = {
        'c1': StringType(nullable=False),
        'c1n': StringType(nullable=True),
        'c2': IntType(nullable=False),
        'c3': FloatType(nullable=False),
        'c4': BoolType(nullable=False),
        'c5': TimestampType(nullable=False),
        'c6': JsonType(nullable=False),
        'c7': JsonType(nullable=False),
    }
    tbl = client.create_table(name, schema, primary_key='c2')
    tbl.add_column(c8=[[1, 2, 3], [4, 5, 6]])

    num_rows = 100
    nested = {
        'f1': 'test string 1',
        'f2': 1,
        'f3': 1.0,
        'f4': True,
        'f5': [1.0, 2.0, 3.0, 4.0],
        'f6': {
            'f7': 'test string 2',
            'f8': [1.0, 2.0, 3.0, 4.0],
        },
    }
    # one list object, shared by the 'c7' value of every row
    nested_pair = [nested, nested]
    # evaluated once, so every row carries the identical timestamp
    timestamp = datetime.datetime.now()

    rows = []
    for i in range(num_rows):
        label = f'test string {i}'
        rows.append({
            'c1': label,
            'c1n': None if i % 10 == 0 else label,
            'c2': i,
            'c3': float(i),
            'c4': bool(i % 2),
            'c5': timestamp,
            # a fresh json dict per row
            'c6': {
                'f1': label,
                'f2': i,
                'f3': float(i),
                'f4': bool(i % 2),
                'f5': [1.0, 2.0, 3.0, 4.0],
                'f6': {
                    'f7': 'test string 2',
                    'f8': [1.0, 2.0, 3.0, 4.0],
                },
            },
            'c7': nested_pair,
        })
    tbl.insert(rows)
    return tbl
191
+
192
def create_all_datatypes_tbl(test_client: pxt.Client) -> catalog.Table:
    """Create and populate the table 'all_datatype_tbl', which has one nullable column per datatype.

    11 rows are generated with create_table_data(); 'row_id' is then overwritten with the
    row's position (0..10) so that tests can select individual rows.
    """
    schema = {
        'row_id': IntType(nullable=False),  # used for row selection
        'c_array': ArrayType(shape=(10,), dtype=FloatType(), nullable=True),
        'c_bool': BoolType(nullable=True),
        'c_float': FloatType(nullable=True),
        'c_image': ImageType(nullable=True),
        'c_int': IntType(nullable=True),
        'c_json': JsonType(nullable=True),
        'c_string': StringType(nullable=True),
        'c_timestamp': TimestampType(nullable=True),
        'c_video': VideoType(nullable=True),
    }
    table = test_client.create_table('all_datatype_tbl', schema)
    generated_rows = create_table_data(table, num_rows=11)

    for position, row in enumerate(generated_rows):
        row['row_id'] = position

    table.insert(generated_rows)
    return table
215
+
216
+ def read_data_file(dir_name: str, file_name: str, path_col_names: Optional[List[str]] = None) -> List[Dict[str, Any]]:
98
217
  """
99
218
  Locate dir_name, create df out of file_name.
100
- transform columns 'file_name' to column 'file_path' with absolute paths
101
219
  path_col_names: col names in csv file that contain file names; those will be converted to absolute paths
102
220
  by adding the path to 'file_name' as a prefix.
221
+ Returns:
222
+ list of rows; each row is a dict mapping column name to value
103
223
  """
104
- glob_result = glob.glob(f'{os.getcwd()}/**/{dir_name}', recursive=True)
224
+ if path_col_names is None:
225
+ path_col_names = []
226
+ tests_dir = os.path.dirname(__file__) # search with respect to tests/ dir
227
+ glob_result = glob.glob(f'{tests_dir}/**/{dir_name}', recursive=True)
105
228
  assert len(glob_result) == 1, f'Could not find {dir_name}'
106
229
  abs_path = Path(glob_result[0])
107
230
  data_file_path = abs_path / file_name
@@ -110,24 +233,176 @@ def read_data_file(dir_name: str, file_name: str, path_col_names: List[str] = []
110
233
  for col_name in path_col_names:
111
234
  assert col_name in df.columns
112
235
  df[col_name] = df.apply(lambda r: str(abs_path / r[col_name]), axis=1)
113
- return df
236
+ return df.to_dict(orient='records')
237
+
238
def get_video_files(include_bad_video: bool = False) -> List[str]:
    """Return the paths under any 'videos' directory below this file's directory that are used for tests.

    Only paths containing 'half_res' are returned; paths containing 'bad_video' are returned as
    well when `include_bad_video` is True.
    """
    tests_dir = os.path.dirname(__file__)  # search with respect to tests/ dir
    selected: List[str] = []
    for path in glob.glob(f'{tests_dir}/**/videos/*', recursive=True):
        is_bad = 'bad_video' in path
        if is_bad and not include_bad_video:
            continue
        if is_bad or 'half_res' in path:
            selected.append(path)
    return selected
114
246
 
115
- def get_video_files() -> List[str]:
116
- glob_result = glob.glob(f'{os.getcwd()}/**/videos/*.mp4', recursive=True)
247
def get_test_video_files() -> List[str]:
    """Return every path found in a 'test_videos' directory below this file's directory."""
    base_dir = os.path.dirname(__file__)  # search with respect to tests/ dir
    return glob.glob(f'{base_dir}/**/test_videos/*', recursive=True)
118
251
 
252
def get_image_files(include_bad_image: bool = False) -> List[str]:
    """Return the paths found in an 'imagenette2-160' directory below this file's directory.

    Paths containing 'bad_image' are dropped unless `include_bad_image` is True.
    """
    base_dir = os.path.dirname(__file__)  # search with respect to tests/ dir
    matches = glob.glob(f'{base_dir}/**/imagenette2-160/*', recursive=True)
    if include_bad_image:
        return matches
    return [path for path in matches if 'bad_image' not in path]
258
+
259
def get_audio_files(include_bad_audio: bool = False) -> List[str]:
    """Return the paths found in an 'audio' directory below this file's directory.

    Paths containing 'bad_audio' are dropped unless `include_bad_audio` is True.
    """
    base_dir = os.path.dirname(__file__)
    matches = glob.glob(f'{base_dir}/**/audio/*', recursive=True)
    if include_bad_audio:
        return matches
    return [path for path in matches if 'bad_audio' not in path]
265
+
266
def get_documents() -> List[str]:
    """Return the paths found in a 'documents' directory below this file's directory, minus '.pdf' files."""
    base_dir = os.path.dirname(__file__)
    candidates = glob.glob(f'{base_dir}/**/documents/*', recursive=True)
    # for now, we can only handle .html and .md
    usable: List[str] = []
    for path in candidates:
        if path.endswith('.pdf'):
            continue
        usable.append(path)
    return usable
270
+
271
def get_sentences(n: int = 100) -> List[str]:
    """Return the 'question' text of the first `n` entries of jeopardy.json, with all single quotes removed.

    The file is the first 'jeopardy.json' found below this file's directory.
    """
    base_dir = os.path.dirname(__file__)
    matches = glob.glob(f'{base_dir}/**/jeopardy.json', recursive=True)
    with open(matches[0], 'r', encoding='utf8') as fp:
        entries = json.load(fp)
    # this dataset contains \' around the questions
    sentences: List[str] = []
    for entry in entries[:n]:
        sentences.append(entry['question'].replace("'", ''))
    return sentences
278
+
279
+
280
def assert_resultset_eq(r1: DataFrameResultSet, r2: DataFrameResultSet) -> None:
    """Assert that two result sets hold the same values, compared column by column.

    Columns are matched by position; their names are ignored. float64 columns are compared
    with np.allclose(), everything else with Series.equals().
    """
    assert len(r1) == len(r2)
    num_cols = len(r1.column_names())
    assert num_cols == len(r2.column_names())  # we don't care about the actual column names
    r1_pd = r1.to_pandas()
    r2_pd = r2.to_pandas()
    for i in range(num_cols):
        # only compare column values
        s1 = r1_pd.iloc[:, i]
        s2 = r2_pd.iloc[:, i]
        if s1.dtype == np.float64:
            # equal_nan=True: NaNs in matching positions count as equal, which is what
            # Series.equals() does in the other branch; without it, a float column containing
            # a NaN would not even compare equal to itself
            assert np.allclose(s1, s2, equal_nan=True)
        else:
            assert s1.equals(s2)
293
+
294
+
295
def skip_test_if_not_installed(package) -> None:
    """Skip the running test via pytest.skip() unless Env reports `package` as installed."""
    if Env.get().is_installed_package(package):
        return
    pytest.skip(f'Package `{package}` is not installed.')
298
+
299
+
300
def validate_update_status(status: UpdateStatus, expected_rows: Optional[int] = None) -> None:
    """Assert that `status` reports no exceptions and, if `expected_rows` is given, that many rows."""
    assert status.num_excs == 0
    if expected_rows is None:
        return
    assert status.num_rows == expected_rows
304
+
305
+
306
def make_test_arrow_table(output_path: Path) -> None:
    """Write a 5-row parquet file named 'test.parquet' into the directory `output_path`.

    The table has an id column plus int64/int32/float32/string/boolean/timestamp columns whose
    last value is None, and a fixed-shape float32 tensor column without nulls.
    """
    import pyarrow as pa
    # pyarrow.parquet is a submodule that `import pyarrow` does not load by itself, so it is
    # imported explicitly instead of being reached through `pa.parquet`
    import pyarrow.parquet as pq

    value_dict = {
        'c_id': [1, 2, 3, 4, 5],
        'c_int64': [-10, -20, -30, -40, None],
        'c_int32': [-1, -2, -3, -4, None],
        'c_float32': [1.1, 2.2, 3.3, 4.4, None],
        'c_string': ['aaa', 'bbb', 'ccc', 'ddd', None],
        'c_boolean': [True, False, True, False, None],
        'c_timestamp': [
            datetime.datetime(2012, 1, 1, 12, 0, 0, 25),
            datetime.datetime(2012, 1, 2, 12, 0, 0, 25),
            datetime.datetime(2012, 1, 3, 12, 0, 0, 25),
            datetime.datetime(2012, 1, 4, 12, 0, 0, 25),
            None,
        ],
        # The pyarrow fixed_shape_tensor type does not support NULLs (currently can write them but not read them)
        # So, no nulls in this column
        'c_array_float32': [
            [1.0, 2.0],
            [10.0, 20.0],
            [100.0, 200.0],
            [1000.0, 2000.0],
            [10000.0, 20000.0],
        ],
    }

    # the tensor shape is taken from the length of the first array value
    arr_size = len(value_dict['c_array_float32'][0])
    tensor_type = pa.fixed_shape_tensor(pa.float32(), (arr_size,))

    schema = pa.schema(
        [
            ('c_id', pa.int32()),
            ('c_int64', pa.int64()),
            ('c_int32', pa.int32()),
            ('c_float32', pa.float32()),
            ('c_string', pa.string()),
            ('c_boolean', pa.bool_()),
            ('c_timestamp', pa.timestamp('us')),
            ('c_array_float32', tensor_type),
        ]
    )

    test_table = pa.Table.from_pydict(value_dict, schema=schema)
    pq.write_table(test_table, str(output_path / 'test.parquet'))
364
+
365
+
366
def assert_hf_dataset_equal(hf_dataset: 'datasets.Dataset', df: pxt.DataFrame, split_column_name: str) -> None:
    """Assert that `df` has the same row count and columns as `hf_dataset` and that each of its rows occurs there.

    `df` is expected to carry one extra column, `split_column_name`, whose value must be contained
    in the dataset's split name. ClassLabel values are encoded to their index and Sequence values
    are converted to tuples of floats before the lookup.
    """
    import datasets

    assert df.count() == hf_dataset.num_rows
    assert set(df.get_column_names()) == (set(hf_dataset.features.keys()) | {split_column_name})

    # immutable so we can use it as in a set
    DatasetTuple = namedtuple('DatasetTuple', ' '.join(hf_dataset.features.keys()))
    expected_rows: Set[DatasetTuple] = set()
    for example in hf_dataset:
        # lists are not hashable, so they are stored as tuples
        hashable = {
            key: tuple(example[key]) if isinstance(example[key], list) else example[key]
            for key in example
        }
        expected_rows.add(DatasetTuple(**hashable))

    for row in df.collect():
        assert row[split_column_name] in hf_dataset.split._name

        encoded = {}
        for column_name, value in row.items():
            if column_name == split_column_name:
                continue
            feature_type = hf_dataset.features[column_name]
            if isinstance(feature_type, datasets.ClassLabel):
                assert value in feature_type.names
                # must use the index of the class label as the value to
                # compare with dataset iteration output.
                value = feature_type.encode_example(value)
            elif isinstance(feature_type, datasets.Sequence):
                assert feature_type.feature.dtype == 'float32', 'may need to add more types'
                value = tuple(float(x) for x in value)

            encoded[column_name] = value

        assert DatasetTuple(**encoded) in expected_rows
405
+
119
406
 
120
- class SumAggregator:
121
- def __init__(self):
122
- self.sum = 0
123
- @classmethod
124
- def make_aggregator(cls) -> 'SumAggregator':
125
- return cls()
126
- def update(self, val: int) -> None:
127
- self.sum += val
128
- def value(self) -> int:
129
- return self.sum
130
-
131
- sum_uda = Function(
132
- IntType(), [IntType()],
133
- init_fn=SumAggregator.make_aggregator, update_fn=SumAggregator.update, value_fn=SumAggregator.value)
407
# URL of a sample image in the pixeltable GitHub repository
SAMPLE_IMAGE_URL = (
    'https://raw.githubusercontent.com/pixeltable/pixeltable/master/docs/source/data/images/000000000009.jpg'
)
@@ -0,0 +1,149 @@
1
+ import datetime
2
+ import json
3
+ import logging
4
+ import os
5
+ import pathlib
6
+ import subprocess
7
+
8
+ import pgserver
9
+ import toml
10
+
11
+ import pixeltable as pxt
12
+ import pixeltable.metadata as metadata
13
+ from pixeltable.env import Env
14
+ from pixeltable.type_system import \
15
+ StringType, IntType, FloatType, BoolType, TimestampType, JsonType
16
+
17
+ _logger = logging.getLogger('pixeltable')
18
+
19
+
20
class Dumper:
    """Creates a sample Pixeltable database in a scratch home directory and dumps it with pg_dump."""

    def __init__(self, output_dir='target', db_name='pxtdump') -> None:
        """Redirect the Pixeltable environment to a scratch home below `output_dir` and re-initialize the db.

        Args:
            output_dir: directory that receives the dump artifacts; '.pixeltable' is created inside it
            db_name: value exported as PIXELTABLE_DB
        """
        self.output_dir = pathlib.Path(output_dir)
        # resolved before PIXELTABLE_HOME is overwritten below: config.yaml and pgdata are still
        # taken from the previously configured (or default) home
        shared_home = pathlib.Path(os.environ.get('PIXELTABLE_HOME', '~/.pixeltable')).expanduser()
        mock_home_dir = self.output_dir / '.pixeltable'
        mock_home_dir.mkdir(parents=True, exist_ok=True)
        os.environ['PIXELTABLE_HOME'] = str(mock_home_dir)
        os.environ['PIXELTABLE_CONFIG'] = str(shared_home / 'config.yaml')
        os.environ['PIXELTABLE_DB'] = db_name
        os.environ['PIXELTABLE_PGDATA'] = str(shared_home / 'pgdata')

        Env.get().set_up(reinit_db=True)
        self.cl = pxt.Client()
        self.cl.logging(level=logging.DEBUG, to_stdout=True)

    def dump_db(self) -> None:
        """Write a gzip-compressed pg_dump of the database plus a TOML info file into `output_dir`.

        Both file names contain the metadata version. The info file records the metadata version,
        the current git SHA, a UTC timestamp and the user name.

        Raises:
            subprocess.CalledProcessError: if gzip, pg_dump or git exits with a non-zero status
        """
        md_version = metadata.VERSION
        dump_file = self.output_dir / f'pixeltable-v{md_version:03d}-test.dump.gz'
        _logger.info(f'Creating database dump at: {dump_file}')
        pg_package_dir = os.path.dirname(pgserver.__file__)
        pg_dump_binary = f'{pg_package_dir}/pginstall/bin/pg_dump'
        _logger.info(f'Using pg_dump binary at: {pg_dump_binary}')
        with open(dump_file, 'wb') as dump:
            pg_dump_process = subprocess.Popen(
                [pg_dump_binary, Env.get().db_url, '-U', 'postgres', '-Fc'],
                stdout=subprocess.PIPE
            )
            try:
                subprocess.run(
                    ["gzip", "-9"],
                    stdin=pg_dump_process.stdout,
                    stdout=dump,
                    check=True
                )
            finally:
                # close our end of the pipe (lets pg_dump terminate if gzip died early) and
                # reap the child so that it doesn't linger as a zombie
                pg_dump_process.stdout.close()
                pg_dump_returncode = pg_dump_process.wait()
            # a failing pg_dump must not pass as a valid dump
            if pg_dump_returncode != 0:
                raise subprocess.CalledProcessError(pg_dump_returncode, pg_dump_process.args)
        info_file = self.output_dir / f'pixeltable-v{md_version:03d}-test-info.toml'
        git_sha = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode('ascii').strip()
        user = os.environ.get('USER', os.environ.get('USERNAME'))
        info_dict = {'pixeltable-dump': {
            'metadata-version': md_version,
            'git-sha': git_sha,
            'datetime': datetime.datetime.utcnow(),
            'user': user
        }}
        with open(info_file, 'w') as info:
            toml.dump(info_dict, info)

    # TODO: Add additional features to the test DB dump (ideally it should exercise
    # every major pixeltable DB feature)
    def create_tables(self) -> None:
        """Create 'sample_table' (columns c1..c7, c8 and several derived columns) and insert 100 rows."""
        schema = {
            'c1': StringType(nullable=False),
            'c1n': StringType(nullable=True),
            'c2': IntType(nullable=False),
            'c3': FloatType(nullable=False),
            'c4': BoolType(nullable=False),
            'c5': TimestampType(nullable=False),
            'c6': JsonType(nullable=False),
            'c7': JsonType(nullable=False),
        }
        t = self.cl.create_table('sample_table', schema, primary_key='c2')
        t.add_column(c8=[[1, 2, 3], [4, 5, 6]])

        # Add columns for .astype converters to ensure they're persisted properly
        t.add_column(c2_as_float=t.c2.astype(FloatType()))

        # Add columns for .apply
        t.add_column(c2_to_string=t.c2.apply(str))
        t.add_column(c6_to_string=t.c6.apply(json.dumps))
        t.add_column(c6_back_to_json=t.c6_to_string.apply(json.loads))

        num_rows = 100
        d1 = {
            'f1': 'test string 1',
            'f2': 1,
            'f3': 1.0,
            'f4': True,
            'f5': [1.0, 2.0, 3.0, 4.0],
            'f6': {
                'f7': 'test string 2',
                'f8': [1.0, 2.0, 3.0, 4.0],
            },
        }
        # one list object, shared by the 'c7' value of every row
        d2 = [d1, d1]
        # evaluated once, so every row carries the identical timestamp
        timestamp = datetime.datetime.now()

        rows = []
        for i in range(num_rows):
            c1_value = f'test string {i}'
            rows.append({
                'c1': c1_value,
                'c1n': c1_value if i % 10 != 0 else None,
                'c2': i,
                'c3': float(i),
                'c4': bool(i % 2),
                'c5': timestamp,
                # a fresh json dict per row
                'c6': {
                    'f1': f'test string {i}',
                    'f2': i,
                    'f3': float(i),
                    'f4': bool(i % 2),
                    'f5': [1.0, 2.0, 3.0, 4.0],
                    'f6': {
                        'f7': 'test string 2',
                        'f8': [1.0, 2.0, 3.0, 4.0],
                    },
                },
                'c7': d2,
            })
        t.insert(rows)
139
+
140
+
141
def main() -> None:
    """Script entry point: create the sample tables, then dump the database."""
    _logger.info("Creating pixeltable test artifact.")
    artifact_dumper = Dumper()
    artifact_dumper.create_tables()
    artifact_dumper.dump_db()


if __name__ == "__main__":
    main()
@@ -0,0 +1,81 @@
1
+ import av
2
+ import PIL.Image
3
+ import PIL.ImageDraw
4
+ import PIL.ImageFont
5
+
6
+ from pathlib import Path
7
+ from typing import Optional
8
+ import tempfile
9
+ import math
10
+
11
def create_test_video(
    frame_count: int,
    frame_rate: float = 1.0,
    frame_width: int = 224,
    aspect_ratio: str = '16:9',
    frame_height: Optional[int] = None,
    output_path: Optional[Path] = None,
    font_file: str = '/Library/Fonts/Arial Unicode.ttf',
) -> Path:
    """
    Creates an .mp4 video file such as the ones in /tests/data/test_videos
    The video contains a frame number in each frame (for visual sanity check).

    Args:
        frame_count: Number of frames to create
        frame_rate: Frame rate of the video
        frame_width (int): Width in pixels of the video frame. Note: cost of decoding increases dramatically
            with frame width * frame height.
        aspect_ratio: Aspect ratio (width/height) of the video frames string of form 'width:height'
        frame_height: Height of the video frame, if given, aspect_ratio is ignored
        output_path: Path to save the video file; if None, a new temporary '.mp4' file is used
        font_file: Path to the font file used for text.

    Returns:
        Path of the written video file.
    """

    if output_path is None:
        # only the unique file name is needed: close the handle right away instead of leaking it
        # (delete=False keeps the file itself around)
        with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as tmp_file:
            output_path = Path(tmp_file.name)

    # parsed (and validated) even when frame_height is given explicitly
    parts = [int(p) for p in aspect_ratio.split(':')]
    assert len(parts) == 2
    width_to_height = parts[0] / parts[1]

    if frame_height is None:
        frame_height = math.ceil(frame_width / width_to_height)

    frame_size = (frame_width, frame_height)

    font_size = min(frame_height, frame_width) // 4
    font = PIL.ImageFont.truetype(font=font_file, size=font_size)
    font_fill = 0xFFFFFF  # white
    frame_color = 0xFFFFFF - font_fill  # black
    # Create a video container
    container = av.open(str(output_path), mode='w')
    try:
        # Add a video stream
        stream = container.add_stream('h264', rate=frame_rate)
        stream.width, stream.height = frame_size
        stream.pix_fmt = 'yuv420p'

        for frame_number in range(frame_count):
            # Create an image with a number in it
            image = PIL.Image.new('RGB', frame_size, color=frame_color)
            draw = PIL.ImageDraw.Draw(image)
            text = str(frame_number)
            _, _, text_width, text_height = draw.textbbox((0, 0), text, font=font)
            text_position = ((frame_size[0] - text_width) // 2, (frame_size[1] - text_height) // 2)
            draw.text(text_position, text, font=font, fill=font_fill)

            # Convert the PIL image to an AVFrame
            frame = av.VideoFrame.from_image(image)

            # Encode and write the frame
            for packet in stream.encode(frame):
                container.mux(packet)

        # Flush the stream
        for packet in stream.encode():
            container.mux(packet)
    finally:
        # close the container even if drawing or encoding failed
        container.close()
    return output_path