pixeltable 0.2.13__py3-none-any.whl → 0.2.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +1 -1
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/column.py +8 -3
- pixeltable/catalog/globals.py +8 -0
- pixeltable/catalog/table.py +25 -9
- pixeltable/catalog/table_version.py +30 -55
- pixeltable/catalog/view.py +1 -1
- pixeltable/env.py +4 -4
- pixeltable/exec/__init__.py +2 -1
- pixeltable/exec/row_update_node.py +61 -0
- pixeltable/exec/{sql_scan_node.py → sql_node.py} +120 -56
- pixeltable/exprs/__init__.py +1 -1
- pixeltable/exprs/arithmetic_expr.py +41 -16
- pixeltable/exprs/expr.py +72 -22
- pixeltable/exprs/function_call.py +64 -29
- pixeltable/exprs/globals.py +5 -1
- pixeltable/exprs/inline_array.py +18 -11
- pixeltable/exprs/method_ref.py +63 -0
- pixeltable/ext/__init__.py +9 -0
- pixeltable/ext/functions/__init__.py +8 -0
- pixeltable/ext/functions/whisperx.py +45 -5
- pixeltable/ext/functions/yolox.py +60 -14
- pixeltable/func/callable_function.py +12 -4
- pixeltable/func/expr_template_function.py +1 -1
- pixeltable/func/function.py +12 -2
- pixeltable/func/function_registry.py +24 -9
- pixeltable/func/udf.py +32 -4
- pixeltable/functions/__init__.py +1 -1
- pixeltable/functions/fireworks.py +33 -0
- pixeltable/functions/huggingface.py +96 -6
- pixeltable/functions/image.py +226 -41
- pixeltable/functions/json.py +46 -0
- pixeltable/functions/openai.py +214 -0
- pixeltable/functions/string.py +195 -218
- pixeltable/functions/timestamp.py +210 -0
- pixeltable/functions/together.py +106 -0
- pixeltable/functions/video.py +2 -2
- pixeltable/functions/{eval.py → vision.py} +170 -27
- pixeltable/functions/whisper.py +32 -0
- pixeltable/io/__init__.py +1 -1
- pixeltable/io/external_store.py +2 -2
- pixeltable/io/globals.py +133 -1
- pixeltable/io/pandas.py +82 -31
- pixeltable/iterators/video.py +55 -23
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_18.py +39 -0
- pixeltable/metadata/notes.py +10 -0
- pixeltable/plan.py +76 -1
- pixeltable/store.py +65 -28
- pixeltable/tool/create_test_db_dump.py +8 -9
- pixeltable/tool/doc_plugins/griffe.py +4 -0
- pixeltable/type_system.py +84 -63
- {pixeltable-0.2.13.dist-info → pixeltable-0.2.15.dist-info}/METADATA +2 -2
- {pixeltable-0.2.13.dist-info → pixeltable-0.2.15.dist-info}/RECORD +57 -51
- pixeltable/exprs/image_member_access.py +0 -96
- {pixeltable-0.2.13.dist-info → pixeltable-0.2.15.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.13.dist-info → pixeltable-0.2.15.dist-info}/WHEEL +0 -0
- {pixeltable-0.2.13.dist-info → pixeltable-0.2.15.dist-info}/entry_points.txt +0 -0
pixeltable/io/external_store.py
CHANGED
@@ -244,7 +244,7 @@ class Project(ExternalStore, abc.ABC):
             if ext_col in export_cols:
                 # Validate that the table column can be assigned to the external column
                 ext_col_type = export_cols[ext_col]
-                if not ext_col_type.is_supertype_of(t_col_type):
+                if not ext_col_type.is_supertype_of(t_col_type, ignore_nullable=True):
                     raise excs.Error(
                         f'Column `{t_col}` cannot be exported to external column `{ext_col}` (incompatible types; expecting `{ext_col_type}`)'
                     )
@@ -255,7 +255,7 @@ class Project(ExternalStore, abc.ABC):
                     f'Column `{t_col}` is a computed column, which cannot be populated from an external column'
                 )
             ext_col_type = import_cols[ext_col]
-            if not t_col_type.is_supertype_of(ext_col_type):
+            if not t_col_type.is_supertype_of(ext_col_type, ignore_nullable=True):
                 raise excs.Error(
                     f'Column `{t_col}` cannot be imported from external column `{ext_col}` (incompatible types; expecting `{ext_col_type}`)'
                 )
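Both validation paths now pass `ignore_nullable=True`, so a difference in nullability alone no longer blocks linking a table column to an external column. A minimal sketch of the intended effect (the `False` result on the first call reflects the assumed pre-change behavior, which this diff does not show):

```python
import pixeltable as pxt

# Hypothetical pair of types: the external column demands non-null strings,
# while the table column is nullable.
required = pxt.StringType(nullable=False)
provided = pxt.StringType(nullable=True)

print(required.is_supertype_of(provided))                        # assumed False: nullability mismatch
print(required.is_supertype_of(provided, ignore_nullable=True))  # True: nullability is ignored
```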
pixeltable/io/globals.py
CHANGED
@@ -1,5 +1,7 @@
-from typing import Any, Optional,
+from typing import Any, Literal, Optional, Union
+import urllib.request
 
+import pixeltable as pxt
 import pixeltable.exceptions as excs
 from pixeltable import Table
 from pixeltable.io.external_store import SyncStatus
@@ -134,3 +136,133 @@ def create_label_studio_project(
         return t.sync()
     else:
         return SyncStatus.empty()
+
+
+def import_rows(
+    tbl_path: str,
+    rows: list[dict[str, Any]],
+    *,
+    schema_overrides: Optional[dict[str, pxt.ColumnType]] = None,
+    primary_key: Optional[Union[str, list[str]]] = None,
+    num_retained_versions: int = 10,
+    comment: str = ''
+) -> Table:
+    """
+    Creates a new `Table` from a list of dictionaries. The dictionaries must be of the form
+    `{column_name: value, ...}`. Pixeltable will attempt to infer the schema of the table from the
+    supplied data, using the most specific type that can represent all the values in a column.
+
+    If `schema_overrides` is specified, then for each entry `(column_name, type)` in `schema_overrides`,
+    Pixeltable will force the specified column to the specified type (and will not attempt any type inference
+    for that column).
+
+    All column types of the new `Table` will be nullable unless explicitly specified as non-nullable in
+    `schema_overrides`.
+
+    Args:
+        tbl_path: The qualified name of the table to create.
+        rows: The list of dictionaries to import.
+        schema_overrides: If specified, then columns in `schema_overrides` will be given the specified types
+            as described above.
+        primary_key: The primary key of the table (see [`create_table()`][pixeltable.create_table]).
+        num_retained_versions: The number of retained versions of the table (see [`create_table()`][pixeltable.create_table]).
+        comment: A comment to attach to the table (see [`create_table()`][pixeltable.create_table]).
+
+    Returns:
+        The newly created `Table`.
+    """
+    if schema_overrides is None:
+        schema_overrides = {}
+    schema: dict[str, pxt.ColumnType] = {}
+    cols_with_nones: set[str] = set()
+
+    for n, row in enumerate(rows):
+        for col_name, value in row.items():
+            if col_name in schema_overrides:
+                # We do the insertion here; this will ensure that the column order matches the order
+                # in which the column names are encountered in the input data, even if `schema_overrides`
+                # is specified.
+                if col_name not in schema:
+                    schema[col_name] = schema_overrides[col_name]
+            elif value is not None:
+                # If `key` is not in `schema_overrides`, then we infer its type from the data.
+                # The column type will always be nullable by default.
+                col_type = pxt.ColumnType.infer_literal_type(value).copy(nullable=True)
+                if col_name not in schema:
+                    schema[col_name] = col_type
+                else:
+                    supertype = schema[col_name].supertype(col_type)
+                    if supertype is None:
+                        raise excs.Error(
+                            f'Could not infer type of column `{col_name}`; the value in row {n} does not match preceding type {schema[col_name]}: {value!r}\n'
+                            'Consider specifying the type explicitly in `schema_overrides`.'
+                        )
+                    schema[col_name] = supertype
+            else:
+                cols_with_nones.add(col_name)
+
+    extraneous_keys = schema_overrides.keys() - schema.keys()
+    if len(extraneous_keys) > 0:
+        raise excs.Error(f'The following columns specified in `schema_overrides` are not present in the data: {", ".join(extraneous_keys)}')
+
+    entirely_none_cols = cols_with_nones - schema.keys()
+    if len(entirely_none_cols) > 0:
+        # A column can only end up in `entirely_null_cols` if it was not in `schema_overrides` and
+        # was not encountered in any row with a non-None value.
+        raise excs.Error(
+            f'The following columns have no non-null values: {", ".join(entirely_none_cols)}\n'
+            'Consider specifying the type(s) explicitly in `schema_overrides`.'
+        )
+
+    t = pxt.create_table(tbl_path, schema, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
+    t.insert(rows)
+    return t
+
+
+def import_json(
+    tbl_path: str,
+    filepath_or_url: str,
+    *,
+    schema_overrides: Optional[dict[str, pxt.ColumnType]] = None,
+    primary_key: Optional[Union[str, list[str]]] = None,
+    num_retained_versions: int = 10,
+    comment: str = '',
+    **kwargs: Any
+) -> Table:
+    """
+    Creates a new `Table` from a JSON file. This is a convenience method and is equivalent
+    to calling `import_data(table_path, json.loads(file_contents, **kwargs), ...)`, where `file_contents`
+    is the contents of the specified `filepath_or_url`.
+
+    Args:
+        tbl_path: The name of the table to create.
+        filepath_or_url: The path or URL of the JSON file.
+        schema_overrides: If specified, then columns in `schema_overrides` will be given the specified types
+            (see [`import_rows()`][pixeltable.io.import_rows]).
+        primary_key: The primary key of the table (see [`create_table()`][pixeltable.create_table]).
+        num_retained_versions: The number of retained versions of the table (see [`create_table()`][pixeltable.create_table]).
+        comment: A comment to attach to the table (see [`create_table()`][pixeltable.create_table]).
+        kwargs: Additional keyword arguments to pass to `json.loads`.
+
+    Returns:
+        The newly created `Table`.
+    """
+    import json
+    import urllib.parse
+    import urllib.request
+
+    # TODO Consolidate this logic with other places where files/URLs are parsed
+    parsed = urllib.parse.urlparse(filepath_or_url)
+    if len(parsed.scheme) <= 1 or parsed.scheme == 'file':
+        # local file path
+        if len(parsed.scheme) <= 1:
+            filepath = filepath_or_url
+        else:
+            filepath = urllib.parse.unquote(urllib.request.url2pathname(parsed.path))
+        with open(filepath) as fp:
+            contents = fp.read()
+    else:
+        # URL
+        contents = urllib.request.urlopen(filepath_or_url).read()
+    data = json.loads(contents, **kwargs)
+    return import_rows(tbl_path, data, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
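A short usage sketch of the two new entry points, assuming they are re-exported under `pixeltable.io` (consistent with the `pixeltable/io/__init__.py` change above); the table names and data are made up:

```python
import pixeltable as pxt

rows = [
    {'name': 'alice', 'score': 9.5},
    {'name': 'bob', 'score': None},  # None values are allowed; inferred columns are nullable
]
t = pxt.io.import_rows('demo.players', rows)

# Pin a column's type instead of relying on inference:
t2 = pxt.io.import_rows(
    'demo.players2', rows,
    schema_overrides={'score': pxt.FloatType(nullable=True)},
)

# import_json() is the same flow, with rows loaded from a local path or URL:
t3 = pxt.io.import_json('demo.players3', '/tmp/players.json')
```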
pixeltable/io/pandas.py
CHANGED
@@ -1,7 +1,9 @@
-
+import datetime
+from typing import Any, Optional, Union
 
 import numpy as np
 import pandas as pd
+import PIL.Image
 
 import pixeltable as pxt
 import pixeltable.exceptions as excs
@@ -15,7 +17,7 @@ def import_pandas(
     comment: str = ''
 ) -> pxt.catalog.InsertableTable:
     """Creates a new `Table` from a Pandas `DataFrame`, with the specified name. The schema of the table
-    will be inferred from the `DataFrame
+    will be inferred from the `DataFrame`.
 
     The column names of the new `Table` will be identical to those in the `DataFrame`, as long as they are valid
     Pixeltable identifiers. If a column name is not a valid Pixeltable identifier, it will be normalized according to
@@ -32,9 +34,16 @@ def import_pandas(
     `schema_overrides` should be the column names of the `DataFrame` (whether or not they are valid
     Pixeltable identifiers).
     """
-
-
-
+    if schema_overrides is None:
+        schema_overrides = {}
+    if primary_key is None:
+        primary_key = []
+    elif isinstance(primary_key, str):
+        primary_key = [primary_key]
+
+    schema, pxt_pk = __df_to_pxt_schema(df, schema_overrides, primary_key)
+    tbl_rows = (dict(__df_row_to_pxt_row(row, schema)) for row in df.itertuples())
+    table = pxt.create_table(tbl_name, schema, primary_key=pxt_pk, num_retained_versions=num_retained_versions, comment=comment)
     table.insert(tbl_rows)
     return table
 
@@ -71,22 +80,44 @@ def import_excel(
     return import_pandas(tbl_name, df, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
 
 
-def
-    df: pd.DataFrame, schema_overrides:
-) -> dict[str, pxt.ColumnType]:
-
-
-
-
-
-
-
+def __df_to_pxt_schema(
+    df: pd.DataFrame, schema_overrides: dict[str, pxt.ColumnType], primary_key: list[str]
+) -> tuple[dict[str, pxt.ColumnType], list[str]]:
+    """
+    Infers a Pixeltable schema from a Pandas DataFrame.
+
+    Returns:
+        A tuple containing a Pixeltable schema and a list of primary key column names.
+    """
+    for pd_name in schema_overrides:
+        if pd_name not in df.columns:
+            raise excs.Error(
+                f'Column `{pd_name}` specified in `schema_overrides` does not exist in the given `DataFrame`.'
+            )
+    for pd_name in primary_key:
+        if pd_name not in df.columns:
+            raise excs.Error(f'Primary key column `{pd_name}` does not exist in the given `DataFrame`.')
+
+    schema: dict[str, pxt.ColumnType] = {}
+    col_mapping: dict[str, str] = {}  # Maps Pandas column names to Pixeltable column names
+
     for pd_name, pd_dtype in zip(df.columns, df.dtypes):
-        if
+        if pd_name in schema_overrides:
             pxt_type = schema_overrides[pd_name]
         else:
-
-
+            # This complicated-looking condition is necessary because we cannot safely call `pd.isna()` on
+            # general objects, so we need to check for nulls in the specific cases where we might expect them.
+            # isinstance(val, float) will check for NaN values in float columns *as well as* floats appearing
+            # in object columns (where Pandas uses NaN as a general null).
+            # np.issubdtype(pd_dtype, np.datetime64) checks for NaT values specifically in datetime columns.
+            has_na = any(
+                (isinstance(val, float) or np.issubdtype(pd_dtype, np.datetime64)) and pd.isna(val)
+                for val in df[pd_name]
+            )
+            if has_na and pd_name in primary_key:
+                raise excs.Error(f'Primary key column `{pd_name}` cannot contain null values.')
+            pxt_type = __np_dtype_to_pxt_type(pd_dtype, df[pd_name], pd_name not in primary_key)
+        pxt_name = __normalize_pxt_col_name(pd_name)
         # Ensure that column names are unique by appending a distinguishing suffix
         # to any collisions
         if pxt_name in schema:
@@ -95,10 +126,13 @@ def _df_to_pxt_schema(
             n += 1
             pxt_name = f'{pxt_name}_{n}'
         schema[pxt_name] = pxt_type
-
+        col_mapping[pd_name] = pxt_name
 
+    pxt_pk = [col_mapping[pk] for pk in primary_key]
+    return schema, pxt_pk
 
-
+
+def __normalize_pxt_col_name(pd_name: str) -> str:
     """
     Normalizes an arbitrary DataFrame column name into a valid Pixeltable identifier by:
     - replacing any non-ascii or non-alphanumeric characters with an underscore _
@@ -113,26 +147,43 @@ def _normalize_pxt_col_name(pd_name: str) -> str:
     return id
 
 
-def
+def __np_dtype_to_pxt_type(np_dtype: np.dtype, data_col: pd.Series, nullable: bool) -> pxt.ColumnType:
     """
     Infers a Pixeltable type based on a Numpy dtype.
     """
     if np.issubdtype(np_dtype, np.integer):
-        return pxt.IntType()
+        return pxt.IntType(nullable=nullable)
+
     if np.issubdtype(np_dtype, np.floating):
-        return pxt.FloatType()
+        return pxt.FloatType(nullable=nullable)
+
     if np.issubdtype(np_dtype, np.bool_):
-        return pxt.BoolType()
-
-
-        return pxt.StringType(nullable=
+        return pxt.BoolType(nullable=nullable)
+
+    if np.issubdtype(np_dtype, np.character):
+        return pxt.StringType(nullable=nullable)
+
     if np.issubdtype(np_dtype, np.datetime64):
-
-
-
+        return pxt.TimestampType(nullable=nullable)
+
+    if np_dtype == np.object_:
+        # The `object_` dtype can mean all sorts of things; see if we can infer the Pixeltable type
+        # based on the actual data in `data_col`.
+        # First drop any null values (they don't contribute to type inference).
+        data_col = data_col.dropna()
+
+        if len(data_col) == 0:
+            # No non-null values; default to FloatType (the Pandas type of an all-NaN column)
+            return pxt.FloatType(nullable=nullable)
+
+        inferred_type = pxt.ColumnType.infer_common_literal_type(data_col)
+        if inferred_type is not None:
+            return inferred_type.copy(nullable=nullable)
+
+    raise excs.Error(f'Could not infer Pixeltable type of column: {data_col.name} (dtype: {np_dtype})')
 
 
-def
+def __df_row_to_pxt_row(row: tuple[Any, ...], schema: dict[str, pxt.ColumnType]) -> dict[str, Any]:
     rows = {}
     for val, (col_name, pxt_type) in zip(row[1:], schema.items()):
         if pxt_type.is_float_type():
pixeltable/iterators/video.py
CHANGED
@@ -1,57 +1,89 @@
 import logging
 import math
 from pathlib import Path
-from typing import
+from typing import Any, Optional
 
-import PIL.Image
 import cv2
+import PIL.Image
 
 from pixeltable.exceptions import Error
-from pixeltable.type_system import ColumnType,
+from pixeltable.type_system import ColumnType, FloatType, ImageType, IntType, VideoType
+
 from .base import ComponentIterator
 
 _logger = logging.getLogger('pixeltable')
 
 
 class FrameIterator(ComponentIterator):
-    """
+    """
+    Iterator over frames of a video. At most one of `fps` or `num_frames` may be specified. If `fps` is specified,
+    then frames will be extracted at the specified rate (frames per second). If `num_frames` is specified, then the
+    exact number of frames will be extracted. If neither is specified, then all frames will be extracted. The first
+    frame of the video will always be extracted, and the remaining frames will be spaced as evenly as possible.
 
     Args:
-        video: URL or
-        fps:
-            If set to 0.0, then the native framerate of the video will be used (all frames will be
-
+        video: URL or path of the video to use for frame extraction.
+        fps: Number of frames to extract per second of video. This may be a fractional value, such as 0.5.
+            If omitted or set to 0.0, then the native framerate of the video will be used (all frames will be
+            extracted). If `fps` is greater than the frame rate of the video, an error will be raised.
+        num_frames: Exact number of frames to extract. The frames will be spaced as evenly as possible. If
+            `num_frames` is greater than the number of frames in the video, all frames will be extracted.
     """
-    def __init__(self, video: str, *, fps: float =
+    def __init__(self, video: str, *, fps: Optional[float] = None, num_frames: Optional[int] = None):
+        if fps is not None and num_frames is not None:
+            raise Error('At most one of `fps` or `num_frames` may be specified')
+
         video_path = Path(video)
         assert video_path.exists() and video_path.is_file()
         self.video_path = video_path
-        self.fps = fps
         self.video_reader = cv2.VideoCapture(str(video_path))
+        self.fps = fps
+        self.num_frames = num_frames
         if not self.video_reader.isOpened():
            raise Error(f'Failed to open video: {video}')
+
         video_fps = int(self.video_reader.get(cv2.CAP_PROP_FPS))
-        if fps > video_fps:
+        if fps is not None and fps > video_fps:
            raise Error(f'Video {video}: requested fps ({fps}) exceeds that of the video ({video_fps})')
-        self.frame_freq = int(video_fps / fps) if fps > 0 else 1
         num_video_frames = int(self.video_reader.get(cv2.CAP_PROP_FRAME_COUNT))
         if num_video_frames == 0:
             raise Error(f'Video {video}: failed to get number of frames')
-        # ceil: round up to ensure we count frame 0
-        self.num_frames = math.ceil(num_video_frames / self.frame_freq) if fps > 0 else num_video_frames
-        _logger.debug(f'FrameIterator: path={self.video_path} fps={self.fps}')
 
+        if num_frames is not None:
+            # specific number of frames
+            if num_frames > num_video_frames:
+                # Extract all frames
+                self.frames_to_extract = range(num_video_frames)
+            else:
+                spacing = float(num_video_frames) / float(num_frames)
+                self.frames_to_extract = list(round(i * spacing) for i in range(num_frames))
+                assert len(self.frames_to_extract) == num_frames
+        else:
+            if fps is None or fps == 0.0:
+                # Extract all frames
+                self.frames_to_extract = range(num_video_frames)
+            else:
+                # Extract frames at the implied frequency
+                freq = fps / video_fps
+                n = math.ceil(num_video_frames * freq)  # number of frames to extract
+                self.frames_to_extract = list(round(i / freq) for i in range(n))
+
+        # We need the list of frames as both a list (for set_pos) and a set (for fast lookups when
+        # there are lots of frames)
+        self.frames_set = set(self.frames_to_extract)
+        _logger.debug(f'FrameIterator: path={self.video_path} fps={self.fps} num_frames={self.num_frames}')
         self.next_frame_idx = 0
 
     @classmethod
-    def input_schema(cls) ->
+    def input_schema(cls) -> dict[str, ColumnType]:
         return {
             'video': VideoType(nullable=False),
-            'fps': FloatType()
+            'fps': FloatType(nullable=True),
+            'num_frames': IntType(nullable=True),
         }
 
     @classmethod
-    def output_schema(cls, *args: Any, **kwargs: Any) ->
+    def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ColumnType], list[str]]:
         return {
             'frame_idx': IntType(),
             'pos_msec': FloatType(),
@@ -59,7 +91,9 @@ class FrameIterator(ComponentIterator):
             'frame': ImageType(),
         }, ['frame']
 
-    def __next__(self) ->
+    def __next__(self) -> dict[str, Any]:
+        # jumping to the target frame here with video_reader.set() is far slower than just
+        # skipping the unwanted frames
         while True:
             pos_msec = self.video_reader.get(cv2.CAP_PROP_POS_MSEC)
             pos_frame = self.video_reader.get(cv2.CAP_PROP_POS_FRAMES)
@@ -69,7 +103,7 @@ class FrameIterator(ComponentIterator):
                 self.video_reader.release()
                 self.video_reader = None
                 raise StopIteration
-            if pos_frame
+            if pos_frame in self.frames_set:
                 img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                 result = {
                     'frame_idx': self.next_frame_idx,
@@ -78,8 +112,6 @@ class FrameIterator(ComponentIterator):
                     'frame': PIL.Image.fromarray(img),
                 }
                 self.next_frame_idx += 1
-                # frame_freq > 1: jumping to the target frame here with video_reader.set() is far slower than just
-                # skipping the unwanted frames
                 return result
 
     def close(self) -> None:
@@ -92,5 +124,5 @@ class FrameIterator(ComponentIterator):
         if pos == self.next_frame_idx:
             return
         _logger.debug(f'seeking to frame {pos}')
-        self.video_reader.set(cv2.CAP_PROP_POS_FRAMES,
+        self.video_reader.set(cv2.CAP_PROP_POS_FRAMES, self.frames_to_extract[pos])
         self.next_frame_idx = pos
pixeltable/metadata/__init__.py
CHANGED
@@ -10,7 +10,7 @@ import sqlalchemy.orm as orm
 from .schema import SystemInfo, SystemInfoMd
 
 # current version of the metadata; this is incremented whenever the metadata schema changes
-VERSION = 18
+VERSION = 19
 
 
 def create_system_info(engine: sql.engine.Engine) -> None:
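The version constant is importable directly, so the bump can be verified at runtime; a trivial check (assuming a normal install of this release):

```python
from pixeltable import metadata

print(metadata.VERSION)  # 19 in this release
```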
pixeltable/metadata/converters/convert_18.py
ADDED
@@ -0,0 +1,39 @@
+from typing import Any, Optional
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_md
+
+
+@register_converter(version=18)
+def _(engine: sql.engine.Engine) -> None:
+    convert_table_md(
+        engine,
+        substitution_fn=__substitute_md
+    )
+
+
+def __substitute_md(k: Any, v: Any) -> Optional[tuple[Any, Any]]:
+    # Migrate a few changed function names
+    if k == 'path' and v == 'pixeltable.functions.string.str_format':
+        return 'path', 'pixeltable.functions.string.format'
+    if k == 'path' and v.startswith('pixeltable.functions.pil.image'):
+        return 'path', v.replace('pixeltable.functions.pil.image', 'pixeltable.functions.image')
+    # Migrate deprecated `ImageMemberAccess` expressions to `FunctionCall`s
+    if isinstance(v, dict) and '_classname' in v and v['_classname'] == 'ImageMemberAccess':
+        member_name = v['member_name']
+        new_v = {
+            'fn': {
+                'path': f'pixeltable.functions.image.{member_name}',
+                '_classpath': 'pixeltable.func.callable_function.CallableFunction',
+            },
+            'args': [[0, None]],
+            'kwargs': {},
+            '_classname': 'FunctionCall',
+            'components': v['components'],
+            'group_by_stop_idx': 0,
+            'group_by_start_idx': 0,
+            'order_by_start_idx': 1,
+        }
+        return k, new_v
+    return None
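Concretely, the converter turns a serialized `ImageMemberAccess` node into an equivalent serialized `FunctionCall`. A before/after sketch on a minimal, hypothetical metadata fragment (with `components` elided):

```python
old = {'_classname': 'ImageMemberAccess', 'member_name': 'rotate', 'components': []}

# What __substitute_md produces for the fragment above:
new = {
    'fn': {
        'path': 'pixeltable.functions.image.rotate',
        '_classpath': 'pixeltable.func.callable_function.CallableFunction',
    },
    'args': [[0, None]],
    'kwargs': {},
    '_classname': 'FunctionCall',
    'components': old['components'],
    'group_by_stop_idx': 0,
    'group_by_start_idx': 0,
    'order_by_start_idx': 1,
}
```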
pixeltable/metadata/notes.py
ADDED
@@ -0,0 +1,10 @@
+# Descriptive notes for each new metadata version. These are stored in a Python dict
+# rather than as a comment, so that the existence of a description can be enforced by
+# the unit tests when new versions are added.
+VERSION_NOTES = {
+    19: 'UDF renames; ImageMemberAccess removal',
+    18: 'Restructured index metadata',
+    17: 'Renamed remotes to external_stores',
+    16: 'Query functions; deferred Expr deserialization',
+    15: 'Remotes in table metadata',
+}
pixeltable/plan.py
CHANGED
@@ -107,7 +107,7 @@ class Analyzer:
         for e in self.group_by_clause:
             if e.sql_expr() is None:
                 raise excs.Error(f'Invalid grouping expression, needs to be expressible in SQL: {e}')
-            if e.
+            if e._contains(filter=lambda e: _is_agg_fn_call(e)):
                 raise excs.Error(f'Grouping expression contains aggregate function: {e}')
 
         # check that agg fn calls don't have contradicting ordering requirements
@@ -288,6 +288,81 @@ class Planner:
         recomputed_user_cols = [c for c in recomputed_cols if c.name is not None]
         return plan, [f'{c.tbl.name}.{c.name}' for c in updated_cols + recomputed_user_cols], recomputed_user_cols
 
+    @classmethod
+    def create_batch_update_plan(
+            cls, tbl: catalog.TableVersionPath,
+            batch: list[dict[catalog.Column, exprs.Expr]], rowids: list[tuple[int, ...]],
+            cascade: bool
+    ) -> Tuple[exec.ExecNode, exec.RowUpdateNode, sql.ClauseElement, List[catalog.Column], List[catalog.Column]]:
+        """
+        Returns:
+        - root node of the plan to produce the updated rows
+        - RowUpdateNode of plan
+        - Where clause for deleting the current versions of updated rows
+        - list of columns that are getting updated
+        - list of user-visible columns that are being recomputed
+        """
+        assert isinstance(tbl, catalog.TableVersionPath)
+        target = tbl.tbl_version  # the one we need to update
+        sa_key_cols: list[sql.Column] = []
+        key_vals: list[tuple] = []
+        if len(rowids) > 0:
+            sa_key_cols = target.store_tbl.rowid_columns()
+            key_vals = rowids
+        else:
+            pk_cols = target.primary_key_columns()
+            sa_key_cols = [c.sa_col for c in pk_cols]
+            key_vals = [tuple(row[col].val for col in pk_cols) for row in batch]
+
+        # retrieve all stored cols and all target exprs
+        updated_cols = batch[0].keys() - target.primary_key_columns()
+        recomputed_cols = target.get_dependent_columns(updated_cols) if cascade else set()
+        # regardless of cascade, we need to update all indices on any updated column
+        idx_val_cols = target.get_idx_val_columns(updated_cols)
+        recomputed_cols.update(idx_val_cols)
+        # we only need to recompute stored columns (unstored ones are substituted away)
+        recomputed_cols = {c for c in recomputed_cols if c.is_stored}
+        recomputed_base_cols = {col for col in recomputed_cols if col.tbl == target}
+        copied_cols = [
+            col for col in target.cols if col.is_stored and not col in updated_cols and not col in recomputed_base_cols
+        ]
+        select_list = [exprs.ColumnRef(col) for col in copied_cols]
+        select_list.extend([exprs.ColumnRef(col) for col in updated_cols])
+
+        recomputed_exprs = \
+            [c.value_expr.copy().resolve_computed_cols(resolve_cols=recomputed_base_cols) for c in recomputed_base_cols]
+        # the RowUpdateNode updates columns in-place, ie, in the original ColumnRef; no further sustitution is needed
+        select_list.extend(recomputed_exprs)
+
+        # ExecNode tree (from bottom to top):
+        # - SqlLookupNode to retrieve the existing rows
+        # - RowUpdateNode to update the retrieved rows
+        # - ExprEvalNode to evaluate the remaining output exprs
+        analyzer = Analyzer(tbl, select_list)
+        row_builder = exprs.RowBuilder(analyzer.all_exprs, [], analyzer.sql_exprs)
+        analyzer.finalize(row_builder)
+        plan = exec.SqlLookupNode(tbl, row_builder, analyzer.sql_exprs, sa_key_cols, key_vals)
+        delete_where_clause = plan.where_clause
+        col_vals = [{col: row[col].val for col in updated_cols} for row in batch]
+        plan = row_update_node = exec.RowUpdateNode(tbl, key_vals, len(rowids) > 0, col_vals, row_builder, plan)
+        if not cls._is_contained_in(analyzer.select_list, analyzer.sql_exprs):
+            # we need an ExprEvalNode to evaluate the remaining output exprs
+            plan = exec.ExprEvalNode(row_builder, analyzer.select_list, analyzer.sql_exprs, input=plan)
+        # update row builder with column information
+        all_base_cols = copied_cols + list(updated_cols) + list(recomputed_base_cols)  # same order as select_list
+        row_builder.substitute_exprs(select_list, remove_duplicates=False)
+        for i, col in enumerate(all_base_cols):
+            plan.row_builder.add_table_column(col, select_list[i].slot_idx)
+
+        ctx = exec.ExecContext(row_builder)
+        # we're returning everything to the user, so we might as well do it in a single batch
+        ctx.batch_size = 0
+        plan.set_ctx(ctx)
+        recomputed_user_cols = [c for c in recomputed_cols if c.name is not None]
+        return (
+            plan, row_update_node, delete_where_clause, list(updated_cols) + recomputed_user_cols, recomputed_user_cols
+        )
+
     @classmethod
     def create_view_update_plan(
         cls, view: catalog.TableVersionPath, recompute_targets: List[catalog.Column]