pixeltable 0.2.13__py3-none-any.whl → 0.2.15__py3-none-any.whl

This diff shows the changes between two package versions as they were published to their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of pixeltable might be problematic.

Files changed (58)
  1. pixeltable/__init__.py +1 -1
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/column.py +8 -3
  4. pixeltable/catalog/globals.py +8 -0
  5. pixeltable/catalog/table.py +25 -9
  6. pixeltable/catalog/table_version.py +30 -55
  7. pixeltable/catalog/view.py +1 -1
  8. pixeltable/env.py +4 -4
  9. pixeltable/exec/__init__.py +2 -1
  10. pixeltable/exec/row_update_node.py +61 -0
  11. pixeltable/exec/{sql_scan_node.py → sql_node.py} +120 -56
  12. pixeltable/exprs/__init__.py +1 -1
  13. pixeltable/exprs/arithmetic_expr.py +41 -16
  14. pixeltable/exprs/expr.py +72 -22
  15. pixeltable/exprs/function_call.py +64 -29
  16. pixeltable/exprs/globals.py +5 -1
  17. pixeltable/exprs/inline_array.py +18 -11
  18. pixeltable/exprs/method_ref.py +63 -0
  19. pixeltable/ext/__init__.py +9 -0
  20. pixeltable/ext/functions/__init__.py +8 -0
  21. pixeltable/ext/functions/whisperx.py +45 -5
  22. pixeltable/ext/functions/yolox.py +60 -14
  23. pixeltable/func/callable_function.py +12 -4
  24. pixeltable/func/expr_template_function.py +1 -1
  25. pixeltable/func/function.py +12 -2
  26. pixeltable/func/function_registry.py +24 -9
  27. pixeltable/func/udf.py +32 -4
  28. pixeltable/functions/__init__.py +1 -1
  29. pixeltable/functions/fireworks.py +33 -0
  30. pixeltable/functions/huggingface.py +96 -6
  31. pixeltable/functions/image.py +226 -41
  32. pixeltable/functions/json.py +46 -0
  33. pixeltable/functions/openai.py +214 -0
  34. pixeltable/functions/string.py +195 -218
  35. pixeltable/functions/timestamp.py +210 -0
  36. pixeltable/functions/together.py +106 -0
  37. pixeltable/functions/video.py +2 -2
  38. pixeltable/functions/{eval.py → vision.py} +170 -27
  39. pixeltable/functions/whisper.py +32 -0
  40. pixeltable/io/__init__.py +1 -1
  41. pixeltable/io/external_store.py +2 -2
  42. pixeltable/io/globals.py +133 -1
  43. pixeltable/io/pandas.py +82 -31
  44. pixeltable/iterators/video.py +55 -23
  45. pixeltable/metadata/__init__.py +1 -1
  46. pixeltable/metadata/converters/convert_18.py +39 -0
  47. pixeltable/metadata/notes.py +10 -0
  48. pixeltable/plan.py +76 -1
  49. pixeltable/store.py +65 -28
  50. pixeltable/tool/create_test_db_dump.py +8 -9
  51. pixeltable/tool/doc_plugins/griffe.py +4 -0
  52. pixeltable/type_system.py +84 -63
  53. {pixeltable-0.2.13.dist-info → pixeltable-0.2.15.dist-info}/METADATA +2 -2
  54. {pixeltable-0.2.13.dist-info → pixeltable-0.2.15.dist-info}/RECORD +57 -51
  55. pixeltable/exprs/image_member_access.py +0 -96
  56. {pixeltable-0.2.13.dist-info → pixeltable-0.2.15.dist-info}/LICENSE +0 -0
  57. {pixeltable-0.2.13.dist-info → pixeltable-0.2.15.dist-info}/WHEEL +0 -0
  58. {pixeltable-0.2.13.dist-info → pixeltable-0.2.15.dist-info}/entry_points.txt +0 -0
@@ -244,7 +244,7 @@ class Project(ExternalStore, abc.ABC):
             if ext_col in export_cols:
                 # Validate that the table column can be assigned to the external column
                 ext_col_type = export_cols[ext_col]
-                if not ext_col_type.is_supertype_of(t_col_type):
+                if not ext_col_type.is_supertype_of(t_col_type, ignore_nullable=True):
                     raise excs.Error(
                         f'Column `{t_col}` cannot be exported to external column `{ext_col}` (incompatible types; expecting `{ext_col_type}`)'
                     )
@@ -255,7 +255,7 @@ class Project(ExternalStore, abc.ABC):
                     f'Column `{t_col}` is a computed column, which cannot be populated from an external column'
                 )
             ext_col_type = import_cols[ext_col]
-            if not t_col_type.is_supertype_of(ext_col_type):
+            if not t_col_type.is_supertype_of(ext_col_type, ignore_nullable=True):
                 raise excs.Error(
                     f'Column `{t_col}` cannot be imported from external column `{ext_col}` (incompatible types; expecting `{ext_col_type}`)'
                 )
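
The practical effect of the new `ignore_nullable=True` flag, sketched with `pixeltable.type_system` types (a minimal sketch of the intended 0.2.15 behavior, not a verified test):

    from pixeltable.type_system import StringType

    ext_col_type = StringType(nullable=True)   # external column that accepts nulls
    t_col_type = StringType(nullable=False)    # table column that never produces nulls

    # Nullability no longer blocks the link; only the underlying types must be compatible.
    assert ext_col_type.is_supertype_of(t_col_type, ignore_nullable=True)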
pixeltable/io/globals.py CHANGED
@@ -1,5 +1,7 @@
-from typing import Any, Optional, Literal
+from typing import Any, Literal, Optional, Union
+import urllib.request
 
+import pixeltable as pxt
 import pixeltable.exceptions as excs
 from pixeltable import Table
 from pixeltable.io.external_store import SyncStatus
@@ -134,3 +136,133 @@ def create_label_studio_project(
         return t.sync()
     else:
         return SyncStatus.empty()
+
+
+def import_rows(
+    tbl_path: str,
+    rows: list[dict[str, Any]],
+    *,
+    schema_overrides: Optional[dict[str, pxt.ColumnType]] = None,
+    primary_key: Optional[Union[str, list[str]]] = None,
+    num_retained_versions: int = 10,
+    comment: str = ''
+) -> Table:
+    """
+    Creates a new `Table` from a list of dictionaries. The dictionaries must be of the form
+    `{column_name: value, ...}`. Pixeltable will attempt to infer the schema of the table from the
+    supplied data, using the most specific type that can represent all the values in a column.
+
+    If `schema_overrides` is specified, then for each entry `(column_name, type)` in `schema_overrides`,
+    Pixeltable will force the specified column to the specified type (and will not attempt any type inference
+    for that column).
+
+    All column types of the new `Table` will be nullable unless explicitly specified as non-nullable in
+    `schema_overrides`.
+
+    Args:
+        tbl_path: The qualified name of the table to create.
+        rows: The list of dictionaries to import.
+        schema_overrides: If specified, then columns in `schema_overrides` will be given the specified types
+            as described above.
+        primary_key: The primary key of the table (see [`create_table()`][pixeltable.create_table]).
+        num_retained_versions: The number of retained versions of the table (see [`create_table()`][pixeltable.create_table]).
+        comment: A comment to attach to the table (see [`create_table()`][pixeltable.create_table]).
+
+    Returns:
+        The newly created `Table`.
+    """
+    if schema_overrides is None:
+        schema_overrides = {}
+    schema: dict[str, pxt.ColumnType] = {}
+    cols_with_nones: set[str] = set()
+
+    for n, row in enumerate(rows):
+        for col_name, value in row.items():
+            if col_name in schema_overrides:
+                # We do the insertion here; this will ensure that the column order matches the order
+                # in which the column names are encountered in the input data, even if `schema_overrides`
+                # is specified.
+                if col_name not in schema:
+                    schema[col_name] = schema_overrides[col_name]
+            elif value is not None:
+                # If `col_name` is not in `schema_overrides`, then we infer its type from the data.
+                # The column type will always be nullable by default.
+                col_type = pxt.ColumnType.infer_literal_type(value).copy(nullable=True)
+                if col_name not in schema:
+                    schema[col_name] = col_type
+                else:
+                    supertype = schema[col_name].supertype(col_type)
+                    if supertype is None:
+                        raise excs.Error(
+                            f'Could not infer type of column `{col_name}`; the value in row {n} does not match preceding type {schema[col_name]}: {value!r}\n'
+                            'Consider specifying the type explicitly in `schema_overrides`.'
+                        )
+                    schema[col_name] = supertype
+            else:
+                cols_with_nones.add(col_name)
+
+    extraneous_keys = schema_overrides.keys() - schema.keys()
+    if len(extraneous_keys) > 0:
+        raise excs.Error(f'The following columns specified in `schema_overrides` are not present in the data: {", ".join(extraneous_keys)}')
+
+    entirely_none_cols = cols_with_nones - schema.keys()
+    if len(entirely_none_cols) > 0:
+        # A column can only end up in `entirely_none_cols` if it was not in `schema_overrides` and
+        # was not encountered in any row with a non-None value.
+        raise excs.Error(
+            f'The following columns have no non-null values: {", ".join(entirely_none_cols)}\n'
+            'Consider specifying the type(s) explicitly in `schema_overrides`.'
+        )
+
+    t = pxt.create_table(tbl_path, schema, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
+    t.insert(rows)
+    return t
+
+
+def import_json(
+    tbl_path: str,
+    filepath_or_url: str,
+    *,
+    schema_overrides: Optional[dict[str, pxt.ColumnType]] = None,
+    primary_key: Optional[Union[str, list[str]]] = None,
+    num_retained_versions: int = 10,
+    comment: str = '',
+    **kwargs: Any
+) -> Table:
+    """
+    Creates a new `Table` from a JSON file. This is a convenience method and is equivalent
+    to calling `import_rows(tbl_path, json.loads(file_contents, **kwargs), ...)`, where `file_contents`
+    is the contents of the specified `filepath_or_url`.
+
+    Args:
+        tbl_path: The qualified name of the table to create.
+        filepath_or_url: The path or URL of the JSON file.
+        schema_overrides: If specified, then columns in `schema_overrides` will be given the specified types
+            (see [`import_rows()`][pixeltable.io.import_rows]).
+        primary_key: The primary key of the table (see [`create_table()`][pixeltable.create_table]).
+        num_retained_versions: The number of retained versions of the table (see [`create_table()`][pixeltable.create_table]).
+        comment: A comment to attach to the table (see [`create_table()`][pixeltable.create_table]).
+        kwargs: Additional keyword arguments to pass to `json.loads`.
+
+    Returns:
+        The newly created `Table`.
+    """
+    import json
+    import urllib.parse
+    import urllib.request
+
+    # TODO: Consolidate this logic with other places where files/URLs are parsed
+    parsed = urllib.parse.urlparse(filepath_or_url)
+    if len(parsed.scheme) <= 1 or parsed.scheme == 'file':
+        # local file path
+        if len(parsed.scheme) <= 1:
+            filepath = filepath_or_url
+        else:
+            filepath = urllib.parse.unquote(urllib.request.url2pathname(parsed.path))
+        with open(filepath) as fp:
+            contents = fp.read()
+    else:
+        # URL
+        contents = urllib.request.urlopen(filepath_or_url).read()
+    data = json.loads(contents, **kwargs)
+    return import_rows(tbl_path, data, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
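
A usage sketch for the two new entry points (table names, data, and the URL are illustrative):

    import pixeltable as pxt
    from pixeltable.io import import_rows, import_json

    # Schema is inferred: `year` becomes a nullable IntType and `score` a nullable FloatType;
    # `title` is forced to a non-nullable StringType so it can serve as the primary key.
    t = import_rows(
        'films',
        [
            {'title': 'Jaws', 'year': 1975, 'score': 7.9},
            {'title': 'Alien', 'year': 1979, 'score': None},
        ],
        schema_overrides={'title': pxt.StringType(nullable=False)},
        primary_key='title',
    )

    # Same inference path, but reading the rows from a JSON file or URL first
    t2 = import_json('films_json', 'https://example.com/films.json')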
pixeltable/io/pandas.py CHANGED
@@ -1,7 +1,9 @@
-from typing import Optional, Any, Iterable, Union
+import datetime
+from typing import Any, Optional, Union
 
 import numpy as np
 import pandas as pd
+import PIL.Image
 
 import pixeltable as pxt
 import pixeltable.exceptions as excs
@@ -15,7 +17,7 @@ def import_pandas(
     comment: str = ''
 ) -> pxt.catalog.InsertableTable:
     """Creates a new `Table` from a Pandas `DataFrame`, with the specified name. The schema of the table
-    will be inferred from the `DataFrame`, unless `schema` is specified.
+    will be inferred from the `DataFrame`.
 
     The column names of the new `Table` will be identical to those in the `DataFrame`, as long as they are valid
     Pixeltable identifiers. If a column name is not a valid Pixeltable identifier, it will be normalized according to
@@ -32,9 +34,16 @@ def import_pandas(
     `schema_overrides` should be the column names of the `DataFrame` (whether or not they are valid
     Pixeltable identifiers).
     """
-    schema = _df_to_pxt_schema(df, schema_overrides)
-    tbl_rows = (dict(_df_row_to_pxt_row(row, schema)) for row in df.itertuples())
-    table = pxt.create_table(tbl_name, schema, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
+    if schema_overrides is None:
+        schema_overrides = {}
+    if primary_key is None:
+        primary_key = []
+    elif isinstance(primary_key, str):
+        primary_key = [primary_key]
+
+    schema, pxt_pk = __df_to_pxt_schema(df, schema_overrides, primary_key)
+    tbl_rows = (dict(__df_row_to_pxt_row(row, schema)) for row in df.itertuples())
+    table = pxt.create_table(tbl_name, schema, primary_key=pxt_pk, num_retained_versions=num_retained_versions, comment=comment)
     table.insert(tbl_rows)
     return table
 
@@ -71,22 +80,44 @@ def import_excel(
     return import_pandas(tbl_name, df, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
 
 
-def _df_to_pxt_schema(
-    df: pd.DataFrame, schema_overrides: Optional[dict[str, pxt.ColumnType]]
-) -> dict[str, pxt.ColumnType]:
-    if schema_overrides is not None:
-        for pd_name in schema_overrides:
-            if pd_name not in df.columns:
-                raise excs.Error(
-                    f'Column `{pd_name}` specified in `schema_overrides` does not exist in the given `DataFrame`.'
-                )
-    schema = {}
+def __df_to_pxt_schema(
+    df: pd.DataFrame, schema_overrides: dict[str, pxt.ColumnType], primary_key: list[str]
+) -> tuple[dict[str, pxt.ColumnType], list[str]]:
+    """
+    Infers a Pixeltable schema from a Pandas DataFrame.
+
+    Returns:
+        A tuple containing a Pixeltable schema and a list of primary key column names.
+    """
+    for pd_name in schema_overrides:
+        if pd_name not in df.columns:
+            raise excs.Error(
+                f'Column `{pd_name}` specified in `schema_overrides` does not exist in the given `DataFrame`.'
+            )
+    for pd_name in primary_key:
+        if pd_name not in df.columns:
+            raise excs.Error(f'Primary key column `{pd_name}` does not exist in the given `DataFrame`.')
+
+    schema: dict[str, pxt.ColumnType] = {}
+    col_mapping: dict[str, str] = {}  # Maps Pandas column names to Pixeltable column names
+
     for pd_name, pd_dtype in zip(df.columns, df.dtypes):
-        if schema_overrides is not None and pd_name in schema_overrides:
+        if pd_name in schema_overrides:
             pxt_type = schema_overrides[pd_name]
         else:
-            pxt_type = _np_dtype_to_pxt_type(pd_dtype, df[pd_name])
-        pxt_name = _normalize_pxt_col_name(pd_name)
+            # This complicated-looking condition is necessary because we cannot safely call `pd.isna()` on
+            # general objects, so we need to check for nulls in the specific cases where we might expect them.
+            # isinstance(val, float) will check for NaN values in float columns *as well as* floats appearing
+            # in object columns (where Pandas uses NaN as a general null).
+            # np.issubdtype(pd_dtype, np.datetime64) checks for NaT values specifically in datetime columns.
+            has_na = any(
+                (isinstance(val, float) or np.issubdtype(pd_dtype, np.datetime64)) and pd.isna(val)
+                for val in df[pd_name]
+            )
+            if has_na and pd_name in primary_key:
+                raise excs.Error(f'Primary key column `{pd_name}` cannot contain null values.')
+            pxt_type = __np_dtype_to_pxt_type(pd_dtype, df[pd_name], pd_name not in primary_key)
+        pxt_name = __normalize_pxt_col_name(pd_name)
         # Ensure that column names are unique by appending a distinguishing suffix
         # to any collisions
         if pxt_name in schema:
@@ -95,10 +126,13 @@ def _df_to_pxt_schema(
                 n += 1
                 pxt_name = f'{pxt_name}_{n}'
         schema[pxt_name] = pxt_type
-    return schema
+        col_mapping[pd_name] = pxt_name
 
+    pxt_pk = [col_mapping[pk] for pk in primary_key]
+    return schema, pxt_pk
 
-def _normalize_pxt_col_name(pd_name: str) -> str:
+
+def __normalize_pxt_col_name(pd_name: str) -> str:
     """
     Normalizes an arbitrary DataFrame column name into a valid Pixeltable identifier by:
     - replacing any non-ascii or non-alphanumeric characters with an underscore _
@@ -113,26 +147,43 @@ def _normalize_pxt_col_name(pd_name: str) -> str:
     return id
 
 
-def _np_dtype_to_pxt_type(np_dtype: np.dtype, data_col: pd.Series) -> pxt.ColumnType:
+def __np_dtype_to_pxt_type(np_dtype: np.dtype, data_col: pd.Series, nullable: bool) -> pxt.ColumnType:
     """
     Infers a Pixeltable type based on a Numpy dtype.
     """
     if np.issubdtype(np_dtype, np.integer):
-        return pxt.IntType()
+        return pxt.IntType(nullable=nullable)
+
     if np.issubdtype(np_dtype, np.floating):
-        return pxt.FloatType()
+        return pxt.FloatType(nullable=nullable)
+
     if np.issubdtype(np_dtype, np.bool_):
-        return pxt.BoolType()
-    if np_dtype == np.object_ or np.issubdtype(np_dtype, np.character):
-        has_nan = any(isinstance(val, float) and np.isnan(val) for val in data_col)
-        return pxt.StringType(nullable=has_nan)
+        return pxt.BoolType(nullable=nullable)
+
+    if np.issubdtype(np_dtype, np.character):
+        return pxt.StringType(nullable=nullable)
+
     if np.issubdtype(np_dtype, np.datetime64):
-        has_nat = any(pd.isnull(val) for val in data_col)
-        return pxt.TimestampType(nullable=has_nat)
-    raise excs.Error(f'Unsupported dtype: {np_dtype}')
+        return pxt.TimestampType(nullable=nullable)
+
+    if np_dtype == np.object_:
+        # The `object_` dtype can mean all sorts of things; see if we can infer the Pixeltable type
+        # based on the actual data in `data_col`.
+        # First drop any null values (they don't contribute to type inference).
+        data_col = data_col.dropna()
+
+        if len(data_col) == 0:
+            # No non-null values; default to FloatType (the Pandas type of an all-NaN column)
+            return pxt.FloatType(nullable=nullable)
+
+        inferred_type = pxt.ColumnType.infer_common_literal_type(data_col)
+        if inferred_type is not None:
+            return inferred_type.copy(nullable=nullable)
+
+    raise excs.Error(f'Could not infer Pixeltable type of column: {data_col.name} (dtype: {np_dtype})')
 
 
-def _df_row_to_pxt_row(row: tuple[Any, ...], schema: dict[str, pxt.ColumnType]) -> dict[str, Any]:
+def __df_row_to_pxt_row(row: tuple[Any, ...], schema: dict[str, pxt.ColumnType]) -> dict[str, Any]:
     rows = {}
     for val, (col_name, pxt_type) in zip(row[1:], schema.items()):
         if pxt_type.is_float_type():
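
A sketch of how the reworked inference interacts with the new `primary_key` handling (table name and data are illustrative):

    import pandas as pd
    import pixeltable as pxt
    from pixeltable.io import import_pandas

    df = pd.DataFrame({'id': [1, 2, 3], 'name': ['a', 'b', None]})
    # `id` is inferred as a non-nullable IntType because it is a primary key column;
    # `name` is an object column containing a null, so it becomes a nullable StringType.
    t = import_pandas('people', df, primary_key='id')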
pixeltable/iterators/video.py CHANGED
@@ -1,57 +1,89 @@
 import logging
 import math
 from pathlib import Path
-from typing import Dict, Any, List, Tuple
+from typing import Any, Optional
 
-import PIL.Image
 import cv2
+import PIL.Image
 
 from pixeltable.exceptions import Error
-from pixeltable.type_system import ColumnType, VideoType, ImageType, IntType, FloatType
+from pixeltable.type_system import ColumnType, FloatType, ImageType, IntType, VideoType
+
 from .base import ComponentIterator
 
 _logger = logging.getLogger('pixeltable')
 
 
 class FrameIterator(ComponentIterator):
-    """Iterator over frames of a video.
+    """
+    Iterator over frames of a video. At most one of `fps` or `num_frames` may be specified. If `fps` is specified,
+    then frames will be extracted at the specified rate (frames per second). If `num_frames` is specified, then the
+    exact number of frames will be extracted. If neither is specified, then all frames will be extracted. The first
+    frame of the video will always be extracted, and the remaining frames will be spaced as evenly as possible.
 
     Args:
-        video: URL or file of the video to use for frame extraction
-        fps: number of frames to extract per second of video. This may be a fractional value, such as 0.5.
-            If set to 0.0, then the native framerate of the video will be used (all frames will be extracted).
-            Default: 0.0
+        video: URL or path of the video to use for frame extraction.
+        fps: Number of frames to extract per second of video. This may be a fractional value, such as 0.5.
+            If omitted or set to 0.0, then the native framerate of the video will be used (all frames will be
+            extracted). If `fps` is greater than the frame rate of the video, an error will be raised.
+        num_frames: Exact number of frames to extract. The frames will be spaced as evenly as possible. If
+            `num_frames` is greater than the number of frames in the video, all frames will be extracted.
     """
-    def __init__(self, video: str, *, fps: float = 0.0):
+    def __init__(self, video: str, *, fps: Optional[float] = None, num_frames: Optional[int] = None):
+        if fps is not None and num_frames is not None:
+            raise Error('At most one of `fps` or `num_frames` may be specified')
+
         video_path = Path(video)
         assert video_path.exists() and video_path.is_file()
        self.video_path = video_path
-        self.fps = fps
         self.video_reader = cv2.VideoCapture(str(video_path))
+        self.fps = fps
+        self.num_frames = num_frames
         if not self.video_reader.isOpened():
             raise Error(f'Failed to open video: {video}')
+
         video_fps = int(self.video_reader.get(cv2.CAP_PROP_FPS))
-        if fps > video_fps:
+        if fps is not None and fps > video_fps:
             raise Error(f'Video {video}: requested fps ({fps}) exceeds that of the video ({video_fps})')
-        self.frame_freq = int(video_fps / fps) if fps > 0 else 1
         num_video_frames = int(self.video_reader.get(cv2.CAP_PROP_FRAME_COUNT))
         if num_video_frames == 0:
             raise Error(f'Video {video}: failed to get number of frames')
-        # ceil: round up to ensure we count frame 0
-        self.num_frames = math.ceil(num_video_frames / self.frame_freq) if fps > 0 else num_video_frames
-        _logger.debug(f'FrameIterator: path={self.video_path} fps={self.fps}')
 
+        if num_frames is not None:
+            # specific number of frames
+            if num_frames > num_video_frames:
+                # Extract all frames
+                self.frames_to_extract = range(num_video_frames)
+            else:
+                spacing = float(num_video_frames) / float(num_frames)
+                self.frames_to_extract = list(round(i * spacing) for i in range(num_frames))
+                assert len(self.frames_to_extract) == num_frames
+        else:
+            if fps is None or fps == 0.0:
+                # Extract all frames
+                self.frames_to_extract = range(num_video_frames)
+            else:
+                # Extract frames at the implied frequency
+                freq = fps / video_fps
+                n = math.ceil(num_video_frames * freq)  # number of frames to extract
+                self.frames_to_extract = list(round(i / freq) for i in range(n))
+
+        # We need the list of frames as both a list (for set_pos) and a set (for fast lookups when
+        # there are lots of frames)
+        self.frames_set = set(self.frames_to_extract)
+        _logger.debug(f'FrameIterator: path={self.video_path} fps={self.fps} num_frames={self.num_frames}')
         self.next_frame_idx = 0
 
     @classmethod
-    def input_schema(cls) -> Dict[str, ColumnType]:
+    def input_schema(cls) -> dict[str, ColumnType]:
         return {
             'video': VideoType(nullable=False),
-            'fps': FloatType()
+            'fps': FloatType(nullable=True),
+            'num_frames': IntType(nullable=True),
         }
 
     @classmethod
-    def output_schema(cls, *args: Any, **kwargs: Any) -> Tuple[Dict[str, ColumnType], List[str]]:
+    def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ColumnType], list[str]]:
         return {
             'frame_idx': IntType(),
             'pos_msec': FloatType(),
@@ -59,7 +91,9 @@ class FrameIterator(ComponentIterator):
             'frame': ImageType(),
         }, ['frame']
 
-    def __next__(self) -> Dict[str, Any]:
+    def __next__(self) -> dict[str, Any]:
+        # jumping to the target frame here with video_reader.set() is far slower than just
+        # skipping the unwanted frames
         while True:
             pos_msec = self.video_reader.get(cv2.CAP_PROP_POS_MSEC)
             pos_frame = self.video_reader.get(cv2.CAP_PROP_POS_FRAMES)
@@ -69,7 +103,7 @@ class FrameIterator(ComponentIterator):
                 self.video_reader.release()
                 self.video_reader = None
                 raise StopIteration
-            if pos_frame % self.frame_freq == 0:
+            if pos_frame in self.frames_set:
                 img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                 result = {
                     'frame_idx': self.next_frame_idx,
@@ -78,8 +112,6 @@ class FrameIterator(ComponentIterator):
                     'frame': PIL.Image.fromarray(img),
                 }
                 self.next_frame_idx += 1
-                # frame_freq > 1: jumping to the target frame here with video_reader.set() is far slower than just
-                # skipping the unwanted frames
                 return result
 
     def close(self) -> None:
@@ -92,5 +124,5 @@ class FrameIterator(ComponentIterator):
         if pos == self.next_frame_idx:
             return
         _logger.debug(f'seeking to frame {pos}')
-        self.video_reader.set(cv2.CAP_PROP_POS_FRAMES, pos * self.frame_freq)
+        self.video_reader.set(cv2.CAP_PROP_POS_FRAMES, self.frames_to_extract[pos])
         self.next_frame_idx = pos
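
A sketch of the new iterator parameter in action (table and view names are illustrative; `FrameIterator.create` is the usual way to bind iterator arguments when creating a view):

    import pixeltable as pxt
    from pixeltable.iterators import FrameIterator

    videos = pxt.create_table('videos', {'video': pxt.VideoType()})
    # Extract exactly 8 evenly spaced frames per video; frame 0 is always included.
    frames = pxt.create_view(
        'frames',
        videos,
        iterator=FrameIterator.create(video=videos.video, num_frames=8),
    )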
pixeltable/metadata/__init__.py CHANGED
@@ -10,7 +10,7 @@ import sqlalchemy.orm as orm
 from .schema import SystemInfo, SystemInfoMd
 
 # current version of the metadata; this is incremented whenever the metadata schema changes
-VERSION = 18
+VERSION = 19
 
 
 def create_system_info(engine: sql.engine.Engine) -> None:
pixeltable/metadata/converters/convert_18.py ADDED
@@ -0,0 +1,39 @@
+from typing import Any, Optional
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_md
+
+
+@register_converter(version=18)
+def _(engine: sql.engine.Engine) -> None:
+    convert_table_md(
+        engine,
+        substitution_fn=__substitute_md
+    )
+
+
+def __substitute_md(k: Any, v: Any) -> Optional[tuple[Any, Any]]:
+    # Migrate a few changed function names
+    if k == 'path' and v == 'pixeltable.functions.string.str_format':
+        return 'path', 'pixeltable.functions.string.format'
+    if k == 'path' and v.startswith('pixeltable.functions.pil.image'):
+        return 'path', v.replace('pixeltable.functions.pil.image', 'pixeltable.functions.image')
+    # Migrate deprecated `ImageMemberAccess` expressions to `FunctionCall`s
+    if isinstance(v, dict) and '_classname' in v and v['_classname'] == 'ImageMemberAccess':
+        member_name = v['member_name']
+        new_v = {
+            'fn': {
+                'path': f'pixeltable.functions.image.{member_name}',
+                '_classpath': 'pixeltable.func.callable_function.CallableFunction',
+            },
+            'args': [[0, None]],
+            'kwargs': {},
+            '_classname': 'FunctionCall',
+            'components': v['components'],
+            'group_by_stop_idx': 0,
+            'group_by_start_idx': 0,
+            'order_by_start_idx': 1,
+        }
+        return k, new_v
+    return None
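
For reference, this is what the migration does to a serialized `img.width` expression. Both dicts are illustrative reconstructions based on the converter above, not actual stored metadata:

    # Before: a dedicated ImageMemberAccess expression
    before = {
        '_classname': 'ImageMemberAccess',
        'member_name': 'width',
        'components': [{'_classname': 'ColumnRef'}],  # the image operand (schematic)
    }

    # After: an ordinary FunctionCall against the new pixeltable.functions.image UDF
    after = {
        '_classname': 'FunctionCall',
        'fn': {
            'path': 'pixeltable.functions.image.width',
            '_classpath': 'pixeltable.func.callable_function.CallableFunction',
        },
        'args': [[0, None]],  # argument 0 is taken from component slot 0
        'kwargs': {},
        'components': before['components'],
        'group_by_start_idx': 0,
        'group_by_stop_idx': 0,
        'order_by_start_idx': 1,
    }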
pixeltable/metadata/notes.py ADDED
@@ -0,0 +1,10 @@
+# Descriptive notes for each new metadata version. These are stored in a Python dict
+# rather than as a comment, so that the existence of a description can be enforced by
+# the unit tests when new versions are added.
+VERSION_NOTES = {
+    19: 'UDF renames; ImageMemberAccess removal',
+    18: 'Restructured index metadata',
+    17: 'Renamed remotes to external_stores',
+    16: 'Query functions; deferred Expr deserialization',
+    15: 'Remotes in table metadata',
+}
pixeltable/plan.py CHANGED
@@ -107,7 +107,7 @@ class Analyzer:
         for e in self.group_by_clause:
             if e.sql_expr() is None:
                 raise excs.Error(f'Invalid grouping expression, needs to be expressible in SQL: {e}')
-            if e.contains(filter=lambda e: _is_agg_fn_call(e)):
+            if e._contains(filter=lambda e: _is_agg_fn_call(e)):
                 raise excs.Error(f'Grouping expression contains aggregate function: {e}')
 
         # check that agg fn calls don't have contradicting ordering requirements
@@ -288,6 +288,81 @@ class Planner:
         recomputed_user_cols = [c for c in recomputed_cols if c.name is not None]
         return plan, [f'{c.tbl.name}.{c.name}' for c in updated_cols + recomputed_user_cols], recomputed_user_cols
 
+    @classmethod
+    def create_batch_update_plan(
+        cls, tbl: catalog.TableVersionPath,
+        batch: list[dict[catalog.Column, exprs.Expr]], rowids: list[tuple[int, ...]],
+        cascade: bool
+    ) -> Tuple[exec.ExecNode, exec.RowUpdateNode, sql.ClauseElement, List[catalog.Column], List[catalog.Column]]:
+        """
+        Returns:
+            - root node of the plan to produce the updated rows
+            - RowUpdateNode of the plan
+            - Where clause for deleting the current versions of updated rows
+            - list of columns that are getting updated
+            - list of user-visible columns that are being recomputed
+        """
+        assert isinstance(tbl, catalog.TableVersionPath)
+        target = tbl.tbl_version  # the one we need to update
+        sa_key_cols: list[sql.Column] = []
+        key_vals: list[tuple] = []
+        if len(rowids) > 0:
+            sa_key_cols = target.store_tbl.rowid_columns()
+            key_vals = rowids
+        else:
+            pk_cols = target.primary_key_columns()
+            sa_key_cols = [c.sa_col for c in pk_cols]
+            key_vals = [tuple(row[col].val for col in pk_cols) for row in batch]
+
+        # retrieve all stored cols and all target exprs
+        updated_cols = batch[0].keys() - target.primary_key_columns()
+        recomputed_cols = target.get_dependent_columns(updated_cols) if cascade else set()
+        # regardless of cascade, we need to update all indices on any updated column
+        idx_val_cols = target.get_idx_val_columns(updated_cols)
+        recomputed_cols.update(idx_val_cols)
+        # we only need to recompute stored columns (unstored ones are substituted away)
+        recomputed_cols = {c for c in recomputed_cols if c.is_stored}
+        recomputed_base_cols = {col for col in recomputed_cols if col.tbl == target}
+        copied_cols = [
+            col for col in target.cols if col.is_stored and col not in updated_cols and col not in recomputed_base_cols
+        ]
+        select_list = [exprs.ColumnRef(col) for col in copied_cols]
+        select_list.extend([exprs.ColumnRef(col) for col in updated_cols])
+
+        recomputed_exprs = \
+            [c.value_expr.copy().resolve_computed_cols(resolve_cols=recomputed_base_cols) for c in recomputed_base_cols]
+        # the RowUpdateNode updates columns in-place, ie, in the original ColumnRef; no further substitution is needed
+        select_list.extend(recomputed_exprs)
+
+        # ExecNode tree (from bottom to top):
+        # - SqlLookupNode to retrieve the existing rows
+        # - RowUpdateNode to update the retrieved rows
+        # - ExprEvalNode to evaluate the remaining output exprs
+        analyzer = Analyzer(tbl, select_list)
+        row_builder = exprs.RowBuilder(analyzer.all_exprs, [], analyzer.sql_exprs)
+        analyzer.finalize(row_builder)
+        plan = exec.SqlLookupNode(tbl, row_builder, analyzer.sql_exprs, sa_key_cols, key_vals)
+        delete_where_clause = plan.where_clause
+        col_vals = [{col: row[col].val for col in updated_cols} for row in batch]
+        plan = row_update_node = exec.RowUpdateNode(tbl, key_vals, len(rowids) > 0, col_vals, row_builder, plan)
+        if not cls._is_contained_in(analyzer.select_list, analyzer.sql_exprs):
+            # we need an ExprEvalNode to evaluate the remaining output exprs
+            plan = exec.ExprEvalNode(row_builder, analyzer.select_list, analyzer.sql_exprs, input=plan)
+        # update row builder with column information
+        all_base_cols = copied_cols + list(updated_cols) + list(recomputed_base_cols)  # same order as select_list
+        row_builder.substitute_exprs(select_list, remove_duplicates=False)
+        for i, col in enumerate(all_base_cols):
+            plan.row_builder.add_table_column(col, select_list[i].slot_idx)
+
+        ctx = exec.ExecContext(row_builder)
+        # we're returning everything to the user, so we might as well do it in a single batch
+        ctx.batch_size = 0
+        plan.set_ctx(ctx)
+        recomputed_user_cols = [c for c in recomputed_cols if c.name is not None]
+        return (
+            plan, row_update_node, delete_where_clause, list(updated_cols) + recomputed_user_cols, recomputed_user_cols
+        )
+
     @classmethod
     def create_view_update_plan(
         cls, view: catalog.TableVersionPath, recompute_targets: List[catalog.Column]
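
The user-facing counterpart of this plan appears to be the batch-update path in `catalog/table.py` (also touched in this release). A hedged usage sketch, assuming `Table.batch_update` matches rows by primary key:

    import pixeltable as pxt

    t = pxt.create_table(
        'users',
        {'id': pxt.IntType(), 'name': pxt.StringType(nullable=True)},
        primary_key='id',
    )
    t.insert([{'id': 1, 'name': 'Alice'}, {'id': 2, 'name': 'Bob'}])
    # Each dict carries the primary key plus the columns to update; dependent
    # computed columns are recomputed when cascade is enabled.
    t.batch_update([{'id': 1, 'name': 'Alicia'}, {'id': 2, 'name': 'Robert'}])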