pixeltable 0.2.14__py3-none-any.whl → 0.2.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (56)
  1. pixeltable/__version__.py +2 -2
  2. pixeltable/catalog/column.py +6 -3
  3. pixeltable/catalog/dir.py +1 -1
  4. pixeltable/catalog/globals.py +15 -6
  5. pixeltable/catalog/insertable_table.py +23 -8
  6. pixeltable/catalog/named_function.py +1 -1
  7. pixeltable/catalog/path_dict.py +4 -4
  8. pixeltable/catalog/schema_object.py +30 -18
  9. pixeltable/catalog/table.py +87 -104
  10. pixeltable/catalog/table_version.py +35 -24
  11. pixeltable/catalog/table_version_path.py +2 -2
  12. pixeltable/catalog/view.py +15 -8
  13. pixeltable/dataframe.py +56 -56
  14. pixeltable/env.py +10 -9
  15. pixeltable/exec/__init__.py +3 -3
  16. pixeltable/exec/aggregation_node.py +3 -3
  17. pixeltable/exec/expr_eval_node.py +3 -3
  18. pixeltable/exec/in_memory_data_node.py +4 -4
  19. pixeltable/exec/sql_node.py +4 -1
  20. pixeltable/exprs/arithmetic_expr.py +41 -16
  21. pixeltable/exprs/array_slice.py +3 -4
  22. pixeltable/exprs/column_ref.py +20 -4
  23. pixeltable/exprs/comparison.py +11 -6
  24. pixeltable/exprs/data_row.py +3 -0
  25. pixeltable/exprs/expr.py +88 -23
  26. pixeltable/exprs/function_call.py +12 -1
  27. pixeltable/exprs/globals.py +3 -1
  28. pixeltable/exprs/inline_array.py +4 -4
  29. pixeltable/exprs/json_path.py +36 -20
  30. pixeltable/exprs/row_builder.py +4 -4
  31. pixeltable/exprs/rowid_ref.py +1 -1
  32. pixeltable/functions/__init__.py +1 -2
  33. pixeltable/functions/audio.py +32 -0
  34. pixeltable/functions/huggingface.py +4 -4
  35. pixeltable/functions/image.py +1 -1
  36. pixeltable/functions/json.py +46 -0
  37. pixeltable/functions/video.py +5 -1
  38. pixeltable/functions/{eval.py → vision.py} +166 -27
  39. pixeltable/globals.py +57 -28
  40. pixeltable/io/external_store.py +6 -6
  41. pixeltable/io/globals.py +13 -14
  42. pixeltable/io/label_studio.py +6 -6
  43. pixeltable/io/pandas.py +60 -19
  44. pixeltable/io/parquet.py +14 -14
  45. pixeltable/iterators/document.py +7 -7
  46. pixeltable/iterators/video.py +55 -23
  47. pixeltable/plan.py +58 -29
  48. pixeltable/store.py +97 -59
  49. pixeltable/tool/create_test_db_dump.py +17 -11
  50. pixeltable/type_system.py +155 -143
  51. pixeltable/utils/pytorch.py +12 -10
  52. {pixeltable-0.2.14.dist-info → pixeltable-0.2.16.dist-info}/METADATA +10 -10
  53. {pixeltable-0.2.14.dist-info → pixeltable-0.2.16.dist-info}/RECORD +56 -54
  54. {pixeltable-0.2.14.dist-info → pixeltable-0.2.16.dist-info}/LICENSE +0 -0
  55. {pixeltable-0.2.14.dist-info → pixeltable-0.2.16.dist-info}/WHEEL +0 -0
  56. {pixeltable-0.2.14.dist-info → pixeltable-0.2.16.dist-info}/entry_points.txt +0 -0
pixeltable/io/globals.py CHANGED
@@ -1,5 +1,4 @@
  from typing import Any, Literal, Optional, Union
- import urllib.request
 
  import pixeltable as pxt
  import pixeltable.exceptions as excs
@@ -19,7 +18,7 @@ def create_label_studio_project(
      **kwargs: Any
  ) -> SyncStatus:
      """
-     Create a new Label Studio project and link it to the specified `Table`.
+     Create a new Label Studio project and link it to the specified [`Table`][pixeltable.Table].
 
      - A tutorial notebook with fully worked examples can be found here:
        [Using Label Studio for Annotations with Pixeltable](https://pixeltable.readme.io/docs/label-studio)
@@ -34,7 +33,7 @@ def create_label_studio_project(
      then the linked project will have a column named `image`. In addition, the linked project
      will always have a JSON-typed column `annotations` representing the output.
 
-     By default, Pixeltable will link each of these columns to a column of the specified `Table`
+     By default, Pixeltable will link each of these columns to a column of the specified [`Table`][pixeltable.Table]
      with the same name. If any of the data fields are missing, an exception will be raised. If
      the `annotations` column is missing, it will be created. The default names can be overridden
      by specifying an optional `col_mapping`, with Pixeltable column names as keys and Label
@@ -52,7 +51,7 @@ def create_label_studio_project(
      - `pip install boto3` (if using S3 import storage)
 
      Args:
-         t: The Table to link to.
+         t: The table to link to.
          label_config: The Label Studio project configuration, in XML format.
          name: An optional name for the new project in Pixeltable. If specified, must be a valid
              Pixeltable identifier and must not be the name of any other external data store
@@ -73,7 +72,7 @@ def create_label_studio_project(
              The default is `post`.
          col_mapping: An optional mapping of local column names to Label Studio fields.
          sync_immediately: If `True`, immediately perform an initial synchronization by
-             exporting all rows of the `Table` as Label Studio tasks.
+             exporting all rows of the table as Label Studio tasks.
          s3_configuration: If specified, S3 import storage will be configured for the new project. This can only
              be used with `media_import_method='url'`, and if `media_import_method='url'` and any of the media data is
              referenced by `s3://` URLs, then it must be specified in order for such media to display correctly
@@ -148,15 +147,15 @@ def import_rows(
      comment: str = ''
  ) -> Table:
      """
-     Creates a new `Table` from a list of dictionaries. The dictionaries must be of the form
-     `{column_name: value, ...}`. Pixeltable will attempt to infer the schema of the table from the
+     Creates a new base table from a list of dictionaries. The dictionaries must be of the
+     form `{column_name: value, ...}`. Pixeltable will attempt to infer the schema of the table from the
      supplied data, using the most specific type that can represent all the values in a column.
 
      If `schema_overrides` is specified, then for each entry `(column_name, type)` in `schema_overrides`,
      Pixeltable will force the specified column to the specified type (and will not attempt any type inference
      for that column).
 
-     All column types of the new `Table` will be nullable unless explicitly specified as non-nullable in
+     All column types of the new table will be nullable unless explicitly specified as non-nullable in
      `schema_overrides`.
 
      Args:
@@ -169,7 +168,7 @@ def import_rows(
          comment: A comment to attach to the table (see [`create_table()`][pixeltable.create_table]).
 
      Returns:
-         The newly created `Table`.
+         A handle to the newly created [`Table`][pixeltable.Table].
      """
      if schema_overrides is None:
          schema_overrides = {}
@@ -187,11 +186,11 @@ def import_rows(
          elif value is not None:
              # If `key` is not in `schema_overrides`, then we infer its type from the data.
              # The column type will always be nullable by default.
-             col_type = pxt.ColumnType.infer_literal_type(value).copy(nullable=True)
+             col_type = pxt.ColumnType.infer_literal_type(value, nullable=True)
              if col_name not in schema:
                  schema[col_name] = col_type
              else:
-                 supertype = pxt.ColumnType.supertype(schema[col_name], col_type)
+                 supertype = schema[col_name].supertype(col_type)
                  if supertype is None:
                      raise excs.Error(
                          f'Could not infer type of column `{col_name}`; the value in row {n} does not match preceding type {schema[col_name]}: {value!r}\n'
@@ -230,8 +229,8 @@ def import_json(
      **kwargs: Any
  ) -> Table:
      """
-     Creates a new `Table` from a JSON file. This is a convenience method and is equivalent
-     to calling `import_data(table_path, json.loads(file_contents, **kwargs), ...)`, where `file_contents`
+     Creates a new base table from a JSON file. This is a convenience method and is
+     equivalent to calling `import_data(table_path, json.loads(file_contents, **kwargs), ...)`, where `file_contents`
      is the contents of the specified `filepath_or_url`.
 
      Args:
@@ -245,7 +244,7 @@ def import_json(
          kwargs: Additional keyword arguments to pass to `json.loads`.
 
      Returns:
-         The newly created `Table`.
+         A handle to the newly created [`Table`][pixeltable.Table].
      """
      import json
      import urllib.parse
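
The `import_rows` changes above move type inference onto a nullable-by-default path (`infer_literal_type(value, nullable=True)`) and make `supertype` an instance method of the first column type. A usage sketch, assuming `import_rows` is exposed as `pxt.io.import_rows`; the table name and columns are hypothetical:

    import pixeltable as pxt

    rows = [
        {'name': 'widget', 'price': 3.99, 'in_stock': True},
        {'name': 'gadget', 'price': None, 'in_stock': False},  # None is allowed: inferred columns are nullable
    ]
    # The schema is inferred from the values; schema_overrides pins a column to an
    # explicit type instead of relying on inference.
    t = pxt.io.import_rows('inventory', rows, schema_overrides={'price': pxt.FloatType(nullable=True)})

If two rows supply incompatible values for the same column (no common supertype), `import_rows` raises an error identifying the offending row and value.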
pixeltable/io/label_studio.py CHANGED
@@ -105,7 +105,7 @@ class LabelStudioProject(Project):
          return {ANNOTATIONS_COLUMN: pxt.JsonType(nullable=True)}
 
      def sync(self, t: Table, export_data: bool, import_data: bool) -> SyncStatus:
-         _logger.info(f'Syncing Label Studio project "{self.project_title}" with table `{t.name}`'
+         _logger.info(f'Syncing Label Studio project "{self.project_title}" with table `{t._name}`'
                       f' (export: {export_data}, import: {import_data}).')
          # Collect all existing tasks into a dict with entries `rowid: task`
          tasks = {tuple(task['meta']['rowid']): task for task in self.__fetch_all_tasks()}
@@ -396,15 +396,15 @@ class LabelStudioProject(Project):
          updates = [{'_rowid': rowid, local_annotations_col.name: ann} for rowid, ann in annotations.items()]
          if len(updates) > 0:
              _logger.info(
-                 f'Updating table `{t.name}`, column `{local_annotations_col.name}` with {len(updates)} total annotations.'
+                 f'Updating table `{t._name}`, column `{local_annotations_col.name}` with {len(updates)} total annotations.'
              )
              # batch_update currently doesn't propagate from views to base tables. As a workaround, we call
              # batch_update on the actual ancestor table that holds the annotations column.
              # TODO(aaron-siegel): Simplify this once propagation is properly implemented in batch_update
              ancestor = t
              while local_annotations_col not in ancestor._tbl_version.cols:
-                 assert ancestor.base is not None
-                 ancestor = ancestor.base
+                 assert ancestor._base is not None
+                 ancestor = ancestor._base
              update_status = ancestor.batch_update(updates)
              print(f'Updated annotation(s) from {len(updates)} task(s) in {self}.')
              return SyncStatus(pxt_rows_updated=update_status.num_rows, num_excs=update_status.num_excs)
@@ -565,7 +565,7 @@ class LabelStudioProject(Project):
 
          if title is None:
              # `title` defaults to table name
-             title = t.name
+             title = t._name
 
          # Create a column to hold the annotations, if one does not yet exist
          if col_mapping is None or ANNOTATIONS_COLUMN in col_mapping.values():
@@ -573,7 +573,7 @@ class LabelStudioProject(Project):
              local_annotations_column = ANNOTATIONS_COLUMN
          else:
              local_annotations_column = next(k for k, v in col_mapping.items() if v == ANNOTATIONS_COLUMN)
-         if local_annotations_column not in t.column_names():
+         if local_annotations_column not in t._schema.keys():
              t[local_annotations_column] = pxt.JsonType(nullable=True)
 
          resolved_col_mapping = cls.validate_columns(
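
These changes consistently switch internal `Table` accessors to underscore-prefixed names (`t._name`, `t._base`, `t._schema`). For context, a sketch of how this code is reached from the public API documented earlier in the diff; the table name and labeling config are hypothetical, and `create_label_studio_project` is assumed to be importable from `pixeltable.io`:

    import pixeltable as pxt
    from pixeltable.io import create_label_studio_project  # assumed import path

    t = pxt.get_table('images')  # hypothetical table with an `image` column
    label_config = '''
    <View>
      <Image name="image" value="$image"/>
      <Choices name="category" toName="image">
        <Choice value="cat"/>
        <Choice value="dog"/>
      </Choices>
    </View>
    '''
    # Links the project's `image` data field to the table column of the same name and
    # creates a JSON-typed `annotations` column if one doesn't already exist.
    create_label_studio_project(t, label_config, sync_immediately=True)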
pixeltable/io/pandas.py CHANGED
@@ -1,7 +1,9 @@
+ import datetime
  from typing import Any, Optional, Union
 
  import numpy as np
  import pandas as pd
+ import PIL.Image
 
  import pixeltable as pxt
  import pixeltable.exceptions as excs
@@ -13,11 +15,12 @@ def import_pandas(
      primary_key: Optional[Union[str, list[str]]] = None,
      num_retained_versions: int = 10,
      comment: str = ''
- ) -> pxt.catalog.InsertableTable:
-     """Creates a new `Table` from a Pandas `DataFrame`, with the specified name. The schema of the table
-     will be inferred from the `DataFrame`.
+ ) -> pxt.Table:
+     """Creates a new base table from a Pandas
+     [`DataFrame`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html), with the
+     specified name. The schema of the table will be inferred from the DataFrame.
 
-     The column names of the new `Table` will be identical to those in the `DataFrame`, as long as they are valid
+     The column names of the new table will be identical to those in the DataFrame, as long as they are valid
      Pixeltable identifiers. If a column name is not a valid Pixeltable identifier, it will be normalized according to
      the following procedure:
      - first replace any non-alphanumeric characters with underscores;
@@ -31,6 +34,9 @@ def import_pandas(
          name `name` will be given type `type`, instead of being inferred from the `DataFrame`. The keys in
          `schema_overrides` should be the column names of the `DataFrame` (whether or not they are valid
          Pixeltable identifiers).
+ 
+     Returns:
+         A handle to the newly created [`Table`][pixeltable.Table].
      """
      if schema_overrides is None:
          schema_overrides = {}
@@ -52,11 +58,15 @@ def import_csv(
      num_retained_versions: int = 10,
      comment: str = '',
      **kwargs
- ) -> pxt.catalog.InsertableTable:
+ ) -> pxt.Table:
      """
-     Creates a new `Table` from a csv file. This is a convenience method and is equivalent
+     Creates a new base table from a csv file. This is a convenience method and is equivalent
      to calling `import_pandas(table_path, pd.read_csv(filepath_or_buffer, **kwargs), schema=schema)`.
-     See the Pandas documentation for `read_csv` for more details.
+     See the Pandas documentation for [`read_csv`](https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html)
+     for more details.
+ 
+     Returns:
+         A handle to the newly created [`Table`][pixeltable.Table].
      """
      df = pd.read_csv(filepath_or_buffer, **kwargs)
      return import_pandas(tbl_name, df, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
@@ -68,11 +78,15 @@ def import_excel(
      num_retained_versions: int = 10,
      comment: str = '',
      **kwargs
- ) -> pxt.catalog.InsertableTable:
+ ) -> pxt.Table:
      """
-     Creates a new `Table` from an excel (.xlsx) file. This is a convenience method and is equivalent
-     to calling `import_pandas(table_path, pd.read_excel(io, *args, **kwargs), schema=schema)`.
-     See the Pandas documentation for `read_excel` for more details.
+     Creates a new base table from an Excel (.xlsx) file. This is a convenience method and is
+     equivalent to calling `import_pandas(table_path, pd.read_excel(io, *args, **kwargs), schema=schema)`.
+     See the Pandas documentation for [`read_excel`](https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html)
+     for more details.
+ 
+     Returns:
+         A handle to the newly created [`Table`][pixeltable.Table].
      """
      df = pd.read_excel(io, *args, **kwargs)
      return import_pandas(tbl_name, df, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
@@ -103,6 +117,17 @@ def __df_to_pxt_schema(
      if pd_name in schema_overrides:
          pxt_type = schema_overrides[pd_name]
      else:
+         # This complicated-looking condition is necessary because we cannot safely call `pd.isna()` on
+         # general objects, so we need to check for nulls in the specific cases where we might expect them.
+         # isinstance(val, float) will check for NaN values in float columns *as well as* floats appearing
+         # in object columns (where Pandas uses NaN as a general null).
+         # np.issubdtype(pd_dtype, np.datetime64) checks for NaT values specifically in datetime columns.
+         has_na = any(
+             (isinstance(val, float) or np.issubdtype(pd_dtype, np.datetime64)) and pd.isna(val)
+             for val in df[pd_name]
+         )
+         if has_na and pd_name in primary_key:
+             raise excs.Error(f'Primary key column `{pd_name}` cannot contain null values.')
          pxt_type = __np_dtype_to_pxt_type(pd_dtype, df[pd_name], pd_name not in primary_key)
      pxt_name = __normalize_pxt_col_name(pd_name)
      # Ensure that column names are unique by appending a distinguishing suffix
@@ -140,21 +165,37 @@ def __np_dtype_to_pxt_type(np_dtype: np.dtype, data_col: pd.Series, nullable: bo
      """
      if np.issubdtype(np_dtype, np.integer):
          return pxt.IntType(nullable=nullable)
+ 
      if np.issubdtype(np_dtype, np.floating):
          return pxt.FloatType(nullable=nullable)
+ 
      if np.issubdtype(np_dtype, np.bool_):
          return pxt.BoolType(nullable=nullable)
-     if np_dtype == np.object_ or np.issubdtype(np_dtype, np.character):
-         has_nan = any(isinstance(val, float) and np.isnan(val) for val in data_col)
-         if has_nan and not nullable:
-             raise excs.Error(f'Primary key column `{data_col.name}` cannot contain null values.')
+ 
+     if np.issubdtype(np_dtype, np.character):
          return pxt.StringType(nullable=nullable)
+ 
      if np.issubdtype(np_dtype, np.datetime64):
-         has_nat = any(pd.isnull(val) for val in data_col)
-         if has_nat and not nullable:
-             raise excs.Error(f'Primary key column `{data_col.name}` cannot contain null values.')
          return pxt.TimestampType(nullable=nullable)
-     raise excs.Error(f'Unsupported dtype: {np_dtype}')
+ 
+     if np_dtype == np.object_:
+         # The `object_` dtype can mean all sorts of things; see if we can infer the Pixeltable type
+         # based on the actual data in `data_col`.
+         # First drop any null values (they don't contribute to type inference).
+         data_col = data_col.dropna()
+ 
+         if len(data_col) == 0:
+             # No non-null values; default to FloatType (the Pandas type of an all-NaN column)
+             return pxt.FloatType(nullable=nullable)
+ 
+         inferred_type = pxt.ColumnType.infer_common_literal_type(data_col)
+         if inferred_type is None:
+             # Fallback on StringType if everything else fails
+             return pxt.StringType(nullable=nullable)
+         else:
+             return inferred_type.copy(nullable=nullable)
+ 
+     raise excs.Error(f'Could not infer Pixeltable type of column: {data_col.name} (dtype: {np_dtype})')
 
 
  def __df_row_to_pxt_row(row: tuple[Any, ...], schema: dict[str, pxt.ColumnType]) -> dict[str, Any]:
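
The refactored null handling checks for NaN/NaT once, up front, and only rejects nulls in primary key columns, while object-dtype columns now get data-driven type inference with a StringType fallback. A sketch of the caller-visible behavior, assuming `import_pandas` is importable from `pixeltable.io`; the data is hypothetical:

    import numpy as np
    import pandas as pd
    from pixeltable.io import import_pandas  # assumed import path

    df = pd.DataFrame({
        'id': [1, 2, 3],
        'score': [0.5, np.nan, 0.9],   # NaN in a float column -> nullable FloatType
        'label': ['a', None, 'c'],     # object column: nulls are dropped before type inference
    })
    t = import_pandas('scores', df, primary_key='id')  # a null in `id` would raise an Error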
pixeltable/io/parquet.py CHANGED
@@ -19,12 +19,14 @@ from pixeltable.utils.transactional_directory import transactional_directory
  if typing.TYPE_CHECKING:
      import pixeltable as pxt
      import pyarrow as pa
+     from pyarrow import parquet
 
  _logger = logging.getLogger(__name__)
 
 
  def _write_batch(value_batch: Dict[str, deque], schema: pa.Schema, output_path: Path) -> None:
      import pyarrow as pa
+     from pyarrow import parquet
 
      pydict = {}
      for field in schema:
@@ -35,7 +37,7 @@ def _write_batch(value_batch: Dict[str, deque], schema: pa.Schema, output_path:
          pydict[field.name] = value_batch[field.name]
 
      tab = pa.Table.from_pydict(pydict, schema=schema)
-     pa.parquet.write_table(tab, output_path)
+     parquet.write_table(tab, output_path)
 
 
  def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int = 100_000_000) -> None:
@@ -55,23 +57,21 @@ def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int =
      """
      from pixeltable.utils.arrow import to_arrow_schema
 
-     column_names = df.get_column_names()
-     column_types = df.get_column_types()
-     type_dict = {k: v.as_dict() for k, v in zip(column_names, column_types)}
-     arrow_schema = to_arrow_schema(dict(zip(column_names, column_types)))
+     type_dict = {k: v.as_dict() for k, v in df.schema.items()}
+     arrow_schema = to_arrow_schema(df.schema)
 
      # store the changes atomically
      with transactional_directory(dest_path) as temp_path:
          # dump metadata json file so we can inspect what was the source of the parquet file later on.
-         json.dump(df.as_dict(), (temp_path / '.pixeltable.json').open('w'))  # pylint: disable=protected-access
+         json.dump(df.as_dict(), (temp_path / '.pixeltable.json').open('w'))
          json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w'))  # keep type metadata
 
          batch_num = 0
-         current_value_batch: Dict[str, deque] = {k: deque() for k in column_names}
+         current_value_batch: Dict[str, deque] = {k: deque() for k in df.schema.keys()}
          current_byte_estimate = 0
 
-         for data_row in df._exec():  # pylint: disable=protected-access
-             for col_name, col_type, e in zip(column_names, column_types, df._select_list_exprs):  # pylint: disable=protected-access
+         for data_row in df._exec():
+             for (col_name, col_type), e in zip(df.schema.items(), df._select_list_exprs):
                  val = data_row[e.slot_idx]
                  if val is None:
                      current_value_batch[col_name].append(val)
@@ -122,7 +122,7 @@ def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int =
              assert batch_num < 100_000, 'wrote too many parquet files, unclear ordering'
              _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
              batch_num += 1
-             current_value_batch = {k: deque() for k in column_names}
+             current_value_batch = {k: deque() for k in df.schema.keys()}
              current_byte_estimate = 0
 
          _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
@@ -130,11 +130,11 @@ def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int =
 
  def parquet_schema_to_pixeltable_schema(parquet_path: str) -> Dict[str, Optional[ts.ColumnType]]:
      """Generate a default pixeltable schema for the given parquet file. Returns None for unknown types."""
-     import pyarrow as pa
+     from pyarrow import parquet
      from pixeltable.utils.arrow import to_pixeltable_schema
 
      input_path = Path(parquet_path).expanduser()
-     parquet_dataset = pa.parquet.ParquetDataset(input_path)
+     parquet_dataset = parquet.ParquetDataset(input_path)
      return to_pixeltable_schema(parquet_dataset.schema)
 
 
@@ -159,11 +159,11 @@ def import_parquet(
          The newly created table. The table will have loaded the data from the Parquet file(s).
      """
      import pixeltable as pxt
-     import pyarrow as pa
+     from pyarrow import parquet
      from pixeltable.utils.arrow import iter_tuples
 
      input_path = Path(parquet_path).expanduser()
-     parquet_dataset = pa.parquet.ParquetDataset(input_path)
+     parquet_dataset = parquet.ParquetDataset(input_path)
 
      schema = parquet_schema_to_pixeltable_schema(parquet_path)
      if schema_override is None:
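
The `pa.parquet` → `parquet` change fixes a latent bug: `import pyarrow as pa` does not load the `pyarrow.parquet` submodule, so `pa.parquet` raises `AttributeError` unless some other import happens to have loaded it first. A minimal illustration of the pattern the new code uses:

    import pyarrow as pa

    # pa.parquet.write_table(...)  # AttributeError unless pyarrow.parquet was already imported

    from pyarrow import parquet  # explicit submodule import, as the new code does

    tab = pa.table({'x': [1, 2, 3]})
    parquet.write_table(tab, 'part-00000.parquet')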
pixeltable/iterators/document.py CHANGED
@@ -38,7 +38,7 @@ class DocumentSectionMetadata:
      sourceline: Optional[int] = None
      # the stack of headings up to the most recently observed one;
      # eg, if the most recent one was an h2, 'headings' would contain keys 1 and 2, but nothing below that
-     heading: Optional[Dict[int, str]] = None
+     heading: Optional[Dict[str, str]] = None
 
      # pdf-specific metadata
      page: Optional[int] = None
@@ -236,7 +236,7 @@ class DocumentSplitter(ComponentIterator):
          accumulated_text = []  # currently accumulated text
          # accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
 
-         headings: Dict[int, str] = {}  # current state of observed headings (level -> text)
+         headings: Dict[str, str] = {}  # current state of observed headings (level -> text)
          sourceline = 0  # most recently seen sourceline
 
          def update_metadata(el: bs4.Tag) -> None:
@@ -244,12 +244,11 @@ class DocumentSplitter(ComponentIterator):
              nonlocal headings, sourceline
              sourceline = el.sourceline
              if el.name in _HTML_HEADINGS:
-                 level = int(el.name[1])
                  # remove the previously seen lower levels
-                 lower_levels = [l for l in headings if l > level]
+                 lower_levels = [l for l in headings if l > el.name]
                  for l in lower_levels:
                      del headings[l]
-                 headings[level] = el.get_text().strip()
+                 headings[el.name] = el.get_text().strip()
 
          def emit() -> None:
              nonlocal accumulated_text, headings, sourceline
@@ -295,13 +294,14 @@ class DocumentSplitter(ComponentIterator):
          # current state
          accumulated_text = []  # currently accumulated text
          # accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
-         headings: Dict[int, str] = {}  # current state of observed headings (level -> text)
+         headings: Dict[str, str] = {}  # current state of observed headings (level -> text)
 
          def update_headings(heading: Dict) -> None:
              # update current state
              nonlocal headings
              assert 'type' in heading and heading['type'] == 'heading'
-             level = heading['attrs']['level']
+             lint = heading['attrs']['level']
+             level = f'h{lint}'
              text = heading['children'][0]['raw'].strip()
              # remove the previously seen lower levels
              lower_levels = [l for l in headings.keys() if l > level]
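
The heading stack now uses tag names (`'h1'`..`'h6'`) as keys instead of integer levels, for both the HTML and Markdown paths. Since HTML heading levels are single digits, string comparison orders them the same way as ints (`'h3' > 'h2'`), so the logic that pops lower-level headings behaves as before. A standalone sketch of the update step, mirroring the diff's logic:

    headings = {'h1': 'Intro', 'h2': 'Background', 'h3': 'Details'}

    def update_heading(tag_name: str, text: str) -> None:
        # drop previously seen headings that sit below the new one
        for l in [l for l in headings if l > tag_name]:
            del headings[l]
        headings[tag_name] = text

    update_heading('h2', 'Methods')
    print(headings)  # {'h1': 'Intro', 'h2': 'Methods'}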
pixeltable/iterators/video.py CHANGED
@@ -1,57 +1,89 @@
  import logging
  import math
  from pathlib import Path
- from typing import Dict, Any, List, Tuple
+ from typing import Any, Optional
 
- import PIL.Image
  import cv2
+ import PIL.Image
 
  from pixeltable.exceptions import Error
- from pixeltable.type_system import ColumnType, VideoType, ImageType, IntType, FloatType
+ from pixeltable.type_system import ColumnType, FloatType, ImageType, IntType, VideoType
+ 
  from .base import ComponentIterator
 
  _logger = logging.getLogger('pixeltable')
 
 
  class FrameIterator(ComponentIterator):
-     """Iterator over frames of a video.
+     """
+     Iterator over frames of a video. At most one of `fps` or `num_frames` may be specified. If `fps` is specified,
+     then frames will be extracted at the specified rate (frames per second). If `num_frames` is specified, then the
+     exact number of frames will be extracted. If neither is specified, then all frames will be extracted. The first
+     frame of the video will always be extracted, and the remaining frames will be spaced as evenly as possible.
 
      Args:
-         video: URL or file of the video to use for frame extraction
-         fps: number of frames to extract per second of video. This may be a fractional value, such as 0.5.
-             If set to 0.0, then the native framerate of the video will be used (all frames will be extracted).
-             Default: 0.0
+         video: URL or path of the video to use for frame extraction.
+         fps: Number of frames to extract per second of video. This may be a fractional value, such as 0.5.
+             If omitted or set to 0.0, then the native framerate of the video will be used (all frames will be
+             extracted). If `fps` is greater than the frame rate of the video, an error will be raised.
+         num_frames: Exact number of frames to extract. The frames will be spaced as evenly as possible. If
+             `num_frames` is greater than the number of frames in the video, all frames will be extracted.
      """
-     def __init__(self, video: str, *, fps: float = 0.0):
+     def __init__(self, video: str, *, fps: Optional[float] = None, num_frames: Optional[int] = None):
+         if fps is not None and num_frames is not None:
+             raise Error('At most one of `fps` or `num_frames` may be specified')
+ 
          video_path = Path(video)
          assert video_path.exists() and video_path.is_file()
          self.video_path = video_path
-         self.fps = fps
          self.video_reader = cv2.VideoCapture(str(video_path))
+         self.fps = fps
+         self.num_frames = num_frames
          if not self.video_reader.isOpened():
              raise Error(f'Failed to open video: {video}')
+ 
          video_fps = int(self.video_reader.get(cv2.CAP_PROP_FPS))
-         if fps > video_fps:
+         if fps is not None and fps > video_fps:
              raise Error(f'Video {video}: requested fps ({fps}) exceeds that of the video ({video_fps})')
-         self.frame_freq = int(video_fps / fps) if fps > 0 else 1
          num_video_frames = int(self.video_reader.get(cv2.CAP_PROP_FRAME_COUNT))
          if num_video_frames == 0:
              raise Error(f'Video {video}: failed to get number of frames')
-         # ceil: round up to ensure we count frame 0
-         self.num_frames = math.ceil(num_video_frames / self.frame_freq) if fps > 0 else num_video_frames
-         _logger.debug(f'FrameIterator: path={self.video_path} fps={self.fps}')
 
+         if num_frames is not None:
+             # specific number of frames
+             if num_frames > num_video_frames:
+                 # Extract all frames
+                 self.frames_to_extract = range(num_video_frames)
+             else:
+                 spacing = float(num_video_frames) / float(num_frames)
+                 self.frames_to_extract = list(round(i * spacing) for i in range(num_frames))
+                 assert len(self.frames_to_extract) == num_frames
+         else:
+             if fps is None or fps == 0.0:
+                 # Extract all frames
+                 self.frames_to_extract = range(num_video_frames)
+             else:
+                 # Extract frames at the implied frequency
+                 freq = fps / video_fps
+                 n = math.ceil(num_video_frames * freq)  # number of frames to extract
+                 self.frames_to_extract = list(round(i / freq) for i in range(n))
+ 
+         # We need the list of frames as both a list (for set_pos) and a set (for fast lookups when
+         # there are lots of frames)
+         self.frames_set = set(self.frames_to_extract)
+         _logger.debug(f'FrameIterator: path={self.video_path} fps={self.fps} num_frames={self.num_frames}')
          self.next_frame_idx = 0
 
      @classmethod
-     def input_schema(cls) -> Dict[str, ColumnType]:
+     def input_schema(cls) -> dict[str, ColumnType]:
          return {
              'video': VideoType(nullable=False),
-             'fps': FloatType()
+             'fps': FloatType(nullable=True),
+             'num_frames': IntType(nullable=True),
          }
 
      @classmethod
-     def output_schema(cls, *args: Any, **kwargs: Any) -> Tuple[Dict[str, ColumnType], List[str]]:
+     def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ColumnType], list[str]]:
          return {
              'frame_idx': IntType(),
              'pos_msec': FloatType(),
@@ -59,7 +91,9 @@ class FrameIterator(ComponentIterator):
              'frame': ImageType(),
          }, ['frame']
 
-     def __next__(self) -> Dict[str, Any]:
+     def __next__(self) -> dict[str, Any]:
+         # jumping to the target frame here with video_reader.set() is far slower than just
+         # skipping the unwanted frames
          while True:
              pos_msec = self.video_reader.get(cv2.CAP_PROP_POS_MSEC)
              pos_frame = self.video_reader.get(cv2.CAP_PROP_POS_FRAMES)
@@ -69,7 +103,7 @@ class FrameIterator(ComponentIterator):
                  self.video_reader.release()
                  self.video_reader = None
                  raise StopIteration
-             if pos_frame % self.frame_freq == 0:
+             if pos_frame in self.frames_set:
                  img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                  result = {
                      'frame_idx': self.next_frame_idx,
@@ -78,8 +112,6 @@ class FrameIterator(ComponentIterator):
                      'frame': PIL.Image.fromarray(img),
                  }
                  self.next_frame_idx += 1
-                 # frame_freq > 1: jumping to the target frame here with video_reader.set() is far slower than just
-                 # skipping the unwanted frames
                  return result
 
      def close(self) -> None:
@@ -92,5 +124,5 @@ class FrameIterator(ComponentIterator):
          if pos == self.next_frame_idx:
              return
          _logger.debug(f'seeking to frame {pos}')
-         self.video_reader.set(cv2.CAP_PROP_POS_FRAMES, pos * self.frame_freq)
+         self.video_reader.set(cv2.CAP_PROP_POS_FRAMES, self.frames_to_extract[pos])
          self.next_frame_idx = pos
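
The rewrite replaces the old modulo-based `frame_freq` sampling with a precomputed list of target frame indices, which is what makes the new `num_frames` mode possible and keeps frame 0 in every schedule. A standalone sketch of the selection arithmetic (mirroring `__init__` above, not importing the actual class):

    import math
    from typing import Optional

    def frames_to_extract(num_video_frames: int, video_fps: float,
                          fps: Optional[float] = None, num_frames: Optional[int] = None) -> list[int]:
        # evenly spaced frame indices, always including frame 0
        if num_frames is not None:
            if num_frames > num_video_frames:
                return list(range(num_video_frames))
            spacing = num_video_frames / num_frames
            return [round(i * spacing) for i in range(num_frames)]
        if fps is None or fps == 0.0:
            return list(range(num_video_frames))
        freq = fps / video_fps
        n = math.ceil(num_video_frames * freq)
        return [round(i / freq) for i in range(n)]

    print(frames_to_extract(100, 25.0, fps=5.0))       # every 5th frame: [0, 5, 10, ..., 95]
    print(frames_to_extract(100, 25.0, num_frames=4))  # [0, 25, 50, 75]

Precomputing the schedule also explains the dual bookkeeping in `__init__`: the list supports `set_pos` (seek to the i-th extracted frame), while the set makes the per-frame membership test in `__next__` O(1).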