pixeltable 0.2.14__py3-none-any.whl → 0.2.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/column.py +6 -3
- pixeltable/catalog/dir.py +1 -1
- pixeltable/catalog/globals.py +15 -6
- pixeltable/catalog/insertable_table.py +23 -8
- pixeltable/catalog/named_function.py +1 -1
- pixeltable/catalog/path_dict.py +4 -4
- pixeltable/catalog/schema_object.py +30 -18
- pixeltable/catalog/table.py +87 -104
- pixeltable/catalog/table_version.py +35 -24
- pixeltable/catalog/table_version_path.py +2 -2
- pixeltable/catalog/view.py +15 -8
- pixeltable/dataframe.py +56 -56
- pixeltable/env.py +10 -9
- pixeltable/exec/__init__.py +3 -3
- pixeltable/exec/aggregation_node.py +3 -3
- pixeltable/exec/expr_eval_node.py +3 -3
- pixeltable/exec/in_memory_data_node.py +4 -4
- pixeltable/exec/sql_node.py +4 -1
- pixeltable/exprs/arithmetic_expr.py +41 -16
- pixeltable/exprs/array_slice.py +3 -4
- pixeltable/exprs/column_ref.py +20 -4
- pixeltable/exprs/comparison.py +11 -6
- pixeltable/exprs/data_row.py +3 -0
- pixeltable/exprs/expr.py +88 -23
- pixeltable/exprs/function_call.py +12 -1
- pixeltable/exprs/globals.py +3 -1
- pixeltable/exprs/inline_array.py +4 -4
- pixeltable/exprs/json_path.py +36 -20
- pixeltable/exprs/row_builder.py +4 -4
- pixeltable/exprs/rowid_ref.py +1 -1
- pixeltable/functions/__init__.py +1 -2
- pixeltable/functions/audio.py +32 -0
- pixeltable/functions/huggingface.py +4 -4
- pixeltable/functions/image.py +1 -1
- pixeltable/functions/json.py +46 -0
- pixeltable/functions/video.py +5 -1
- pixeltable/functions/{eval.py → vision.py} +166 -27
- pixeltable/globals.py +57 -28
- pixeltable/io/external_store.py +6 -6
- pixeltable/io/globals.py +13 -14
- pixeltable/io/label_studio.py +6 -6
- pixeltable/io/pandas.py +60 -19
- pixeltable/io/parquet.py +14 -14
- pixeltable/iterators/document.py +7 -7
- pixeltable/iterators/video.py +55 -23
- pixeltable/plan.py +58 -29
- pixeltable/store.py +97 -59
- pixeltable/tool/create_test_db_dump.py +17 -11
- pixeltable/type_system.py +155 -143
- pixeltable/utils/pytorch.py +12 -10
- {pixeltable-0.2.14.dist-info → pixeltable-0.2.16.dist-info}/METADATA +10 -10
- {pixeltable-0.2.14.dist-info → pixeltable-0.2.16.dist-info}/RECORD +56 -54
- {pixeltable-0.2.14.dist-info → pixeltable-0.2.16.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.14.dist-info → pixeltable-0.2.16.dist-info}/WHEEL +0 -0
- {pixeltable-0.2.14.dist-info → pixeltable-0.2.16.dist-info}/entry_points.txt +0 -0
pixeltable/io/globals.py
CHANGED
@@ -1,5 +1,4 @@
 from typing import Any, Literal, Optional, Union
-import urllib.request
 
 import pixeltable as pxt
 import pixeltable.exceptions as excs
@@ -19,7 +18,7 @@ def create_label_studio_project(
     **kwargs: Any
 ) -> SyncStatus:
     """
-    Create a new Label Studio project and link it to the specified `Table
+    Create a new Label Studio project and link it to the specified [`Table`][pixeltable.Table].
 
     - A tutorial notebook with fully worked examples can be found here:
     [Using Label Studio for Annotations with Pixeltable](https://pixeltable.readme.io/docs/label-studio)
@@ -34,7 +33,7 @@ def create_label_studio_project(
     then the linked project will have a column named `image`. In addition, the linked project
     will always have a JSON-typed column `annotations` representing the output.
 
-    By default, Pixeltable will link each of these columns to a column of the specified `Table`
+    By default, Pixeltable will link each of these columns to a column of the specified [`Table`][pixeltable.Table]
     with the same name. If any of the data fields are missing, an exception will be raised. If
     the `annotations` column is missing, it will be created. The default names can be overridden
     by specifying an optional `col_mapping`, with Pixeltable column names as keys and Label
@@ -52,7 +51,7 @@ def create_label_studio_project(
     - `pip install boto3` (if using S3 import storage)
 
     Args:
-        t: The
+        t: The table to link to.
         label_config: The Label Studio project configuration, in XML format.
         name: An optional name for the new project in Pixeltable. If specified, must be a valid
            Pixeltable identifier and must not be the name of any other external data store
@@ -73,7 +72,7 @@ def create_label_studio_project(
            The default is `post`.
         col_mapping: An optional mapping of local column names to Label Studio fields.
         sync_immediately: If `True`, immediately perform an initial synchronization by
-            exporting all rows of the
+            exporting all rows of the table as Label Studio tasks.
         s3_configuration: If specified, S3 import storage will be configured for the new project. This can only
            be used with `media_import_method='url'`, and if `media_import_method='url'` and any of the media data is
            referenced by `s3://` URLs, then it must be specified in order for such media to display correctly
@@ -148,15 +147,15 @@ def import_rows(
     comment: str = ''
 ) -> Table:
     """
-    Creates a new
-    `{column_name: value, ...}`. Pixeltable will attempt to infer the schema of the table from the
+    Creates a new base table from a list of dictionaries. The dictionaries must be of the
+    form `{column_name: value, ...}`. Pixeltable will attempt to infer the schema of the table from the
     supplied data, using the most specific type that can represent all the values in a column.
 
     If `schema_overrides` is specified, then for each entry `(column_name, type)` in `schema_overrides`,
     Pixeltable will force the specified column to the specified type (and will not attempt any type inference
     for that column).
 
-    All column types of the new
+    All column types of the new table will be nullable unless explicitly specified as non-nullable in
     `schema_overrides`.
 
     Args:
@@ -169,7 +168,7 @@ def import_rows(
         comment: A comment to attach to the table (see [`create_table()`][pixeltable.create_table]).
 
     Returns:
-
+        A handle to the newly created [`Table`][pixeltable.Table].
     """
     if schema_overrides is None:
         schema_overrides = {}
@@ -187,11 +186,11 @@ def import_rows(
         elif value is not None:
             # If `key` is not in `schema_overrides`, then we infer its type from the data.
             # The column type will always be nullable by default.
-            col_type = pxt.ColumnType.infer_literal_type(value
+            col_type = pxt.ColumnType.infer_literal_type(value, nullable=True)
             if col_name not in schema:
                 schema[col_name] = col_type
             else:
-                supertype =
+                supertype = schema[col_name].supertype(col_type)
                 if supertype is None:
                     raise excs.Error(
                         f'Could not infer type of column `{col_name}`; the value in row {n} does not match preceding type {schema[col_name]}: {value!r}\n'
@@ -230,8 +229,8 @@ def import_json(
     **kwargs: Any
 ) -> Table:
     """
-    Creates a new
-    to calling `import_data(table_path, json.loads(file_contents, **kwargs), ...)`, where `file_contents`
+    Creates a new base table from a JSON file. This is a convenience method and is
+    equivalent to calling `import_data(table_path, json.loads(file_contents, **kwargs), ...)`, where `file_contents`
     is the contents of the specified `filepath_or_url`.
 
     Args:
@@ -245,7 +244,7 @@ def import_json(
         kwargs: Additional keyword arguments to pass to `json.loads`.
 
     Returns:
-
+        A handle to the newly created [`Table`][pixeltable.Table].
     """
     import json
     import urllib.parse
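Reviewer note: the `import_rows()` changes above tighten type inference — literal types are now inferred as nullable, and successive rows are folded together with `supertype()`. A minimal usage sketch, assuming `import_rows` is exposed as `pxt.io.import_rows` in this release and that Int and Float fold to Float; the table name and rows are illustrative, not from the package:

```python
import pixeltable as pxt

# Illustrative rows: `score` mixes int and float literals, plus a None.
# Per the diff, each literal type is inferred as nullable and folded across
# rows via supertype() -- presumably widening Int to Float here. The None in
# the last row is fine because inferred column types are nullable by default.
rows = [
    {'name': 'a', 'score': 1},
    {'name': 'b', 'score': 2.5},
    {'name': 'c', 'score': None},
]
t = pxt.io.import_rows('demo_scores', rows)
```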
pixeltable/io/label_studio.py
CHANGED
@@ -105,7 +105,7 @@ class LabelStudioProject(Project):
         return {ANNOTATIONS_COLUMN: pxt.JsonType(nullable=True)}
 
     def sync(self, t: Table, export_data: bool, import_data: bool) -> SyncStatus:
-        _logger.info(f'Syncing Label Studio project "{self.project_title}" with table `{t.
+        _logger.info(f'Syncing Label Studio project "{self.project_title}" with table `{t._name}`'
                      f' (export: {export_data}, import: {import_data}).')
         # Collect all existing tasks into a dict with entries `rowid: task`
         tasks = {tuple(task['meta']['rowid']): task for task in self.__fetch_all_tasks()}
@@ -396,15 +396,15 @@ class LabelStudioProject(Project):
         updates = [{'_rowid': rowid, local_annotations_col.name: ann} for rowid, ann in annotations.items()]
         if len(updates) > 0:
             _logger.info(
-                f'Updating table `{t.
+                f'Updating table `{t._name}`, column `{local_annotations_col.name}` with {len(updates)} total annotations.'
             )
             # batch_update currently doesn't propagate from views to base tables. As a workaround, we call
             # batch_update on the actual ancestor table that holds the annotations column.
             # TODO(aaron-siegel): Simplify this once propagation is properly implemented in batch_update
             ancestor = t
             while local_annotations_col not in ancestor._tbl_version.cols:
-                assert ancestor.
-                ancestor = ancestor.
+                assert ancestor._base is not None
+                ancestor = ancestor._base
             update_status = ancestor.batch_update(updates)
             print(f'Updated annotation(s) from {len(updates)} task(s) in {self}.')
             return SyncStatus(pxt_rows_updated=update_status.num_rows, num_excs=update_status.num_excs)
@@ -565,7 +565,7 @@ class LabelStudioProject(Project):
 
         if title is None:
             # `title` defaults to table name
-            title = t.
+            title = t._name
 
         # Create a column to hold the annotations, if one does not yet exist
         if col_mapping is None or ANNOTATIONS_COLUMN in col_mapping.values():
@@ -573,7 +573,7 @@ class LabelStudioProject(Project):
             local_annotations_column = ANNOTATIONS_COLUMN
         else:
             local_annotations_column = next(k for k, v in col_mapping.items() if v == ANNOTATIONS_COLUMN)
-        if local_annotations_column not in t.
+        if local_annotations_column not in t._schema.keys():
            t[local_annotations_column] = pxt.JsonType(nullable=True)
 
        resolved_col_mapping = cls.validate_columns(
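Reviewer note: the substantive change in `sync()` is the workaround that walks from a view up to the base table that owns the annotations column before calling `batch_update`. A self-contained sketch of that walk; the `Tbl` class and its `cols`/`base` fields are toy stand-ins for Pixeltable's `_tbl_version.cols` and `_base`, not the library's actual types:

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class Tbl:
    cols: set[str]                 # stand-in for `_tbl_version.cols`
    base: Optional['Tbl'] = None   # stand-in for `_base`

def find_column_owner(t: Tbl, col: str) -> Tbl:
    # Mirror the `while ... ancestor = ancestor._base` loop above: follow the
    # base chain until we reach the table that actually holds `col`.
    ancestor = t
    while col not in ancestor.cols:
        assert ancestor.base is not None  # col must exist somewhere up the chain
        ancestor = ancestor.base
    return ancestor

base = Tbl(cols={'annotations'})
view = Tbl(cols=set(), base=base)
assert find_column_owner(view, 'annotations') is base
```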
pixeltable/io/pandas.py
CHANGED
@@ -1,7 +1,9 @@
+import datetime
 from typing import Any, Optional, Union
 
 import numpy as np
 import pandas as pd
+import PIL.Image
 
 import pixeltable as pxt
 import pixeltable.exceptions as excs
@@ -13,11 +15,12 @@ def import_pandas(
     primary_key: Optional[Union[str, list[str]]] = None,
     num_retained_versions: int = 10,
     comment: str = ''
-) -> pxt.
-    """Creates a new
-
+) -> pxt.Table:
+    """Creates a new base table from a Pandas
+    [`DataFrame`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html), with the
+    specified name. The schema of the table will be inferred from the DataFrame.
 
-    The column names of the new
+    The column names of the new table will be identical to those in the DataFrame, as long as they are valid
     Pixeltable identifiers. If a column name is not a valid Pixeltable identifier, it will be normalized according to
     the following procedure:
     - first replace any non-alphanumeric characters with underscores;
@@ -31,6 +34,9 @@ def import_pandas(
         name `name` will be given type `type`, instead of being inferred from the `DataFrame`. The keys in
         `schema_overrides` should be the column names of the `DataFrame` (whether or not they are valid
         Pixeltable identifiers).
+
+    Returns:
+        A handle to the newly created [`Table`][pixeltable.Table].
     """
     if schema_overrides is None:
         schema_overrides = {}
@@ -52,11 +58,15 @@ def import_csv(
     num_retained_versions: int = 10,
     comment: str = '',
     **kwargs
-) -> pxt.
+) -> pxt.Table:
     """
-    Creates a new
+    Creates a new base table from a csv file. This is a convenience method and is equivalent
     to calling `import_pandas(table_path, pd.read_csv(filepath_or_buffer, **kwargs), schema=schema)`.
-    See the Pandas documentation for `read_csv`
+    See the Pandas documentation for [`read_csv`](https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html)
+    for more details.
+
+    Returns:
+        A handle to the newly created [`Table`][pixeltable.Table].
     """
     df = pd.read_csv(filepath_or_buffer, **kwargs)
     return import_pandas(tbl_name, df, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
@@ -68,11 +78,15 @@ def import_excel(
     num_retained_versions: int = 10,
     comment: str = '',
     **kwargs
-) -> pxt.
+) -> pxt.Table:
     """
-    Creates a new
-    to calling `import_pandas(table_path, pd.read_excel(io, *args, **kwargs), schema=schema)`.
-    See the Pandas documentation for `read_excel`
+    Creates a new base table from an Excel (.xlsx) file. This is a convenience method and is
+    equivalent to calling `import_pandas(table_path, pd.read_excel(io, *args, **kwargs), schema=schema)`.
+    See the Pandas documentation for [`read_excel`](https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html)
+    for more details.
+
+    Returns:
+        A handle to the newly created [`Table`][pixeltable.Table].
     """
     df = pd.read_excel(io, *args, **kwargs)
     return import_pandas(tbl_name, df, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
@@ -103,6 +117,17 @@ def __df_to_pxt_schema(
         if pd_name in schema_overrides:
             pxt_type = schema_overrides[pd_name]
         else:
+            # This complicated-looking condition is necessary because we cannot safely call `pd.isna()` on
+            # general objects, so we need to check for nulls in the specific cases where we might expect them.
+            # isinstance(val, float) will check for NaN values in float columns *as well as* floats appearing
+            # in object columns (where Pandas uses NaN as a general null).
+            # np.issubdtype(pd_dtype, np.datetime64) checks for NaT values specifically in datetime columns.
+            has_na = any(
+                (isinstance(val, float) or np.issubdtype(pd_dtype, np.datetime64)) and pd.isna(val)
+                for val in df[pd_name]
+            )
+            if has_na and pd_name in primary_key:
+                raise excs.Error(f'Primary key column `{pd_name}` cannot contain null values.')
             pxt_type = __np_dtype_to_pxt_type(pd_dtype, df[pd_name], pd_name not in primary_key)
         pxt_name = __normalize_pxt_col_name(pd_name)
         # Ensure that column names are unique by appending a distinguishing suffix
@@ -140,21 +165,37 @@ def __np_dtype_to_pxt_type(np_dtype: np.dtype, data_col: pd.Series, nullable: bo
     """
     if np.issubdtype(np_dtype, np.integer):
         return pxt.IntType(nullable=nullable)
+
     if np.issubdtype(np_dtype, np.floating):
         return pxt.FloatType(nullable=nullable)
+
     if np.issubdtype(np_dtype, np.bool_):
         return pxt.BoolType(nullable=nullable)
-
-
-        if has_nan and not nullable:
-            raise excs.Error(f'Primary key column `{data_col.name}` cannot contain null values.')
+
+    if np.issubdtype(np_dtype, np.character):
         return pxt.StringType(nullable=nullable)
+
     if np.issubdtype(np_dtype, np.datetime64):
-        has_nat = any(pd.isnull(val) for val in data_col)
-        if has_nat and not nullable:
-            raise excs.Error(f'Primary key column `{data_col.name}` cannot contain null values.')
         return pxt.TimestampType(nullable=nullable)
-
+
+    if np_dtype == np.object_:
+        # The `object_` dtype can mean all sorts of things; see if we can infer the Pixeltable type
+        # based on the actual data in `data_col`.
+        # First drop any null values (they don't contribute to type inference).
+        data_col = data_col.dropna()
+
+        if len(data_col) == 0:
+            # No non-null values; default to FloatType (the Pandas type of an all-NaN column)
+            return pxt.FloatType(nullable=nullable)
+
+        inferred_type = pxt.ColumnType.infer_common_literal_type(data_col)
+        if inferred_type is None:
+            # Fallback on StringType if everything else fails
+            return pxt.StringType(nullable=nullable)
+        else:
+            return inferred_type.copy(nullable=nullable)
+
+    raise excs.Error(f'Could not infer Pixeltable type of column: {data_col.name} (dtype: {np_dtype})')
 
 
 def __df_row_to_pxt_row(row: tuple[Any, ...], schema: dict[str, pxt.ColumnType]) -> dict[str, Any]:
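Reviewer note: the new null check in `__df_to_pxt_schema` is the subtle part of this diff. A self-contained sketch of the same condition (the helper name `column_has_na` is ours, not the library's):

```python
import numpy as np
import pandas as pd

def column_has_na(col: pd.Series) -> bool:
    # pd.isna() is not safe to call on arbitrary objects, so restrict the check
    # to the two cases where nulls are expected: floats (NaN, which Pandas also
    # uses as the generic null in object columns) and datetime64 columns (NaT).
    return any(
        (isinstance(val, float) or np.issubdtype(col.dtype, np.datetime64)) and pd.isna(val)
        for val in col
    )

df = pd.DataFrame({'a': [1.0, float('nan')], 'b': ['x', 'y']})
assert column_has_na(df['a']) and not column_has_na(df['b'])
```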
pixeltable/io/parquet.py
CHANGED
@@ -19,12 +19,14 @@ from pixeltable.utils.transactional_directory import transactional_directory
 if typing.TYPE_CHECKING:
     import pixeltable as pxt
     import pyarrow as pa
+    from pyarrow import parquet
 
 _logger = logging.getLogger(__name__)
 
 
 def _write_batch(value_batch: Dict[str, deque], schema: pa.Schema, output_path: Path) -> None:
     import pyarrow as pa
+    from pyarrow import parquet
 
     pydict = {}
     for field in schema:
@@ -35,7 +37,7 @@ def _write_batch(value_batch: Dict[str, deque], schema: pa.Schema, output_path:
         pydict[field.name] = value_batch[field.name]
 
     tab = pa.Table.from_pydict(pydict, schema=schema)
-
+    parquet.write_table(tab, output_path)
 
 
 def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int = 100_000_000) -> None:
@@ -55,23 +57,21 @@ def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int =
     """
     from pixeltable.utils.arrow import to_arrow_schema
 
-
-
-    type_dict = {k: v.as_dict() for k, v in zip(column_names, column_types)}
-    arrow_schema = to_arrow_schema(dict(zip(column_names, column_types)))
+    type_dict = {k: v.as_dict() for k, v in df.schema.items()}
+    arrow_schema = to_arrow_schema(df.schema)
 
     # store the changes atomically
     with transactional_directory(dest_path) as temp_path:
         # dump metadata json file so we can inspect what was the source of the parquet file later on.
-        json.dump(df.as_dict(), (temp_path / '.pixeltable.json').open('w'))
+        json.dump(df.as_dict(), (temp_path / '.pixeltable.json').open('w'))
         json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w'))  # keep type metadata
 
         batch_num = 0
-        current_value_batch: Dict[str, deque] = {k: deque() for k in
+        current_value_batch: Dict[str, deque] = {k: deque() for k in df.schema.keys()}
         current_byte_estimate = 0
 
-        for data_row in df._exec():
-            for col_name, col_type, e in zip(
+        for data_row in df._exec():
+            for (col_name, col_type), e in zip(df.schema.items(), df._select_list_exprs):
                 val = data_row[e.slot_idx]
                 if val is None:
                     current_value_batch[col_name].append(val)
@@ -122,7 +122,7 @@ def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int =
             assert batch_num < 100_000, 'wrote too many parquet files, unclear ordering'
             _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
             batch_num += 1
-            current_value_batch = {k: deque() for k in
+            current_value_batch = {k: deque() for k in df.schema.keys()}
             current_byte_estimate = 0
 
     _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
@@ -130,11 +130,11 @@ def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int =
 
 def parquet_schema_to_pixeltable_schema(parquet_path: str) -> Dict[str, Optional[ts.ColumnType]]:
     """Generate a default pixeltable schema for the given parquet file. Returns None for unknown types."""
-
+    from pyarrow import parquet
     from pixeltable.utils.arrow import to_pixeltable_schema
 
     input_path = Path(parquet_path).expanduser()
-    parquet_dataset =
+    parquet_dataset = parquet.ParquetDataset(input_path)
     return to_pixeltable_schema(parquet_dataset.schema)
 
 
@@ -159,11 +159,11 @@ def import_parquet(
         The newly created table. The table will have loaded the data from the Parquet file(s).
     """
     import pixeltable as pxt
-
+    from pyarrow import parquet
     from pixeltable.utils.arrow import iter_tuples
 
     input_path = Path(parquet_path).expanduser()
-    parquet_dataset =
+    parquet_dataset = parquet.ParquetDataset(input_path)
 
     schema = parquet_schema_to_pixeltable_schema(parquet_path)
     if schema_override is None:
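Reviewer note: the parquet changes mostly amount to importing `pyarrow.parquet` explicitly (including under `typing.TYPE_CHECKING`) and deriving metadata from `df.schema`. A minimal sketch of the two `pyarrow.parquet` calls the diff relies on, with toy data and a temp directory:

```python
import pathlib
import tempfile

import pyarrow as pa
from pyarrow import parquet

# Write one batch the way _write_batch() now does, then open the result as a
# dataset the way parquet_schema_to_pixeltable_schema() does.
tmp_dir = pathlib.Path(tempfile.mkdtemp())
tab = pa.Table.from_pydict({'id': [1, 2], 'name': ['a', 'b']})
parquet.write_table(tab, tmp_dir / 'part-00000.parquet')

dataset = parquet.ParquetDataset(tmp_dir)
print(dataset.schema)  # arrow schema; pixeltable converts it via to_pixeltable_schema()
```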
pixeltable/iterators/document.py
CHANGED
@@ -38,7 +38,7 @@ class DocumentSectionMetadata:
     sourceline: Optional[int] = None
     # the stack of headings up to the most recently observed one;
     # eg, if the most recent one was an h2, 'headings' would contain keys 1 and 2, but nothing below that
-    heading: Optional[Dict[
+    heading: Optional[Dict[str, str]] = None
 
     # pdf-specific metadata
     page: Optional[int] = None
@@ -236,7 +236,7 @@ class DocumentSplitter(ComponentIterator):
         accumulated_text = []  # currently accumulated text
         # accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
 
-        headings: Dict[
+        headings: Dict[str, str] = {}  # current state of observed headings (level -> text)
         sourceline = 0  # most recently seen sourceline
 
         def update_metadata(el: bs4.Tag) -> None:
@@ -244,12 +244,11 @@ class DocumentSplitter(ComponentIterator):
             nonlocal headings, sourceline
             sourceline = el.sourceline
             if el.name in _HTML_HEADINGS:
-                level = int(el.name[1])
                 # remove the previously seen lower levels
-                lower_levels = [l for l in headings if l >
+                lower_levels = [l for l in headings if l > el.name]
                 for l in lower_levels:
                     del headings[l]
-                headings[
+                headings[el.name] = el.get_text().strip()
 
         def emit() -> None:
             nonlocal accumulated_text, headings, sourceline
@@ -295,13 +294,14 @@ class DocumentSplitter(ComponentIterator):
         # current state
         accumulated_text = []  # currently accumulated text
         # accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
-        headings: Dict[
+        headings: Dict[str, str] = {}  # current state of observed headings (level -> text)
 
         def update_headings(heading: Dict) -> None:
             # update current state
             nonlocal headings
             assert 'type' in heading and heading['type'] == 'heading'
-
+            lint = heading['attrs']['level']
+            level = f'h{lint}'
             text = heading['children'][0]['raw'].strip()
             # remove the previously seen lower levels
             lower_levels = [l for l in headings.keys() if l > level]
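Reviewer note: the heading bookkeeping now keys on tag names (`'h1'`..`'h6'`) instead of integer levels, so the `l > el.name` comparison orders levels lexicographically, which is correct for single-digit heading tags. A self-contained sketch of that update step (function and variable names are ours):

```python
def update_headings(headings: dict[str, str], tag: str, text: str) -> None:
    # Drop any previously seen deeper levels, then record the new heading.
    # 'h3' > 'h2' holds as a string comparison, which is why tag-name keys work.
    for lower in [k for k in headings if k > tag]:
        del headings[lower]
    headings[tag] = text

state: dict[str, str] = {}
update_headings(state, 'h1', 'Intro')
update_headings(state, 'h2', 'Background')
update_headings(state, 'h1', 'Methods')  # resets the h2 entry
assert state == {'h1': 'Methods'}
```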
pixeltable/iterators/video.py
CHANGED
@@ -1,57 +1,89 @@
 import logging
 import math
 from pathlib import Path
-from typing import
+from typing import Any, Optional
 
-import PIL.Image
 import cv2
+import PIL.Image
 
 from pixeltable.exceptions import Error
-from pixeltable.type_system import ColumnType,
+from pixeltable.type_system import ColumnType, FloatType, ImageType, IntType, VideoType
+
 from .base import ComponentIterator
 
 _logger = logging.getLogger('pixeltable')
 
 
 class FrameIterator(ComponentIterator):
-    """
+    """
+    Iterator over frames of a video. At most one of `fps` or `num_frames` may be specified. If `fps` is specified,
+    then frames will be extracted at the specified rate (frames per second). If `num_frames` is specified, then the
+    exact number of frames will be extracted. If neither is specified, then all frames will be extracted. The first
+    frame of the video will always be extracted, and the remaining frames will be spaced as evenly as possible.
 
     Args:
-        video: URL or
-        fps:
-            If set to 0.0, then the native framerate of the video will be used (all frames will be
-
+        video: URL or path of the video to use for frame extraction.
+        fps: Number of frames to extract per second of video. This may be a fractional value, such as 0.5.
+            If omitted or set to 0.0, then the native framerate of the video will be used (all frames will be
+            extracted). If `fps` is greater than the frame rate of the video, an error will be raised.
+        num_frames: Exact number of frames to extract. The frames will be spaced as evenly as possible. If
+            `num_frames` is greater than the number of frames in the video, all frames will be extracted.
     """
-    def __init__(self, video: str, *, fps: float =
+    def __init__(self, video: str, *, fps: Optional[float] = None, num_frames: Optional[int] = None):
+        if fps is not None and num_frames is not None:
+            raise Error('At most one of `fps` or `num_frames` may be specified')
+
         video_path = Path(video)
         assert video_path.exists() and video_path.is_file()
         self.video_path = video_path
-        self.fps = fps
         self.video_reader = cv2.VideoCapture(str(video_path))
+        self.fps = fps
+        self.num_frames = num_frames
         if not self.video_reader.isOpened():
             raise Error(f'Failed to open video: {video}')
+
         video_fps = int(self.video_reader.get(cv2.CAP_PROP_FPS))
-        if fps > video_fps:
+        if fps is not None and fps > video_fps:
             raise Error(f'Video {video}: requested fps ({fps}) exceeds that of the video ({video_fps})')
-        self.frame_freq = int(video_fps / fps) if fps > 0 else 1
         num_video_frames = int(self.video_reader.get(cv2.CAP_PROP_FRAME_COUNT))
         if num_video_frames == 0:
             raise Error(f'Video {video}: failed to get number of frames')
-        # ceil: round up to ensure we count frame 0
-        self.num_frames = math.ceil(num_video_frames / self.frame_freq) if fps > 0 else num_video_frames
-        _logger.debug(f'FrameIterator: path={self.video_path} fps={self.fps}')
 
+        if num_frames is not None:
+            # specific number of frames
+            if num_frames > num_video_frames:
+                # Extract all frames
+                self.frames_to_extract = range(num_video_frames)
+            else:
+                spacing = float(num_video_frames) / float(num_frames)
+                self.frames_to_extract = list(round(i * spacing) for i in range(num_frames))
+                assert len(self.frames_to_extract) == num_frames
+        else:
+            if fps is None or fps == 0.0:
+                # Extract all frames
+                self.frames_to_extract = range(num_video_frames)
+            else:
+                # Extract frames at the implied frequency
+                freq = fps / video_fps
+                n = math.ceil(num_video_frames * freq)  # number of frames to extract
+                self.frames_to_extract = list(round(i / freq) for i in range(n))
+
+        # We need the list of frames as both a list (for set_pos) and a set (for fast lookups when
+        # there are lots of frames)
+        self.frames_set = set(self.frames_to_extract)
+        _logger.debug(f'FrameIterator: path={self.video_path} fps={self.fps} num_frames={self.num_frames}')
         self.next_frame_idx = 0
 
     @classmethod
-    def input_schema(cls) ->
+    def input_schema(cls) -> dict[str, ColumnType]:
         return {
             'video': VideoType(nullable=False),
-            'fps': FloatType()
+            'fps': FloatType(nullable=True),
+            'num_frames': IntType(nullable=True),
         }
 
     @classmethod
-    def output_schema(cls, *args: Any, **kwargs: Any) ->
+    def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ColumnType], list[str]]:
         return {
             'frame_idx': IntType(),
             'pos_msec': FloatType(),
@@ -59,7 +91,9 @@ class FrameIterator(ComponentIterator):
             'frame': ImageType(),
         }, ['frame']
 
-    def __next__(self) ->
+    def __next__(self) -> dict[str, Any]:
+        # jumping to the target frame here with video_reader.set() is far slower than just
+        # skipping the unwanted frames
         while True:
             pos_msec = self.video_reader.get(cv2.CAP_PROP_POS_MSEC)
             pos_frame = self.video_reader.get(cv2.CAP_PROP_POS_FRAMES)
@@ -69,7 +103,7 @@ class FrameIterator(ComponentIterator):
                 self.video_reader.release()
                 self.video_reader = None
                 raise StopIteration
-            if pos_frame
+            if pos_frame in self.frames_set:
                 img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                 result = {
                     'frame_idx': self.next_frame_idx,
@@ -78,8 +112,6 @@ class FrameIterator(ComponentIterator):
                     'frame': PIL.Image.fromarray(img),
                 }
                 self.next_frame_idx += 1
-                # frame_freq > 1: jumping to the target frame here with video_reader.set() is far slower than just
-                # skipping the unwanted frames
                 return result
 
     def close(self) -> None:
@@ -92,5 +124,5 @@ class FrameIterator(ComponentIterator):
         if pos == self.next_frame_idx:
             return
         _logger.debug(f'seeking to frame {pos}')
-        self.video_reader.set(cv2.CAP_PROP_POS_FRAMES,
+        self.video_reader.set(cv2.CAP_PROP_POS_FRAMES, self.frames_to_extract[pos])
         self.next_frame_idx = pos