pixeltable 0.2.15__py3-none-any.whl → 0.2.16__py3-none-any.whl
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/column.py +3 -0
- pixeltable/catalog/dir.py +1 -1
- pixeltable/catalog/globals.py +15 -6
- pixeltable/catalog/insertable_table.py +23 -8
- pixeltable/catalog/named_function.py +1 -1
- pixeltable/catalog/path_dict.py +4 -4
- pixeltable/catalog/schema_object.py +30 -18
- pixeltable/catalog/table.py +84 -99
- pixeltable/catalog/table_version.py +35 -24
- pixeltable/catalog/table_version_path.py +2 -2
- pixeltable/catalog/view.py +15 -8
- pixeltable/dataframe.py +56 -56
- pixeltable/env.py +6 -5
- pixeltable/exec/__init__.py +3 -3
- pixeltable/exec/aggregation_node.py +3 -3
- pixeltable/exec/expr_eval_node.py +3 -3
- pixeltable/exec/in_memory_data_node.py +4 -4
- pixeltable/exec/sql_node.py +4 -1
- pixeltable/exprs/array_slice.py +3 -4
- pixeltable/exprs/column_ref.py +20 -4
- pixeltable/exprs/comparison.py +11 -6
- pixeltable/exprs/data_row.py +3 -0
- pixeltable/exprs/expr.py +51 -23
- pixeltable/exprs/function_call.py +8 -1
- pixeltable/exprs/inline_array.py +2 -2
- pixeltable/exprs/json_path.py +36 -20
- pixeltable/exprs/row_builder.py +4 -4
- pixeltable/exprs/rowid_ref.py +1 -1
- pixeltable/functions/__init__.py +1 -2
- pixeltable/functions/audio.py +32 -0
- pixeltable/functions/huggingface.py +4 -4
- pixeltable/functions/image.py +1 -1
- pixeltable/functions/video.py +5 -1
- pixeltable/functions/vision.py +2 -6
- pixeltable/globals.py +57 -28
- pixeltable/io/external_store.py +4 -4
- pixeltable/io/globals.py +12 -13
- pixeltable/io/label_studio.py +6 -6
- pixeltable/io/pandas.py +27 -12
- pixeltable/io/parquet.py +14 -14
- pixeltable/iterators/document.py +7 -7
- pixeltable/plan.py +58 -29
- pixeltable/store.py +32 -31
- pixeltable/tool/create_test_db_dump.py +12 -6
- pixeltable/type_system.py +89 -97
- pixeltable/utils/pytorch.py +12 -10
- {pixeltable-0.2.15.dist-info → pixeltable-0.2.16.dist-info}/METADATA +10 -10
- {pixeltable-0.2.15.dist-info → pixeltable-0.2.16.dist-info}/RECORD +52 -51
- {pixeltable-0.2.15.dist-info → pixeltable-0.2.16.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.15.dist-info → pixeltable-0.2.16.dist-info}/WHEEL +0 -0
- {pixeltable-0.2.15.dist-info → pixeltable-0.2.16.dist-info}/entry_points.txt +0 -0
pixeltable/globals.py CHANGED

@@ -1,15 +1,18 @@
 import dataclasses
 import logging
 from typing import Any, Optional, Union
+from uuid import UUID

 import pandas as pd
 import sqlalchemy as sql
+from pandas.io.formats.style import Styler
 from sqlalchemy.util.preloaded import orm

 import pixeltable.exceptions as excs
 import pixeltable.exprs as exprs
-from pixeltable import catalog, func
+from pixeltable import DataFrame, catalog, func
 from pixeltable.catalog import Catalog
+from pixeltable.dataframe import DataFrameResultSet
 from pixeltable.env import Env
 from pixeltable.iterators import ComponentIterator
 from pixeltable.metadata import schema
@@ -24,21 +27,25 @@ def init() -> None:

 def create_table(
     path_str: str,
-    schema: dict[str, Any],
+    schema_or_df: Union[dict[str, Any], DataFrame],
     *,
     primary_key: Optional[Union[str, list[str]]] = None,
     num_retained_versions: int = 10,
     comment: str = '',
-) -> catalog.InsertableTable:
-    """Create a new table.
+) -> catalog.Table:
+    """Create a new base table.

     Args:
         path_str: Path to the table.
-        schema: dictionary mapping column names to column types.
+        schema_or_df: Either a dictionary that maps column names to column types, or a
+            [`DataFrame`][pixeltable.DataFrame] whose contents and schema will be used to pre-populate the table.
+        primary_key: An optional column name or list of column names to use as the primary key(s) of the
+            table.
         num_retained_versions: Number of versions of the table to retain.
+        comment: An optional comment; its meaning is user-defined.

     Returns:
-        The newly created table.
+        A handle to the newly created [`Table`][pixeltable.Table].

     Raises:
         Error: if the path already exists or is invalid.
@@ -46,12 +53,27 @@ def create_table(
     Examples:
         Create a table with an int and a string column:

-        >>> table = cl.create_table('my_table', schema={'col1': IntType(), 'col2': StringType()})
+        >>> table = pxt.create_table('my_table', schema={'col1': IntType(), 'col2': StringType()})
+
+        Create a table from a select statement over an existing table `tbl`:
+
+        >>> table = pxt.create_table('my_table', tbl.where(tbl.col1 < 10).select(tbl.col2))
     """
     path = catalog.Path(path_str)
     Catalog.get().paths.check_is_valid(path, expected=None)
     dir = Catalog.get().paths[path.parent]

+    df: Optional[DataFrame] = None
+    if isinstance(schema_or_df, dict):
+        schema = schema_or_df
+    elif isinstance(schema_or_df, DataFrame):
+        df = schema_or_df
+        schema = df.schema
+    elif isinstance(schema_or_df, DataFrameResultSet):
+        raise excs.Error('`schema_or_df` must be either a schema dictionary or a Pixeltable DataFrame. (Is there an extraneous call to `collect()`?)')
+    else:
+        raise excs.Error('`schema_or_df` must be either a schema dictionary or a Pixeltable DataFrame.')
+
     if len(schema) == 0:
         raise excs.Error(f'Table schema is empty: `{path_str}`')

@@ -63,15 +85,17 @@ def create_table(
     if not isinstance(primary_key, list) or not all(isinstance(pk, str) for pk in primary_key):
         raise excs.Error('primary_key must be a single column name or a list of column names')

-    tbl = catalog.InsertableTable.create(
+    tbl = catalog.InsertableTable._create(
         dir._id,
         path.name,
         schema,
+        df,
         primary_key=primary_key,
         num_retained_versions=num_retained_versions,
         comment=comment,
     )
     Catalog.get().paths[path] = tbl
+
     _logger.info(f'Created table `{path_str}`.')
     return tbl

@@ -87,25 +111,28 @@ def create_view(
     num_retained_versions: int = 10,
     comment: str = '',
     ignore_errors: bool = False,
-) -> catalog.View:
-    """Create a view.
+) -> Optional[catalog.Table]:
+    """Create a view of an existing table object (which itself can be a view or a snapshot or a base table).

     Args:
         path_str: Path to the view.
-        base: Table (i.e., table or view or snapshot) or DataFrame to base the view on.
+        base: [`Table`][pixeltable.Table] (i.e., table or view or snapshot) or [`DataFrame`][pixeltable.DataFrame] to
+            base the view on.
         schema: dictionary mapping column names to column types, value expressions, or to column specifications.
         filter: predicate to filter rows of the base table.
         is_snapshot: Whether the view is a snapshot.
         iterator: The iterator to use for this view. If specified, then this view will be a one-to-many view of
             the base table.
         num_retained_versions: Number of versions of the view to retain.
+        comment: Optional comment for the view.
         ignore_errors: if True, fail silently if the path already exists or is invalid.

     Returns:
-        The newly created view.
+        A handle to the [`Table`][pixeltable.Table] representing the newly created view. If the path already
+        exists or is invalid and `ignore_errors=True`, returns `None`.

     Raises:
-        Error: if the path already exists or is invalid
+        Error: if the path already exists or is invalid and `ignore_errors=False`.

     Examples:
         Create a view with an additional int and a string column and a filter:
@@ -140,7 +167,7 @@ def create_view(
         Catalog.get().paths.check_is_valid(path, expected=None)
     except Exception as e:
         if ignore_errors:
-            return
+            return None
         else:
             raise e
     dir = Catalog.get().paths[path.parent]
@@ -152,7 +179,7 @@ def create_view(
     else:
         iterator_class, iterator_args = iterator

-    view = catalog.View.create(
+    view = catalog.View._create(
         dir._id,
         path.name,
         base=tbl_version_path,
@@ -170,16 +197,16 @@


 def get_table(path: str) -> catalog.Table:
-    """Get a handle to a table.
+    """Get a handle to an existing table or view or snapshot.

     Args:
         path: Path to the table.

     Returns:
-        A handle to the table.
+        A handle to the [`Table`][pixeltable.Table].

     Raises:
-        Error: If the path does not exist or does not designate a table.
+        Error: If the path does not exist or does not designate a table object.

     Examples:
         Get handle for a table in the top-level directory:
@@ -197,6 +224,7 @@ def get_table(path: str) -> catalog.Table:
     p = catalog.Path(path)
     Catalog.get().paths.check_is_valid(p, expected=catalog.Table)
     obj = Catalog.get().paths[p]
+    assert isinstance(obj, catalog.Table)
     return obj


@@ -230,15 +258,15 @@ def move(path: str, new_path: str) -> None:


 def drop_table(path: str, force: bool = False, ignore_errors: bool = False) -> None:
-    """Drop a table.
+    """Drop a table or view or snapshot.

     Args:
-        path: Path to the table.
+        path: Path to the [`Table`][pixeltable.Table].
         force: If `True`, will also drop all views or sub-views of this table.
         ignore_errors: Whether to ignore errors if the table does not exist.

     Raises:
-        Error: If the path does not exist or does not designate a table and ignore_errors is False.
+        Error: If the path does not exist or does not designate a table object and ignore_errors is False.

     Examples:
         >>> cl.drop_table('my_table')
@@ -256,7 +284,7 @@ def drop_table(path: str, force: bool = False, ignore_errors: bool = False) -> None:
     tbl = cat.paths[path_obj]
     assert isinstance(tbl, catalog.Table)
     if len(cat.tbl_dependents[tbl._id]) > 0:
-        dependent_paths = [dep.path for dep in cat.tbl_dependents[tbl._id]]
+        dependent_paths = [dep._path for dep in cat.tbl_dependents[tbl._id]]
         if force:
             for dependent_path in dependent_paths:
                 drop_table(dependent_path, force=True)
@@ -268,14 +296,14 @@ def drop_table(path: str, force: bool = False, ignore_errors: bool = False) -> None:


 def list_tables(dir_path: str = '', recursive: bool = True) -> list[str]:
-    """List the tables in a directory.
+    """List the [`Table`][pixeltable.Table]s in a directory.

     Args:
         dir_path: Path to the directory. Defaults to the root directory.
         recursive: Whether to list tables in subdirectories as well.

     Returns:
-        A list of table paths.
+        A list of [`Table`][pixeltable.Table] paths.

     Raises:
         Error: If the path does not exist or does not designate a directory.
@@ -297,7 +325,7 @@ def list_tables(dir_path: str = '', recursive: bool = True) -> list[str]:
     return [str(p) for p in Catalog.get().paths.get_children(path, child_type=catalog.Table, recursive=recursive)]


-def create_dir(path_str: str, ignore_errors: bool = False) -> catalog.Dir:
+def create_dir(path_str: str, ignore_errors: bool = False) -> Optional[catalog.Dir]:
     """Create a directory.

     Args:
@@ -325,6 +353,7 @@ def create_dir(path_str: str, ignore_errors: bool = False) -> catalog.Dir:
             session.add(dir_record)
             session.flush()
             assert dir_record.id is not None
+            assert isinstance(dir_record.id, UUID)
             dir = catalog.Dir(dir_record.id, parent._id, path.name)
             Catalog.get().paths[path] = dir
             session.commit()
@@ -333,7 +362,7 @@ def create_dir(path_str: str, ignore_errors: bool = False) -> catalog.Dir:
             return dir
     except excs.Error as e:
         if ignore_errors:
-            return
+            return None
         else:
             raise e

@@ -415,7 +444,7 @@ def list_dirs(path_str: str = '', recursive: bool = True) -> list[str]:
     return [str(p) for p in Catalog.get().paths.get_children(path, child_type=catalog.Dir, recursive=recursive)]


-def list_functions() -> pd.DataFrame:
+def list_functions() -> Styler:
     """Returns information about all registered functions.

     Returns:
@@ -436,7 +465,7 @@ def list_functions() -> pd.DataFrame:
             'Return Type': [str(f.signature.get_return_type()) for f in functions],
         }
     )
-    pd_df = pd_df.style.set_properties(**{'text-align': 'left'}).set_table_styles(
+    pd_df = pd_df.style.set_properties(None, **{'text-align': 'left'}).set_table_styles(
         [dict(selector='th', props=[('text-align', 'center')])]
     )  # center-align headings
     return pd_df.hide(axis='index')
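The substantive change above is that `create_table` now dispatches on the type of its second argument (`schema_or_df`), accepting either a schema dictionary or a `DataFrame` whose schema and rows seed the new table. A minimal usage sketch of both call forms; the table names and columns are illustrative, not taken from the diff:

    import pixeltable as pxt

    # Form 1: explicit schema dictionary (pre-0.2.16 behavior).
    tbl = pxt.create_table('demo', {'col1': pxt.IntType(), 'col2': pxt.StringType()})
    tbl.insert([{'col1': 1, 'col2': 'a'}, {'col1': 20, 'col2': 'b'}])

    # Form 2 (new): pass a DataFrame; its schema and contents pre-populate the table.
    filtered = pxt.create_table('demo_filtered', tbl.where(tbl.col1 < 10).select(tbl.col2))

    # The new DataFrameResultSet branch catches a common slip: collect() materializes
    # results instead of returning a DataFrame, so the following would raise an Error
    # with a hint about the extraneous collect():
    #     pxt.create_table('bad', tbl.where(tbl.col1 < 10).collect())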
pixeltable/io/external_store.py CHANGED

@@ -217,17 +217,17 @@ class Project(ExternalStore, abc.ABC):
         resolved_col_mapping: dict[Column, str] = {}

         # Validate names
-        t_cols = table.column_names()
+        t_cols = set(table._schema.keys())
         for t_col, ext_col in col_mapping.items():
             if t_col not in t_cols:
                 if is_user_specified_col_mapping:
                     raise excs.Error(
-                        f'Column name `{t_col}` appears as a key in `col_mapping`, but Table `{table.name}` '
+                        f'Column name `{t_col}` appears as a key in `col_mapping`, but Table `{table._name}` '
                         'contains no such column.'
                     )
                 else:
                     raise excs.Error(
-                        f'Column `{t_col}` does not exist in Table `{table.name}`. Either add a column `{t_col}`, '
+                        f'Column `{t_col}` does not exist in Table `{table._name}`. Either add a column `{t_col}`, '
                         f'or specify a `col_mapping` to associate a different column with the external field `{ext_col}`.'
                     )
             if ext_col not in export_cols and ext_col not in import_cols:
@@ -238,7 +238,7 @@ class Project(ExternalStore, abc.ABC):
             col = table[t_col].col
             resolved_col_mapping[col] = ext_col
         # Validate column specs
-        t_col_types = table.column_types()
+        t_col_types = table._schema
         for t_col, ext_col in col_mapping.items():
             t_col_type = t_col_types[t_col]
             if ext_col in export_cols:
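For orientation, `validate_columns` checks a `col_mapping` (Pixeltable column name to external field name) against the table's columns and the store's declared export/import fields; this release only changes how the schema is read (`table._schema` instead of the former public accessors). A toy re-creation of the name checks, with plain dicts and sets standing in for the real objects (all names invented for illustration):

    table_schema = {'frame': 'Image', 'annotations': 'Json'}  # column name -> type
    export_cols = {'image'}        # fields the external store consumes
    import_cols = {'annotations'}  # fields the external store produces

    col_mapping = {'frame': 'image', 'annotations': 'annotations'}
    for t_col, ext_col in col_mapping.items():
        if t_col not in table_schema:
            raise ValueError(f'Column `{t_col}` does not exist in the table')
        if ext_col not in export_cols and ext_col not in import_cols:
            raise ValueError(f'`{ext_col}` is not a field of the external store')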
pixeltable/io/globals.py CHANGED

@@ -1,5 +1,4 @@
 from typing import Any, Literal, Optional, Union
-import urllib.request

 import pixeltable as pxt
 import pixeltable.exceptions as excs
@@ -19,7 +18,7 @@ def create_label_studio_project(
     **kwargs: Any
 ) -> SyncStatus:
     """
-    Create a new Label Studio project and link it to the specified `Table`.
+    Create a new Label Studio project and link it to the specified [`Table`][pixeltable.Table].

     - A tutorial notebook with fully worked examples can be found here:
       [Using Label Studio for Annotations with Pixeltable](https://pixeltable.readme.io/docs/label-studio)
@@ -34,7 +33,7 @@ def create_label_studio_project(
     then the linked project will have a column named `image`. In addition, the linked project
     will always have a JSON-typed column `annotations` representing the output.

-    By default, Pixeltable will link each of these columns to a column of the specified `Table`
+    By default, Pixeltable will link each of these columns to a column of the specified [`Table`][pixeltable.Table]
     with the same name. If any of the data fields are missing, an exception will be raised. If
     the `annotations` column is missing, it will be created. The default names can be overridden
     by specifying an optional `col_mapping`, with Pixeltable column names as keys and Label
@@ -52,7 +51,7 @@ def create_label_studio_project(
     - `pip install boto3` (if using S3 import storage)

     Args:
-        t: The `Table` to link to.
+        t: The table to link to.
         label_config: The Label Studio project configuration, in XML format.
         name: An optional name for the new project in Pixeltable. If specified, must be a valid
             Pixeltable identifier and must not be the name of any other external data store
@@ -73,7 +72,7 @@ def create_label_studio_project(
             The default is `post`.
         col_mapping: An optional mapping of local column names to Label Studio fields.
         sync_immediately: If `True`, immediately perform an initial synchronization by
-            exporting all rows of the `Table` as Label Studio tasks.
+            exporting all rows of the table as Label Studio tasks.
         s3_configuration: If specified, S3 import storage will be configured for the new project. This can only
             be used with `media_import_method='url'`, and if `media_import_method='url'` and any of the media data is
             referenced by `s3://` URLs, then it must be specified in order for such media to display correctly
@@ -148,15 +147,15 @@ def import_rows(
     comment: str = ''
 ) -> Table:
     """
-    Creates a new `Table` from a list of dictionaries. The dictionaries must be of the form
-    `{column_name: value, ...}`. Pixeltable will attempt to infer the schema of the table from the
+    Creates a new base table from a list of dictionaries. The dictionaries must be of the
+    form `{column_name: value, ...}`. Pixeltable will attempt to infer the schema of the table from the
     supplied data, using the most specific type that can represent all the values in a column.

     If `schema_overrides` is specified, then for each entry `(column_name, type)` in `schema_overrides`,
     Pixeltable will force the specified column to the specified type (and will not attempt any type inference
     for that column).

-    All column types of the new `Table` will be nullable unless explicitly specified as non-nullable in
+    All column types of the new table will be nullable unless explicitly specified as non-nullable in
     `schema_overrides`.

     Args:
@@ -169,7 +168,7 @@ def import_rows(
         comment: A comment to attach to the table (see [`create_table()`][pixeltable.create_table]).

     Returns:
-        The newly created table.
+        A handle to the newly created [`Table`][pixeltable.Table].
     """
     if schema_overrides is None:
         schema_overrides = {}
@@ -187,7 +186,7 @@ def import_rows(
         elif value is not None:
             # If `key` is not in `schema_overrides`, then we infer its type from the data.
             # The column type will always be nullable by default.
-            col_type = pxt.ColumnType.infer_literal_type(value)
+            col_type = pxt.ColumnType.infer_literal_type(value, nullable=True)
             if col_name not in schema:
                 schema[col_name] = col_type
             else:
@@ -230,8 +229,8 @@ def import_json(
     **kwargs: Any
 ) -> Table:
     """
-    Creates a new `Table` from a JSON file. This is a convenience method and is equivalent
-    to calling `import_data(table_path, json.loads(file_contents, **kwargs), ...)`, where `file_contents`
+    Creates a new base table from a JSON file. This is a convenience method and is
+    equivalent to calling `import_data(table_path, json.loads(file_contents, **kwargs), ...)`, where `file_contents`
     is the contents of the specified `filepath_or_url`.

     Args:
@@ -245,7 +244,7 @@ def import_json(
         kwargs: Additional keyword arguments to pass to `json.loads`.

     Returns:
-        The newly created table.
+        A handle to the newly created [`Table`][pixeltable.Table].
     """
     import json
     import urllib.parse
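`import_rows` infers each column's type from the values it sees, and after this change the inferred type is explicitly nullable (`infer_literal_type(value, nullable=True)`); `schema_overrides` pins a column to a chosen type instead. A hedged usage sketch, assuming the function is exposed as `pxt.io.import_rows` as in the released API, with illustrative table and column names:

    import pixeltable as pxt

    rows = [
        {'name': 'Alice', 'score': 9.5},
        {'name': 'Bob'},  # missing values are fine: inferred column types are nullable
    ]
    t = pxt.io.import_rows(
        'people',
        rows,
        schema_overrides={'score': pxt.FloatType(nullable=True)},  # no inference for `score`
    )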
pixeltable/io/label_studio.py CHANGED

@@ -105,7 +105,7 @@ class LabelStudioProject(Project):
         return {ANNOTATIONS_COLUMN: pxt.JsonType(nullable=True)}

     def sync(self, t: Table, export_data: bool, import_data: bool) -> SyncStatus:
-        _logger.info(f'Syncing Label Studio project "{self.project_title}" with table `{t.name}`'
+        _logger.info(f'Syncing Label Studio project "{self.project_title}" with table `{t._name}`'
                      f' (export: {export_data}, import: {import_data}).')
         # Collect all existing tasks into a dict with entries `rowid: task`
         tasks = {tuple(task['meta']['rowid']): task for task in self.__fetch_all_tasks()}
@@ -396,15 +396,15 @@ class LabelStudioProject(Project):
         updates = [{'_rowid': rowid, local_annotations_col.name: ann} for rowid, ann in annotations.items()]
         if len(updates) > 0:
             _logger.info(
-                f'Updating table `{t.name}`, column `{local_annotations_col.name}` with {len(updates)} total annotations.'
+                f'Updating table `{t._name}`, column `{local_annotations_col.name}` with {len(updates)} total annotations.'
             )
             # batch_update currently doesn't propagate from views to base tables. As a workaround, we call
             # batch_update on the actual ancestor table that holds the annotations column.
             # TODO(aaron-siegel): Simplify this once propagation is properly implemented in batch_update
             ancestor = t
             while local_annotations_col not in ancestor._tbl_version.cols:
-                assert ancestor.base is not None
-                ancestor = ancestor.base
+                assert ancestor._base is not None
+                ancestor = ancestor._base
             update_status = ancestor.batch_update(updates)
             print(f'Updated annotation(s) from {len(updates)} task(s) in {self}.')
             return SyncStatus(pxt_rows_updated=update_status.num_rows, num_excs=update_status.num_excs)
@@ -565,7 +565,7 @@ class LabelStudioProject(Project):

         if title is None:
             # `title` defaults to table name
-            title = t.name
+            title = t._name

         # Create a column to hold the annotations, if one does not yet exist
         if col_mapping is None or ANNOTATIONS_COLUMN in col_mapping.values():
@@ -573,7 +573,7 @@ class LabelStudioProject(Project):
             local_annotations_column = ANNOTATIONS_COLUMN
         else:
             local_annotations_column = next(k for k, v in col_mapping.items() if v == ANNOTATIONS_COLUMN)
-        if local_annotations_column not in t.column_names():
+        if local_annotations_column not in t._schema.keys():
             t[local_annotations_column] = pxt.JsonType(nullable=True)

         resolved_col_mapping = cls.validate_columns(
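The `_base`/`_name` renames aside, the interesting logic above is the workaround for `batch_update` not propagating from views to base tables: walk the view chain until reaching the table whose version object physically stores the annotations column, then update there. The same shape, extracted into a helper for readability (a sketch only, relying on the same private `_tbl_version` and `_base` attributes the diff uses):

    def owning_ancestor(t, col):
        # Follow t, t._base, t._base._base, ... until we find the table whose
        # TableVersion actually holds `col`; batch_update must run on that table
        # because updates do not yet propagate from views to their bases.
        ancestor = t
        while col not in ancestor._tbl_version.cols:
            assert ancestor._base is not None  # `col` must live somewhere up the chain
            ancestor = ancestor._base
        return ancestor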
pixeltable/io/pandas.py CHANGED

@@ -15,11 +15,12 @@ def import_pandas(
     primary_key: Optional[Union[str, list[str]]] = None,
     num_retained_versions: int = 10,
     comment: str = ''
-) -> pxt.InsertableTable:
-    """Creates a new `Table` from a Pandas `DataFrame`, with the specified name.
-    The schema of the table will be inferred from the `DataFrame`.
+) -> pxt.Table:
+    """Creates a new base table from a Pandas
+    [`DataFrame`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html), with the
+    specified name. The schema of the table will be inferred from the DataFrame.

-    The column names of the new `Table` will be identical to those in the `DataFrame`, as long as they are valid
+    The column names of the new table will be identical to those in the DataFrame, as long as they are valid
     Pixeltable identifiers. If a column name is not a valid Pixeltable identifier, it will be normalized according to
     the following procedure:
     - first replace any non-alphanumeric characters with underscores;
@@ -33,6 +34,9 @@ def import_pandas(
         name `name` will be given type `type`, instead of being inferred from the `DataFrame`. The keys in
         `schema_overrides` should be the column names of the `DataFrame` (whether or not they are valid
         Pixeltable identifiers).
+
+    Returns:
+        A handle to the newly created [`Table`][pixeltable.Table].
     """
     if schema_overrides is None:
         schema_overrides = {}
@@ -54,11 +58,15 @@ def import_csv(
     num_retained_versions: int = 10,
     comment: str = '',
     **kwargs
-) -> pxt.InsertableTable:
+) -> pxt.Table:
     """
-    Creates a new `Table` from a csv file. This is a convenience method and is equivalent
+    Creates a new base table from a csv file. This is a convenience method and is equivalent
     to calling `import_pandas(table_path, pd.read_csv(filepath_or_buffer, **kwargs), schema=schema)`.
-    See the Pandas documentation for `read_csv` for more details.
+    See the Pandas documentation for [`read_csv`](https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html)
+    for more details.
+
+    Returns:
+        A handle to the newly created [`Table`][pixeltable.Table].
     """
     df = pd.read_csv(filepath_or_buffer, **kwargs)
     return import_pandas(tbl_name, df, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
@@ -70,11 +78,15 @@ def import_excel(
     num_retained_versions: int = 10,
     comment: str = '',
     **kwargs
-) -> pxt.InsertableTable:
+) -> pxt.Table:
     """
-    Creates a new `Table` from an Excel (.xlsx) file. This is a convenience method and is equivalent
-    to calling `import_pandas(table_path, pd.read_excel(io, *args, **kwargs), schema=schema)`.
-    See the Pandas documentation for `read_excel` for more details.
+    Creates a new base table from an Excel (.xlsx) file. This is a convenience method and is
+    equivalent to calling `import_pandas(table_path, pd.read_excel(io, *args, **kwargs), schema=schema)`.
+    See the Pandas documentation for [`read_excel`](https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html)
+    for more details.
+
+    Returns:
+        A handle to the newly created [`Table`][pixeltable.Table].
     """
     df = pd.read_excel(io, *args, **kwargs)
     return import_pandas(tbl_name, df, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
@@ -177,7 +189,10 @@ def __np_dtype_to_pxt_type(np_dtype: np.dtype, data_col: pd.Series, nullable: bool
         return pxt.FloatType(nullable=nullable)

     inferred_type = pxt.ColumnType.infer_common_literal_type(data_col)
-    if inferred_type is not None:
+    if inferred_type is None:
+        # Fallback on StringType if everything else fails
+        return pxt.StringType(nullable=nullable)
+    else:
         return inferred_type.copy(nullable=nullable)

     raise excs.Error(f'Could not infer Pixeltable type of column: {data_col.name} (dtype: {np_dtype})')
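`import_csv` and `import_excel` simply delegate to the corresponding Pandas readers and then `import_pandas`; the other change above means a column whose values defeat type inference now falls back to a `StringType` column instead of raising. A usage sketch (the file name and the override are illustrative):

    import pixeltable as pxt

    films = pxt.io.import_csv(
        'films',
        'films.csv',
        schema_overrides={'year': pxt.IntType(nullable=True)},  # pin the type, skip inference
    )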
pixeltable/io/parquet.py CHANGED

@@ -19,12 +19,14 @@ from pixeltable.utils.transactional_directory import transactional_directory
 if typing.TYPE_CHECKING:
     import pixeltable as pxt
     import pyarrow as pa
+    from pyarrow import parquet

 _logger = logging.getLogger(__name__)


 def _write_batch(value_batch: Dict[str, deque], schema: pa.Schema, output_path: Path) -> None:
     import pyarrow as pa
+    from pyarrow import parquet

     pydict = {}
     for field in schema:
@@ -35,7 +37,7 @@ def _write_batch(value_batch: Dict[str, deque], schema: pa.Schema, output_path: Path) -> None:
             pydict[field.name] = value_batch[field.name]

     tab = pa.Table.from_pydict(pydict, schema=schema)
-    pa.parquet.write_table(tab, output_path)
+    parquet.write_table(tab, output_path)


 def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int = 100_000_000) -> None:
@@ -55,23 +57,21 @@ def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int = 100_000_000) -> None:
     """
     from pixeltable.utils.arrow import to_arrow_schema

-    column_names = df.get_column_names()
-    column_types = df.get_column_types()
-    type_dict = {k: v.as_dict() for k, v in zip(column_names, column_types)}
-    arrow_schema = to_arrow_schema(dict(zip(column_names, column_types)))
+    type_dict = {k: v.as_dict() for k, v in df.schema.items()}
+    arrow_schema = to_arrow_schema(df.schema)

     # store the changes atomically
     with transactional_directory(dest_path) as temp_path:
         # dump metadata json file so we can inspect what was the source of the parquet file later on.
-        json.dump(df.as_dict(), (temp_path / '.pixeltable.json').open('w'))
+        json.dump(df.as_dict(), (temp_path / '.pixeltable.json').open('w'))
         json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w'))  # keep type metadata

         batch_num = 0
-        current_value_batch: Dict[str, deque] = {k: deque() for k in column_names}
+        current_value_batch: Dict[str, deque] = {k: deque() for k in df.schema.keys()}
         current_byte_estimate = 0

-        for data_row in df._exec():
-            for col_name, col_type, e in zip(column_names, column_types, df._select_list_exprs):
+        for data_row in df._exec():
+            for (col_name, col_type), e in zip(df.schema.items(), df._select_list_exprs):
                 val = data_row[e.slot_idx]
                 if val is None:
                     current_value_batch[col_name].append(val)
@@ -122,7 +122,7 @@ def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int = 100_000_000) -> None:
             assert batch_num < 100_000, 'wrote too many parquet files, unclear ordering'
             _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
             batch_num += 1
-            current_value_batch = {k: deque() for k in column_names}
+            current_value_batch = {k: deque() for k in df.schema.keys()}
             current_byte_estimate = 0

         _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
@@ -130,11 +130,11 @@ def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int = 100_000_000) -> None:

 def parquet_schema_to_pixeltable_schema(parquet_path: str) -> Dict[str, Optional[ts.ColumnType]]:
     """Generate a default pixeltable schema for the given parquet file. Returns None for unknown types."""
-    import pyarrow.parquet
+    from pyarrow import parquet
     from pixeltable.utils.arrow import to_pixeltable_schema

     input_path = Path(parquet_path).expanduser()
-    parquet_dataset = pyarrow.parquet.ParquetDataset(input_path)
+    parquet_dataset = parquet.ParquetDataset(input_path)
     return to_pixeltable_schema(parquet_dataset.schema)


@@ -159,11 +159,11 @@ def import_parquet(
         The newly created table. The table will have loaded the data from the Parquet file(s).
     """
     import pixeltable as pxt
-    import pyarrow.parquet
+    from pyarrow import parquet
     from pixeltable.utils.arrow import iter_tuples

     input_path = Path(parquet_path).expanduser()
-    parquet_dataset = pyarrow.parquet.ParquetDataset(input_path)
+    parquet_dataset = parquet.ParquetDataset(input_path)

     schema = parquet_schema_to_pixeltable_schema(parquet_path)
     if schema_override is None:
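The recurring edit in this file replaces attribute-style access (`pyarrow.parquet....`) with an explicit `from pyarrow import parquet`; the submodule is not guaranteed to be reachable as an attribute after a bare `import pyarrow`. A self-contained round-trip exercising the same pyarrow calls the module now uses (file name is illustrative):

    import pyarrow as pa
    from pyarrow import parquet

    tab = pa.Table.from_pydict({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
    parquet.write_table(tab, 'part-00000.parquet')

    dataset = parquet.ParquetDataset('part-00000.parquet')
    print(dataset.schema)              # the schema import_parquet inspects
    print(dataset.read().to_pydict())  # {'a': [1, 2, 3], 'b': ['x', 'y', 'z']}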
pixeltable/iterators/document.py CHANGED

@@ -38,7 +38,7 @@ class DocumentSectionMetadata:
     sourceline: Optional[int] = None
     # the stack of headings up to the most recently observed one;
     # eg, if the most recent one was an h2, 'headings' would contain keys 1 and 2, but nothing below that
-    heading: Optional[Dict[int, str]] = None
+    heading: Optional[Dict[str, str]] = None

     # pdf-specific metadata
     page: Optional[int] = None
@@ -236,7 +236,7 @@ class DocumentSplitter(ComponentIterator):
         accumulated_text = []  # currently accumulated text
         # accumulate pieces then join before emit to avoid quadratic complexity of string concatenation

-        headings: Dict[int, str] = {}  # current state of observed headings (level -> text)
+        headings: Dict[str, str] = {}  # current state of observed headings (level -> text)
         sourceline = 0  # most recently seen sourceline

         def update_metadata(el: bs4.Tag) -> None:
@@ -244,12 +244,11 @@ class DocumentSplitter(ComponentIterator):
             nonlocal headings, sourceline
             sourceline = el.sourceline
             if el.name in _HTML_HEADINGS:
-                level = int(el.name[1])
                 # remove the previously seen lower levels
-                lower_levels = [l for l in headings if l > level]
+                lower_levels = [l for l in headings if l > el.name]
                 for l in lower_levels:
                     del headings[l]
-                headings[level] = el.get_text().strip()
+                headings[el.name] = el.get_text().strip()

         def emit() -> None:
             nonlocal accumulated_text, headings, sourceline
@@ -295,13 +294,14 @@ class DocumentSplitter(ComponentIterator):
         # current state
         accumulated_text = []  # currently accumulated text
         # accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
-        headings: Dict[int, str] = {}  # current state of observed headings (level -> text)
+        headings: Dict[str, str] = {}  # current state of observed headings (level -> text)

         def update_headings(heading: Dict) -> None:
             # update current state
             nonlocal headings
             assert 'type' in heading and heading['type'] == 'heading'
-            level = heading['attrs']['level']
+            lint = heading['attrs']['level']
+            level = f'h{lint}'
             text = heading['children'][0]['raw'].strip()
             # remove the previously seen lower levels
             lower_levels = [l for l in headings.keys() if l > level]
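The heading-metadata change swaps integer levels for tag-name keys ('h1' through 'h6'). Since HTML heading levels are single digits, lexicographic string comparison behaves exactly like numeric comparison ('h2' < 'h3'), so the existing "drop the deeper levels" logic carries over unchanged. A small standalone sketch of the new state update (heading texts are invented):

    headings = {'h1': 'Intro', 'h2': 'Background', 'h3': 'Details'}

    def observe_heading(tag_name: str, text: str) -> None:
        # Seeing e.g. an <h2> invalidates any deeper headings ('h3'..'h6');
        # string comparison suffices because 'h1' < 'h2' < ... < 'h6'.
        for level in [l for l in headings if l > tag_name]:
            del headings[level]
        headings[tag_name] = text

    observe_heading('h2', 'Methods')
    print(headings)  # {'h1': 'Intro', 'h2': 'Methods'}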