pixeltable 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +1 -1
- pixeltable/catalog/catalog.py +619 -255
- pixeltable/catalog/dir.py +1 -2
- pixeltable/catalog/insertable_table.py +9 -9
- pixeltable/catalog/path.py +59 -20
- pixeltable/catalog/schema_object.py +10 -4
- pixeltable/catalog/table.py +51 -53
- pixeltable/catalog/table_version.py +216 -156
- pixeltable/catalog/table_version_path.py +1 -1
- pixeltable/catalog/tbl_ops.py +44 -0
- pixeltable/catalog/view.py +63 -65
- pixeltable/config.py +12 -4
- pixeltable/dataframe.py +75 -6
- pixeltable/env.py +46 -17
- pixeltable/exec/aggregation_node.py +1 -1
- pixeltable/exec/cache_prefetch_node.py +2 -6
- pixeltable/exec/component_iteration_node.py +4 -3
- pixeltable/exec/data_row_batch.py +10 -51
- pixeltable/exec/expr_eval/expr_eval_node.py +2 -2
- pixeltable/exec/in_memory_data_node.py +17 -16
- pixeltable/exec/sql_node.py +6 -7
- pixeltable/exprs/column_ref.py +2 -1
- pixeltable/exprs/data_row.py +13 -13
- pixeltable/exprs/row_builder.py +16 -4
- pixeltable/exprs/string_op.py +1 -1
- pixeltable/func/expr_template_function.py +1 -4
- pixeltable/functions/date.py +1 -1
- pixeltable/functions/gemini.py +4 -4
- pixeltable/functions/math.py +1 -1
- pixeltable/functions/openai.py +9 -6
- pixeltable/functions/timestamp.py +6 -6
- pixeltable/functions/video.py +2 -6
- pixeltable/globals.py +62 -33
- pixeltable/io/datarows.py +2 -1
- pixeltable/io/pandas.py +1 -0
- pixeltable/io/table_data_conduit.py +12 -13
- pixeltable/iterators/audio.py +17 -8
- pixeltable/iterators/image.py +5 -2
- pixeltable/metadata/schema.py +39 -2
- pixeltable/plan.py +5 -14
- pixeltable/share/packager.py +13 -13
- pixeltable/store.py +31 -7
- pixeltable/type_system.py +2 -1
- pixeltable/utils/filecache.py +1 -1
- pixeltable/utils/http_server.py +2 -3
- pixeltable/utils/media_store.py +90 -34
- {pixeltable-0.4.3.dist-info → pixeltable-0.4.5.dist-info}/METADATA +1 -1
- {pixeltable-0.4.3.dist-info → pixeltable-0.4.5.dist-info}/RECORD +52 -51
- {pixeltable-0.4.3.dist-info → pixeltable-0.4.5.dist-info}/LICENSE +0 -0
- {pixeltable-0.4.3.dist-info → pixeltable-0.4.5.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.3.dist-info → pixeltable-0.4.5.dist-info}/entry_points.txt +0 -0
pixeltable/globals.py
CHANGED

@@ -8,7 +8,7 @@ from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, Optional, Un
 import pandas as pd
 from pandas.io.formats.style import Styler
 
-from pixeltable import DataFrame, catalog, exceptions as excs, exprs, func, share
+from pixeltable import DataFrame, catalog, exceptions as excs, exprs, func, share, type_system as ts
 from pixeltable.catalog import Catalog, TableVersionPath
 from pixeltable.catalog.insertable_table import OnErrorParameter
 from pixeltable.config import Config
@@ -44,7 +44,7 @@ def init(config_overrides: Optional[dict[str, Any]] = None) -> None:
 
 
 def create_table(
-
+    path: str,
     schema: Optional[dict[str, Any]] = None,
     *,
     source: Optional[TableDataSource] = None,
@@ -58,14 +58,24 @@ def create_table(
     if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error',
     extra_args: Optional[dict[str, Any]] = None,  # Additional arguments to data source provider
 ) -> catalog.Table:
-    """Create a new base table.
+    """Create a new base table. Exactly one of `schema` or `source` must be provided.
+
+    If a `schema` is provided, then an empty table will be created with the specified schema.
+
+    If a `source` is provided, then Pixeltable will attempt to infer a data source format and table schema from the
+    contents of the specified data, and the data will be imported from the specified source into the new table. The
+    source format and/or schema can be specified directly via the `source_format` and `schema_overrides` parameters.
 
     Args:
-
-        schema:
-        source: A data source
-        source_format:
-
+        path: Pixeltable path (qualified name) of the table, such as `'my_table'` or `'my_dir.my_subdir.my_table'`.
+        schema: Schema for the new table, mapping column names to Pixeltable types.
+        source: A data source (file, URL, DataFrame, or list of rows) to import from.
+        source_format: Must be used in conjunction with a `source`.
+            If specified, then the given format will be used to read the source data. (Otherwise,
+            Pixeltable will attempt to infer the format from the source data.)
+        schema_overrides: Must be used in conjunction with a `source`.
+            If specified, then columns in `schema_overrides` will be given the specified types.
+            (Pixeltable will attempt to infer the types of any columns not specified.)
        on_error: Determines the behavior if an error occurs while evaluating a computed column or detecting an
            invalid media file (such as a corrupt image) for one of the inserted rows.
@@ -81,14 +91,15 @@ def create_table(
 
            - `'on_read'`: validate media files at query time
            - `'on_write'`: validate media files during insert/update operations
-        if_exists:
-            Must be one of the following:
+        if_exists: Determines the behavior if a table already exists at the specified path location.
 
            - `'error'`: raise an error
            - `'ignore'`: do nothing and return the existing table handle
-            - `'replace'`: if the existing table has no views, drop and replace it with a new one
-
-
+            - `'replace'`: if the existing table has no views or snapshots, drop and replace it with a new one;
+                raise an error if the existing table has views or snapshots
+            - `'replace_force'`: drop the existing table and all its views and snapshots, and create a new one
+        extra_args: Must be used in conjunction with a `source`. If specified, then additional arguments will be
+            passed along to the source data provider.
 
    Returns:
        A handle to the newly created table, or to an already existing table at the path when `if_exists='ignore'`.
@@ -114,7 +125,7 @@ def create_table(
    >>> tbl1 = pxt.get_table('orig_table')
    ... tbl2 = pxt.create_table('new_table', tbl1.where(tbl1.col1 < 10).select(tbl1.col2))
 
-    Create a table if does not already exist, otherwise get the existing table:
+    Create a table if it does not already exist, otherwise get the existing table:
 
    >>> tbl = pxt.create_table('my_table', schema={'col1': pxt.Int, 'col2': pxt.String}, if_exists='ignore')
 
@@ -130,12 +141,12 @@ def create_table(
    from pixeltable.io.utils import normalize_primary_key_parameter
 
    if (schema is None) == (source is None):
-        raise excs.Error('
+        raise excs.Error('Either a `schema` or a `source` must be provided (but not both)')
 
    if schema is not None and (len(schema) == 0 or not isinstance(schema, dict)):
        raise excs.Error('`schema` must be a non-empty dictionary')
 
-    path_obj = catalog.Path(
+    path_obj = catalog.Path.parse(path)
    if_exists_ = catalog.IfExistsParam.validated(if_exists, 'if_exists')
    media_validation_ = catalog.MediaValidation.validated(media_validation, 'media_validation')
    primary_key: Optional[list[str]] = normalize_primary_key_parameter(primary_key)
@@ -146,7 +157,14 @@ def create_table(
    tds = UnkTableDataConduit(source, source_format=source_format, extra_fields=extra_args)
    tds.check_source_format()
    data_source = tds.specialize()
-
+    src_schema_overrides: dict[str, ts.ColumnType] = {}
+    if schema_overrides is not None:
+        for col_name, py_type in schema_overrides.items():
+            col_type = ts.ColumnType.normalize_type(py_type, nullable_default=True, allow_builtin_types=False)
+            if col_type is None:
+                raise excs.Error(f'Invalid type for column {col_name!r} in `schema_overrides`: {py_type}')
+            src_schema_overrides[col_name] = col_type
+    data_source.src_schema_overrides = src_schema_overrides
    data_source.src_pk = primary_key
    data_source.infer_schema()
    schema = data_source.pxt_schema
@@ -255,9 +273,7 @@ def create_view(
        tbl_version_path = base._tbl_version_path
        sample_clause = None
    elif isinstance(base, DataFrame):
-        base.
-        if len(base._from_clause.tbls) > 1:
-            raise excs.Error('Cannot create a view of a join')
+        base._validate_mutable_op_sequence('create_view', allow_select=True)
        tbl_version_path = base._from_clause.tbls[0]
        where = base.where_clause
        sample_clause = base.sample_clause
@@ -268,7 +284,7 @@ def create_view(
        raise excs.Error('`base` must be an instance of `Table` or `DataFrame`')
    assert isinstance(base, (catalog.Table, DataFrame))
 
-    path_obj = catalog.Path(path)
+    path_obj = catalog.Path.parse(path)
    if_exists_ = catalog.IfExistsParam.validated(if_exists, 'if_exists')
    media_validation_ = catalog.MediaValidation.validated(media_validation, 'media_validation')
 
@@ -429,8 +445,12 @@ def get_table(path: str) -> catalog.Table:
    Handles to views and snapshots are retrieved in the same way:
 
    >>> tbl = pxt.get_table('my_snapshot')
+
+    Get a handle to a specific version of a table:
+
+    >>> tbl = pxt.get_table('my_table:722')
    """
-    path_obj = catalog.Path(path)
+    path_obj = catalog.Path.parse(path, allow_versioned_path=True)
    tbl = Catalog.get().get_table(path_obj)
    return tbl
 
@@ -456,7 +476,7 @@ def move(path: str, new_path: str) -> None:
    """
    if path == new_path:
        raise excs.Error('move(): source and destination cannot be identical')
-    path_obj, new_path_obj = catalog.Path(path), catalog.Path(new_path)
+    path_obj, new_path_obj = catalog.Path.parse(path), catalog.Path.parse(new_path)
    if path_obj.is_ancestor(new_path_obj):
        raise excs.Error(f'move(): cannot move {path!r} into its own subdirectory')
    cat = Catalog.get()
@@ -509,7 +529,7 @@ def drop_table(
    assert isinstance(table, str)
    tbl_path = table
 
-    path_obj = catalog.Path(tbl_path)
+    path_obj = catalog.Path.parse(tbl_path)
    if_not_exists_ = catalog.IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
    Catalog.get().drop_table(path_obj, force=force, if_not_exists=if_not_exists_)
 
@@ -537,9 +557,12 @@ def list_tables(dir_path: str = '', recursive: bool = True) -> list[str]:
 
    >>> pxt.list_tables('dir1')
    """
-
-
-
+    return _list_tables(dir_path, recursive=recursive, allow_system_paths=False)
+
+
+def _list_tables(dir_path: str = '', recursive: bool = True, allow_system_paths: bool = False) -> list[str]:
+    path_obj = catalog.Path.parse(dir_path, allow_empty_path=True, allow_system_path=allow_system_paths)
+    contents = Catalog.get().get_dir_contents(path_obj, recursive=recursive)
    return [str(p) for p in _extract_paths(contents, parent=path_obj, entry_type=catalog.Table)]
 
 
@@ -590,7 +613,7 @@ def create_dir(
 
    >>> pxt.create_dir('parent1.parent2.sub_dir', parents=True)
    """
-    path_obj = catalog.Path(path)
+    path_obj = catalog.Path.parse(path)
    if_exists_ = catalog.IfExistsParam.validated(if_exists, 'if_exists')
    return Catalog.get().create_dir(path_obj, if_exists=if_exists_, parents=parents)
 
@@ -632,7 +655,7 @@ def drop_dir(path: str, force: bool = False, if_not_exists: Literal['error', 'ig
 
    >>> pxt.drop_dir('my_dir', force=True)
    """
-    path_obj = catalog.Path(path)  # validate format
+    path_obj = catalog.Path.parse(path)  # validate format
    if_not_exists_ = catalog.IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
    Catalog.get().drop_dir(path_obj, if_not_exists=if_not_exists_, force=force)
 
@@ -647,13 +670,16 @@ def ls(path: str = '') -> pd.DataFrame:
    To get a programmatic list of tables and/or directories, use [list_tables()][pixeltable.list_tables] and/or
    [list_dirs()][pixeltable.list_dirs] instead.
    """
+    from pixeltable.catalog import retry_loop
    from pixeltable.metadata import schema
 
    cat = Catalog.get()
-    path_obj = catalog.Path(path,
+    path_obj = catalog.Path.parse(path, allow_empty_path=True)
    dir_entries = cat.get_dir_contents(path_obj)
-
-
+
+    @retry_loop(for_write=False)
+    def op() -> list[list[str]]:
+        rows: list[list[str]] = []
        for name, entry in dir_entries.items():
            if name.startswith('_'):
                continue
@@ -679,6 +705,9 @@ def ls(path: str = '') -> pd.DataFrame:
            if md['is_replica']:
                kind = f'{kind}-replica'
            rows.append([name, kind, version, base])
+        return rows
+
+    rows = op()
 
    rows = sorted(rows, key=lambda x: x[0])
    df = pd.DataFrame(
@@ -734,7 +763,7 @@ def list_dirs(path: str = '', recursive: bool = True) -> list[str]:
    >>> cl.list_dirs('my_dir', recursive=True)
    ['my_dir', 'my_dir.sub_dir1']
    """
-    path_obj = catalog.Path(path,
+    path_obj = catalog.Path.parse(path, allow_empty_path=True)  # validate format
    cat = Catalog.get()
    contents = cat.get_dir_contents(path_obj, recursive=recursive)
    return [str(p) for p in _extract_paths(contents, parent=path_obj, entry_type=catalog.Dir)]
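
Note: the tightened docstrings above pin down the create_table() contract. A minimal usage sketch; the directory name demo, the file films.csv, and version number 722 are hypothetical:

import pixeltable as pxt

# Empty table from an explicit schema:
films = pxt.create_table('demo.films', schema={'title': pxt.String, 'year': pxt.Int})

# Schema inferred from a source, with one column type pinned via schema_overrides
# (exactly one of `schema` or `source` may be given):
imported = pxt.create_table(
    'demo.films_import',
    source='films.csv',                   # hypothetical local CSV file
    schema_overrides={'year': pxt.Int},
    if_exists='ignore',
)

# Versioned path, per the new get_table() docstring:
old_version = pxt.get_table('demo.films:722')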
pixeltable/io/datarows.py
CHANGED

@@ -8,7 +8,7 @@ from pixeltable import exceptions as excs
 
 
 def _infer_schema_from_rows(
-    rows: Iterable[dict[str, Any]], schema_overrides: dict[str,
+    rows: Iterable[dict[str, Any]], schema_overrides: dict[str, ts.ColumnType], primary_key: list[str]
 ) -> dict[str, ts.ColumnType]:
    schema: dict[str, ts.ColumnType] = {}
    cols_with_nones: set[str] = set()
@@ -20,6 +20,7 @@ def _infer_schema_from_rows(
            # in which the column names are encountered in the input data, even if `schema_overrides`
            # is specified.
            if col_name not in schema:
+                assert isinstance(schema_overrides[col_name], ts.ColumnType)
                schema[col_name] = schema_overrides[col_name]
        elif value is not None:
            # If `key` is not in `schema_overrides`, then we infer its type from the data.
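
Both inference helpers can now assert ts.ColumnType because create_table() normalizes user-supplied override types up front. A sketch of that contract, mirroring the globals.py hunk above (the function name normalize_overrides is illustrative):

from typing import Any

from pixeltable import exceptions as excs, type_system as ts

def normalize_overrides(schema_overrides: dict[str, Any]) -> dict[str, ts.ColumnType]:
    # mirrors the loop added to create_table(): convert user-supplied types
    # (e.g. pxt.Int) to ts.ColumnType, rejecting anything unrecognized
    normalized: dict[str, ts.ColumnType] = {}
    for col_name, py_type in schema_overrides.items():
        col_type = ts.ColumnType.normalize_type(py_type, nullable_default=True, allow_builtin_types=False)
        if col_type is None:
            raise excs.Error(f'Invalid type for column {col_name!r} in `schema_overrides`: {py_type}')
        normalized[col_name] = col_type
    return normalized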
pixeltable/io/pandas.py
CHANGED

@@ -132,6 +132,7 @@ def df_infer_schema(
    pd_schema: dict[str, ts.ColumnType] = {}
    for pd_name, pd_dtype in zip(df.columns, df.dtypes):
        if pd_name in schema_overrides:
+            assert isinstance(schema_overrides[pd_name], ts.ColumnType)
            pxt_type = schema_overrides[pd_name]
        else:
            pxt_type = __pd_coltype_to_pxt_type(pd_dtype, df[pd_name], pd_name not in primary_key)
pixeltable/io/table_data_conduit.py
CHANGED

@@ -47,13 +47,13 @@ class TableDataConduitFormat(str, enum.Enum):
 
 @dataclass
 class TableDataConduit:
-    source: TableDataSource
+    source: 'TableDataSource'
    source_format: Optional[str] = None
    source_column_map: Optional[dict[str, str]] = None
    if_row_exists: Literal['update', 'ignore', 'error'] = 'error'
-    pxt_schema: Optional[dict[str,
-    src_schema_overrides: Optional[dict[str,
-    src_schema: Optional[dict[str,
+    pxt_schema: Optional[dict[str, ts.ColumnType]] = None
+    src_schema_overrides: Optional[dict[str, ts.ColumnType]] = None
+    src_schema: Optional[dict[str, ts.ColumnType]] = None
    pxt_pk: Optional[list[str]] = None
    src_pk: Optional[list[str]] = None
    valid_rows: Optional[RowData] = None
@@ -87,7 +87,7 @@ class TableDataConduit:
        for name, coltype in self.pxt_schema.items():
            self.pxt_schema[name] = ts.ColumnType.normalize_type(coltype)
 
-    def infer_schema(self) -> dict[str,
+    def infer_schema(self) -> dict[str, ts.ColumnType]:
        raise NotImplementedError
 
    def valid_row_batch(self) -> Iterator[RowData]:
@@ -137,7 +137,7 @@ class DFTableDataConduit(TableDataConduit):
        t.pxt_df = tds.source
        return t
 
-    def infer_schema(self) -> dict[str,
+    def infer_schema(self) -> dict[str, ts.ColumnType]:
        self.pxt_schema = self.pxt_df.schema
        self.pxt_pk = self.src_pk
        return self.pxt_schema
@@ -168,7 +168,7 @@ class RowDataTableDataConduit(TableDataConduit):
        t.batch_count = 0
        return t
 
-    def infer_schema(self) -> dict[str,
+    def infer_schema(self) -> dict[str, ts.ColumnType]:
        from .datarows import _infer_schema_from_rows
 
        if self.source_column_map is None:
@@ -239,7 +239,7 @@ class PandasTableDataConduit(TableDataConduit):
        t.batch_count = 0
        return t
 
-    def infer_schema_part1(self) -> tuple[dict[str,
+    def infer_schema_part1(self) -> tuple[dict[str, ts.ColumnType], list[str]]:
        """Return inferred schema, inferred primary key, and source column map"""
        if self.source_column_map is None:
            if self.src_schema_overrides is None:
@@ -252,7 +252,7 @@ class PandasTableDataConduit(TableDataConduit):
        else:
            raise NotImplementedError()
 
-    def infer_schema(self) -> dict[str,
+    def infer_schema(self) -> dict[str, ts.ColumnType]:
        self.pxt_schema, self.pxt_pk = self.infer_schema_part1()
        self.normalize_pxt_schema_types()
        _df_check_primary_key_values(self.pd_df, self.src_pk)
@@ -328,7 +328,6 @@ class HFTableDataConduit(TableDataConduit):
    hf_ds: Optional[Union[datasets.Dataset, datasets.DatasetDict]] = None
    column_name_for_split: Optional[str] = None
    categorical_features: dict[str, dict[int, str]]
-    hf_schema: dict[str, Any] = None
    dataset_dict: dict[str, datasets.Dataset] = None
    hf_schema_source: dict[str, Any] = None
 
@@ -356,7 +355,7 @@ class HFTableDataConduit(TableDataConduit):
        except ImportError:
            return False
 
-    def infer_schema_part1(self) -> tuple[dict[str,
+    def infer_schema_part1(self) -> tuple[dict[str, ts.ColumnType], list[str]]:
        from pixeltable.io.hf_datasets import _get_hf_schema, huggingface_schema_to_pxt_schema
 
        if self.source_column_map is None:
@@ -469,7 +468,7 @@ class ParquetTableDataConduit(TableDataConduit):
        t.pq_ds = parquet.ParquetDataset(str(input_path))
        return t
 
-    def infer_schema_part1(self) -> tuple[dict[str,
+    def infer_schema_part1(self) -> tuple[dict[str, ts.ColumnType], list[str]]:
        from pixeltable.utils.arrow import ar_infer_schema
 
        if self.source_column_map is None:
@@ -483,7 +482,7 @@ class ParquetTableDataConduit(TableDataConduit):
        else:
            raise NotImplementedError()
 
-    def infer_schema(self) -> dict[str,
+    def infer_schema(self) -> dict[str, ts.ColumnType]:
        self.pxt_schema, self.pxt_pk = self.infer_schema_part1()
        self.normalize_pxt_schema_types()
        self.prepare_insert()
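
The conduits are exercised indirectly through create_table(). A small end-to-end sketch (the table name scores is illustrative): a pandas DataFrame is routed through PandasTableDataConduit, whose infer_schema() now returns the uniformly typed dict[str, ts.ColumnType]:

import pandas as pd
import pixeltable as pxt

df = pd.DataFrame({'name': ['a', 'b'], 'score': [1, 2]})
# create_table() specializes an UnkTableDataConduit to a PandasTableDataConduit,
# which infers the schema from the DataFrame and imports the two rows
tbl = pxt.create_table('scores', source=df, if_exists='ignore')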
pixeltable/iterators/audio.py
CHANGED

@@ -1,5 +1,4 @@
 import logging
-import uuid
 from fractions import Fraction
 from pathlib import Path
 from typing import Any, ClassVar, Optional
@@ -55,12 +54,9 @@ class AudioSplitter(ComponentIterator):
    def __init__(
        self, audio: str, chunk_duration_sec: float, *, overlap_sec: float = 0.0, min_chunk_duration_sec: float = 0.0
    ):
-
-
-
-            raise excs.Error('chunk_duration_sec must be at least min_chunk_duration_sec')
-        if overlap_sec >= chunk_duration_sec:
-            raise excs.Error('overlap_sec must be less than chunk_duration_sec')
+        assert chunk_duration_sec > 0.0
+        assert chunk_duration_sec >= min_chunk_duration_sec
+        assert overlap_sec < chunk_duration_sec
        audio_path = Path(audio)
        assert audio_path.exists() and audio_path.is_file()
        self.audio_path = audio_path
@@ -128,6 +124,19 @@ class AudioSplitter(ComponentIterator):
 
    @classmethod
    def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
+        param_names = ['chunk_duration_sec', 'min_chunk_duration_sec', 'overlap_sec']
+        params = dict(zip(param_names, args))
+        params.update(kwargs)
+
+        chunk_duration_sec = params['chunk_duration_sec']
+        min_chunk_duration_sec = params.get('min_chunk_duration_sec', 0.0)
+        overlap_sec = params.get('overlap_sec', 0.0)
+        if chunk_duration_sec <= 0.0:
+            raise excs.Error('chunk_duration_sec must be a positive number')
+        if chunk_duration_sec < min_chunk_duration_sec:
+            raise excs.Error('chunk_duration_sec must be at least min_chunk_duration_sec')
+        if overlap_sec >= chunk_duration_sec:
+            raise excs.Error('overlap_sec must be less than chunk_duration_sec')
        return {
            'start_time_sec': ts.FloatType(),
            'end_time_sec': ts.FloatType(),
@@ -140,7 +149,7 @@ class AudioSplitter(ComponentIterator):
        target_chunk_start, target_chunk_end = self.chunks_to_extract_in_pts[self.next_pos]
        chunk_start_pts = 0
        chunk_end_pts = 0
-        chunk_file = str(env.Env.get().
+        chunk_file = str(env.Env.get().create_tmp_path(self.audio_path.suffix))
        output_container = av.open(chunk_file, mode='w')
        input_stream = self.container.streams.audio[0]
        codec_name = AudioSplitter.__codec_map.get(input_stream.codec_context.name, input_stream.codec_context.name)
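
Moving the parameter checks into output_schema() means bad arguments are rejected when a view is defined rather than when the first row is split. A sketch, assuming a table media with an audio column named audio, and following the usual ComponentIterator.create pattern:

import pixeltable as pxt
from pixeltable.iterators import AudioSplitter

media = pxt.get_table('media')
chunks = pxt.create_view(
    'media_chunks',
    media,
    iterator=AudioSplitter.create(
        audio=media.audio,
        chunk_duration_sec=30.0,  # must be positive and >= min_chunk_duration_sec
        overlap_sec=2.0,          # must be < chunk_duration_sec
    ),
)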
pixeltable/iterators/image.py
CHANGED

@@ -31,8 +31,7 @@ class TileIterator(ComponentIterator):
    __j: int
 
    def __init__(self, image: PIL.Image.Image, *, tile_size: tuple[int, int], overlap: tuple[int, int] = (0, 0)):
-        if overlap[0] >= tile_size[0] or overlap[1] >= tile_size[1]:
-            raise excs.Error(f'overlap dimensions {overlap} are not strictly smaller than tile size {tile_size}')
+        assert overlap[0] < tile_size[0] and overlap[1] < tile_size[1]
 
        self.__image = image
        self.__image.load()
@@ -79,4 +78,8 @@ class TileIterator(ComponentIterator):
 
    @classmethod
    def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
+        tile_size = kwargs.get('tile_size')
+        overlap = kwargs.get('overlap', (0, 0))
+        if overlap[0] >= tile_size[0] or overlap[1] >= tile_size[1]:
+            raise excs.Error(f'overlap dimensions {overlap} are not strictly smaller than tile size {tile_size}')
        return {'tile': ts.ImageType(), 'tile_coord': ts.JsonType(), 'tile_box': ts.JsonType()}, ['tile']
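
Same pattern for TileIterator: the overlap-vs-tile-size check now runs in output_schema() at view-creation time, with __init__ keeping only an assert. A sketch, assuming a table photos with an image column named img:

import pixeltable as pxt
from pixeltable.iterators import TileIterator

photos = pxt.get_table('photos')
tiles = pxt.create_view(
    'photo_tiles',
    photos,
    # overlap must be strictly smaller than tile_size in both dimensions
    iterator=TileIterator.create(image=photos.img, tile_size=(512, 512), overlap=(64, 64)),
)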
pixeltable/metadata/schema.py
CHANGED

@@ -24,7 +24,7 @@ def md_from_dict(data_class_type: type[T], data: Any) -> T:
    """Re-instantiate a dataclass instance that contains nested dataclasses from a dict."""
    if dataclasses.is_dataclass(data_class_type):
        fieldtypes = get_type_hints(data_class_type)
-        return data_class_type(**{f: md_from_dict(fieldtypes[f], data[f]) for f in data})
+        return data_class_type(**{f: md_from_dict(fieldtypes[f], data[f]) for f in data})
 
    origin = typing.get_origin(data_class_type)
    if origin is not None:
@@ -182,6 +182,7 @@ class TableMd:
    # sequence number to track changes in the set of mutable views of this table (ie, this table = the view base)
    # - incremented for each add/drop of a mutable view
    # - only maintained for mutable tables
+    # TODO: replace with mutable_views: list[UUID] to help with debugging
    view_sn: int
 
    # Metadata format for external stores:
@@ -193,6 +194,26 @@ class TableMd:
    view_md: Optional[ViewMd]
    additional_md: dict[str, Any]
 
+    has_pending_ops: bool = False
+
+    @property
+    def is_snapshot(self) -> bool:
+        return self.view_md is not None and self.view_md.is_snapshot
+
+    @property
+    def is_mutable(self) -> bool:
+        return not self.is_snapshot and not self.is_replica
+
+    @property
+    def is_pure_snapshot(self) -> bool:
+        return (
+            self.view_md is not None
+            and self.view_md.is_snapshot
+            and self.view_md.sample_clause is None
+            and self.view_md.predicate is None
+            and len(self.column_md) == 0
+        )
+
 
 class Table(Base):
    """
@@ -215,7 +236,7 @@ class Table(Base):
    lock_dummy: orm.Mapped[int] = orm.mapped_column(BigInteger, nullable=True)
 
 
-@dataclasses.dataclass
+@dataclasses.dataclass
 class TableVersionMd:
    tbl_id: str  # uuid.UUID
    created_at: float  # time.time()
@@ -279,6 +300,22 @@ class TableSchemaVersion(Base):
    md: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False)  # TableSchemaVersionMd
 
 
+class PendingTableOp(Base):
+    """
+    Table operation that needs to be completed before the table can be used.
+
+    Operations need to be completed in order of increasing seq_num.
+    """
+
+    __tablename__ = 'pendingtableops'
+
+    tbl_id: orm.Mapped[uuid.UUID] = orm.mapped_column(
+        UUID(as_uuid=True), ForeignKey('tables.id'), primary_key=True, nullable=False
+    )
+    op_sn: orm.Mapped[int] = orm.mapped_column(Integer, primary_key=True, nullable=False)  # catalog.TableOp.op_sn
+    op: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False)  # catalog.TableOp
+
+
 @dataclasses.dataclass
 class FunctionMd:
    name: str
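
The three new TableMd predicates encode the table taxonomy in one place. A self-contained illustration of their logic using stand-in dataclasses (field subset only; names follow this diff, ViewMd details are elided):

import dataclasses
from typing import Any, Optional

@dataclasses.dataclass
class _ViewMd:  # stand-in for ViewMd
    is_snapshot: bool
    sample_clause: Optional[Any] = None
    predicate: Optional[Any] = None

@dataclasses.dataclass
class _TableMd:  # stand-in for TableMd
    view_md: Optional[_ViewMd]
    column_md: dict[str, Any]
    is_replica: bool = False

    @property
    def is_snapshot(self) -> bool:
        return self.view_md is not None and self.view_md.is_snapshot

    @property
    def is_mutable(self) -> bool:
        return not self.is_snapshot and not self.is_replica

    @property
    def is_pure_snapshot(self) -> bool:
        # a snapshot with no sampling, no predicate, and no additional columns
        return (
            self.is_snapshot
            and self.view_md.sample_clause is None
            and self.view_md.predicate is None
            and len(self.column_md) == 0
        )

assert _TableMd(view_md=_ViewMd(is_snapshot=True), column_md={}).is_pure_snapshot
assert _TableMd(view_md=None, column_md={}).is_mutable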
pixeltable/plan.py
CHANGED

@@ -385,14 +385,7 @@ class Planner:
            TableVersionHandle(tbl.id, tbl.effective_version), rows, row_builder, tbl.next_row_id
        )
 
-        media_input_col_info = [
-            exprs.ColumnSlotIdx(col_ref.col, col_ref.slot_idx)
-            for col_ref in row_builder.input_exprs
-            if isinstance(col_ref, exprs.ColumnRef) and col_ref.col_type.is_media_type()
-        ]
-        if len(media_input_col_info) > 0:
-            # prefetch external files for all input column refs
-            plan = exec.CachePrefetchNode(tbl.id, media_input_col_info, input=plan)
+        plan = cls._insert_prefetch_node(tbl.id, row_builder.input_exprs, input_node=plan)
 
        computed_exprs = row_builder.output_exprs - row_builder.input_exprs
        if len(computed_exprs) > 0:
@@ -789,15 +782,13 @@ class Planner:
 
    @classmethod
    def _insert_prefetch_node(
-        cls, tbl_id: UUID,
+        cls, tbl_id: UUID, expressions: Iterable[exprs.Expr], input_node: exec.ExecNode
    ) -> exec.ExecNode:
-        """
+        """Return a CachePrefetchNode if needed, otherwise return input"""
        # we prefetch external files for all media ColumnRefs, even those that aren't part of the dependencies
        # of output_exprs: if unstored iterator columns are present, we might need to materialize ColumnRefs that
        # aren't explicitly captured as dependencies
-        media_col_refs = [
-            e for e in list(row_builder.unique_exprs) if isinstance(e, exprs.ColumnRef) and e.col_type.is_media_type()
-        ]
+        media_col_refs = [e for e in expressions if isinstance(e, exprs.ColumnRef) and e.col_type.is_media_type()]
        if len(media_col_refs) == 0:
            return input_node
        # we need to prefetch external files for media column types
@@ -967,7 +958,7 @@ class Planner:
            stratify_exprs=analyzer.stratify_exprs,
        )
 
-        plan = cls._insert_prefetch_node(tbl.tbl_version.id, row_builder, plan)
+        plan = cls._insert_prefetch_node(tbl.tbl_version.id, row_builder.unique_exprs, plan)
 
        if analyzer.group_by_clause is not None:
            # we're doing grouping aggregation; the input of the AggregateNode are the grouping exprs plus the
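
The refactor centralizes a common planner pattern: scan a set of expressions for media ColumnRefs and wrap the plan in a CachePrefetchNode only when something needs prefetching. A generic, runnable sketch of that "conditionally wrap the input node" shape (all names here are illustrative, not Pixeltable API):

from typing import Callable, Iterable, TypeVar

E = TypeVar('E')
N = TypeVar('N')

def insert_wrapper_if_needed(
    input_node: N,
    expressions: Iterable[E],
    wants_wrapper: Callable[[E], bool],
    make_wrapper: Callable[[list[E], N], N],
) -> N:
    # collect the matching expressions (media ColumnRefs in the real helper)
    matches = [e for e in expressions if wants_wrapper(e)]
    if len(matches) == 0:
        return input_node  # nothing to prefetch: leave the plan unchanged
    return make_wrapper(matches, input_node)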
pixeltable/share/packager.py
CHANGED

@@ -1,7 +1,6 @@
 import base64
 import datetime
 import io
-import itertools
 import json
 import logging
 import tarfile
@@ -237,8 +236,7 @@ class TablePackager:
        - Videos are replaced by their first frame and resized as above
        - Documents are replaced by a thumbnail as a base64-encoded webp
        """
-
-        preview_cols = dict(itertools.islice(self.table._get_schema().items(), 0, 8))
+        preview_cols = self.table._get_schema()
        select_list = [self.table[col_name] for col_name in preview_cols]
        # First 5 rows
        rows = list(self.table.select(*select_list).head(n=5))
@@ -369,7 +367,7 @@ class TableRestorer:
        with cat.begin_xact(for_write=True):
            # Create (or update) the replica table and its ancestors, along with TableVersion instances for any
            # versions that have not been seen before.
-            cat.create_replica(catalog.Path(self.tbl_path), tbl_md)
+            cat.create_replica(catalog.Path.parse(self.tbl_path), tbl_md)
 
        # Now we need to load data for replica_tbl and its ancestors, except that we skip
        # replica_tbl itself if it's a pure snapshot.
@@ -572,16 +570,18 @@ class TableRestorer:
        for col_name in pydict:
            assert col_name in tv.store_tbl.sa_tbl.columns
            sql_types[col_name] = tv.store_tbl.sa_tbl.columns[col_name].type
-
+        media_cols: dict[str, catalog.Column] = {}
        for col in tv.cols:
            if col.is_stored and col.col_type.is_media_type():
-
+                assert tv.id == col.tbl.id
+                assert tv.version == col.tbl.version
+                media_cols[col.store_name()] = col
 
        row_count = len(next(iter(pydict.values())))
        rows: list[dict[str, Any]] = []
        for i in range(row_count):
            row = {
-                col_name: self.__from_pa_value(
+                col_name: self.__from_pa_value(col_vals[i], sql_types[col_name], media_cols.get(col_name))
                for col_name, col_vals in pydict.items()
            }
            rows.append(row)
@@ -589,19 +589,19 @@ class TableRestorer:
        return rows
 
    def __from_pa_value(
-        self,
+        self, val: Any, sql_type: sql.types.TypeEngine[Any], media_col: Optional[catalog.Column]
    ) -> Any:
        if val is None:
            return None
        if isinstance(sql_type, sql.JSON):
            return json.loads(val)
-        if
-
-            return self.__relocate_media_file(tv, media_col_id, val)
+        if media_col is not None:
+            return self.__relocate_media_file(media_col, val)
        return val
 
-    def __relocate_media_file(self,
+    def __relocate_media_file(self, media_col: catalog.Column, url: str) -> str:
        # If this is a pxtmedia:// URL, relocate it
+        assert isinstance(url, str)
        parsed_url = urllib.parse.urlparse(url)
        assert parsed_url.scheme != 'file'  # These should all have been converted to pxtmedia:// URLs
        if parsed_url.scheme == 'pxtmedia':
@@ -610,7 +610,7 @@ class TableRestorer:
            # in self.media_files.
            src_path = self.tmp_dir / 'media' / parsed_url.netloc
            # Move the file to the media store and update the URL.
-            self.media_files[url] = MediaStore.relocate_local_media_file(src_path,
+            self.media_files[url] = MediaStore.relocate_local_media_file(src_path, media_col)
            return self.media_files[url]
        # For any type of URL other than a local file, just return the URL as-is.
        return url