pixeltable 0.4.6__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +4 -2
- pixeltable/catalog/__init__.py +1 -1
- pixeltable/catalog/catalog.py +3 -3
- pixeltable/catalog/column.py +49 -0
- pixeltable/catalog/insertable_table.py +0 -7
- pixeltable/catalog/schema_object.py +1 -14
- pixeltable/catalog/table.py +139 -53
- pixeltable/catalog/table_version.py +30 -138
- pixeltable/catalog/view.py +2 -1
- pixeltable/dataframe.py +2 -3
- pixeltable/env.py +43 -5
- pixeltable/exec/expr_eval/expr_eval_node.py +2 -2
- pixeltable/exec/expr_eval/schedulers.py +36 -15
- pixeltable/exprs/array_slice.py +2 -2
- pixeltable/exprs/data_row.py +13 -0
- pixeltable/exprs/expr.py +9 -9
- pixeltable/exprs/function_call.py +2 -2
- pixeltable/exprs/globals.py +1 -2
- pixeltable/exprs/json_path.py +3 -3
- pixeltable/exprs/row_builder.py +14 -16
- pixeltable/exprs/string_op.py +3 -3
- pixeltable/func/query_template_function.py +2 -2
- pixeltable/func/signature.py +30 -3
- pixeltable/func/tools.py +2 -2
- pixeltable/functions/anthropic.py +75 -25
- pixeltable/functions/globals.py +2 -2
- pixeltable/functions/llama_cpp.py +9 -1
- pixeltable/functions/openai.py +74 -54
- pixeltable/functions/video.py +54 -1
- pixeltable/functions/vision.py +2 -2
- pixeltable/globals.py +74 -12
- pixeltable/io/datarows.py +3 -3
- pixeltable/io/fiftyone.py +4 -4
- pixeltable/io/globals.py +3 -3
- pixeltable/io/hf_datasets.py +4 -4
- pixeltable/io/pandas.py +6 -6
- pixeltable/io/parquet.py +3 -3
- pixeltable/io/table_data_conduit.py +2 -2
- pixeltable/io/utils.py +2 -2
- pixeltable/iterators/document.py +2 -2
- pixeltable/iterators/video.py +49 -9
- pixeltable/share/packager.py +45 -36
- pixeltable/store.py +5 -25
- pixeltable/type_system.py +5 -8
- pixeltable/utils/__init__.py +2 -2
- pixeltable/utils/arrow.py +5 -5
- pixeltable/utils/description_helper.py +3 -3
- pixeltable/utils/iceberg.py +1 -2
- {pixeltable-0.4.6.dist-info → pixeltable-0.4.7.dist-info}/METADATA +70 -19
- {pixeltable-0.4.6.dist-info → pixeltable-0.4.7.dist-info}/RECORD +53 -53
- {pixeltable-0.4.6.dist-info → pixeltable-0.4.7.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.6.dist-info → pixeltable-0.4.7.dist-info}/entry_points.txt +0 -0
- {pixeltable-0.4.6.dist-info → pixeltable-0.4.7.dist-info}/licenses/LICENSE +0 -0
pixeltable/functions/vision.py
CHANGED
|
@@ -14,7 +14,7 @@ t.select(pxtv.draw_bounding_boxes(t.img, boxes=t.boxes, label=t.labels)).collect
|
|
|
14
14
|
import colorsys
|
|
15
15
|
import hashlib
|
|
16
16
|
from collections import defaultdict
|
|
17
|
-
from typing import Any, Optional
|
|
17
|
+
from typing import Any, Optional
|
|
18
18
|
|
|
19
19
|
import numpy as np
|
|
20
20
|
import PIL.Image
|
|
@@ -352,7 +352,7 @@ def draw_bounding_boxes(
|
|
|
352
352
|
from PIL import ImageColor, ImageDraw, ImageFont
|
|
353
353
|
|
|
354
354
|
# set default font if not provided
|
|
355
|
-
txt_font:
|
|
355
|
+
txt_font: ImageFont.ImageFont | ImageFont.FreeTypeFont = (
|
|
356
356
|
ImageFont.load_default() if font is None else ImageFont.truetype(font=font, size=font_size or 10)
|
|
357
357
|
)
|
|
358
358
|
|
pixeltable/globals.py
CHANGED
|
@@ -3,7 +3,7 @@ from __future__ import annotations
|
|
|
3
3
|
import logging
|
|
4
4
|
import os
|
|
5
5
|
from pathlib import Path
|
|
6
|
-
from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, Optional, Union
|
|
6
|
+
from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, NamedTuple, Optional, Union
|
|
7
7
|
|
|
8
8
|
import pandas as pd
|
|
9
9
|
from pandas.io.formats.style import Styler
|
|
@@ -27,8 +27,8 @@ if TYPE_CHECKING:
|
|
|
27
27
|
RowData, # list of dictionaries
|
|
28
28
|
DataFrame, # Pixeltable DataFrame
|
|
29
29
|
pd.DataFrame, # pandas DataFrame
|
|
30
|
-
|
|
31
|
-
|
|
30
|
+
datasets.Dataset,
|
|
31
|
+
datasets.DatasetDict, # Huggingface datasets
|
|
32
32
|
]
|
|
33
33
|
|
|
34
34
|
|
|
@@ -51,7 +51,7 @@ def create_table(
|
|
|
51
51
|
source_format: Optional[Literal['csv', 'excel', 'parquet', 'json']] = None,
|
|
52
52
|
schema_overrides: Optional[dict[str, Any]] = None,
|
|
53
53
|
on_error: Literal['abort', 'ignore'] = 'abort',
|
|
54
|
-
primary_key:
|
|
54
|
+
primary_key: str | list[str] | None = None,
|
|
55
55
|
num_retained_versions: int = 10,
|
|
56
56
|
comment: str = '',
|
|
57
57
|
media_validation: Literal['on_read', 'on_write'] = 'on_write',
|
|
@@ -197,7 +197,7 @@ def create_table(
|
|
|
197
197
|
|
|
198
198
|
def create_view(
|
|
199
199
|
path: str,
|
|
200
|
-
base:
|
|
200
|
+
base: catalog.Table | DataFrame,
|
|
201
201
|
*,
|
|
202
202
|
additional_columns: Optional[dict[str, Any]] = None,
|
|
203
203
|
is_snapshot: bool = False,
|
|
@@ -317,7 +317,7 @@ def create_view(
|
|
|
317
317
|
|
|
318
318
|
def create_snapshot(
|
|
319
319
|
path_str: str,
|
|
320
|
-
base:
|
|
320
|
+
base: catalog.Table | DataFrame,
|
|
321
321
|
*,
|
|
322
322
|
additional_columns: Optional[dict[str, Any]] = None,
|
|
323
323
|
iterator: Optional[tuple[type[ComponentIterator], dict[str, Any]]] = None,
|
|
@@ -396,7 +396,7 @@ def create_snapshot(
|
|
|
396
396
|
)
|
|
397
397
|
|
|
398
398
|
|
|
399
|
-
def create_replica(destination: str, source:
|
|
399
|
+
def create_replica(destination: str, source: str | catalog.Table) -> Optional[catalog.Table]:
|
|
400
400
|
"""
|
|
401
401
|
Create a replica of a table. Can be used either to create a remote replica of a local table, or to create a local
|
|
402
402
|
replica of a remote table. A given table can have at most one replica per Pixeltable instance.
|
|
@@ -484,7 +484,7 @@ def move(path: str, new_path: str) -> None:
|
|
|
484
484
|
|
|
485
485
|
|
|
486
486
|
def drop_table(
|
|
487
|
-
table:
|
|
487
|
+
table: str | catalog.Table, force: bool = False, if_not_exists: Literal['error', 'ignore'] = 'error'
|
|
488
488
|
) -> None:
|
|
489
489
|
"""Drop a table, view, or snapshot.
|
|
490
490
|
|
|
@@ -534,6 +534,57 @@ def drop_table(
|
|
|
534
534
|
Catalog.get().drop_table(path_obj, force=force, if_not_exists=if_not_exists_)
|
|
535
535
|
|
|
536
536
|
|
|
537
|
+
def get_dir_contents(dir_path: str = '', recursive: bool = True) -> 'DirContents':
|
|
538
|
+
"""Get the contents of a Pixeltable directory.
|
|
539
|
+
|
|
540
|
+
Args:
|
|
541
|
+
dir_path: Path to the directory. Defaults to the root directory.
|
|
542
|
+
recursive: If `False`, returns only those tables and directories that are directly contained in specified
|
|
543
|
+
directory; if `True`, returns all tables and directories that are descendants of the specified directory,
|
|
544
|
+
recursively.
|
|
545
|
+
|
|
546
|
+
Returns:
|
|
547
|
+
A [`DirContents`][pixeltable.DirContents] object representing the contents of the specified directory.
|
|
548
|
+
|
|
549
|
+
Raises:
|
|
550
|
+
Error: If the path does not exist or does not designate a directory.
|
|
551
|
+
|
|
552
|
+
Examples:
|
|
553
|
+
Get contents of top-level directory:
|
|
554
|
+
|
|
555
|
+
>>> pxt.get_dir_contents()
|
|
556
|
+
|
|
557
|
+
Get contents of 'dir1':
|
|
558
|
+
|
|
559
|
+
>>> pxt.get_dir_contents('dir1')
|
|
560
|
+
"""
|
|
561
|
+
path_obj = catalog.Path.parse(dir_path, allow_empty_path=True)
|
|
562
|
+
catalog_entries = Catalog.get().get_dir_contents(path_obj, recursive=recursive)
|
|
563
|
+
dirs: list[str] = []
|
|
564
|
+
tables: list[str] = []
|
|
565
|
+
_assemble_dir_contents(dir_path, catalog_entries, dirs, tables)
|
|
566
|
+
dirs.sort()
|
|
567
|
+
tables.sort()
|
|
568
|
+
return DirContents(dirs, tables)
|
|
569
|
+
|
|
570
|
+
|
|
571
|
+
def _assemble_dir_contents(
|
|
572
|
+
dir_path: str, catalog_entries: dict[str, Catalog.DirEntry], dirs: list[str], tables: list[str]
|
|
573
|
+
) -> None:
|
|
574
|
+
for name, entry in catalog_entries.items():
|
|
575
|
+
if name.startswith('_'):
|
|
576
|
+
continue # Skip system paths
|
|
577
|
+
path = f'{dir_path}.{name}' if len(dir_path) > 0 else name
|
|
578
|
+
if entry.dir is not None:
|
|
579
|
+
dirs.append(path)
|
|
580
|
+
if entry.dir_entries is not None:
|
|
581
|
+
_assemble_dir_contents(path, entry.dir_entries, dirs, tables)
|
|
582
|
+
else:
|
|
583
|
+
assert entry.table is not None
|
|
584
|
+
assert not entry.dir_entries
|
|
585
|
+
tables.append(path)
|
|
586
|
+
|
|
587
|
+
|
|
537
588
|
def list_tables(dir_path: str = '', recursive: bool = True) -> list[str]:
|
|
538
589
|
"""List the [`Table`][pixeltable.Table]s in a directory.
|
|
539
590
|
|
|
@@ -667,8 +718,8 @@ def ls(path: str = '') -> pd.DataFrame:
|
|
|
667
718
|
This function returns a Pandas DataFrame representing a human-readable listing of the specified directory,
|
|
668
719
|
including various attributes such as version and base table, as appropriate.
|
|
669
720
|
|
|
670
|
-
To get a programmatic list of
|
|
671
|
-
|
|
721
|
+
To get a programmatic list of the directory's contents, use [get_dir_contents()][pixeltable.get_dir_contents]
|
|
722
|
+
instead.
|
|
672
723
|
"""
|
|
673
724
|
from pixeltable.catalog import retry_loop
|
|
674
725
|
from pixeltable.metadata import schema
|
|
@@ -701,7 +752,7 @@ def ls(path: str = '') -> pd.DataFrame:
|
|
|
701
752
|
kind = 'view'
|
|
702
753
|
else:
|
|
703
754
|
kind = 'table'
|
|
704
|
-
version = '' if kind == 'snapshot' else md['version']
|
|
755
|
+
version = '' if kind == 'snapshot' else str(md['version'])
|
|
705
756
|
if md['is_replica']:
|
|
706
757
|
kind = f'{kind}-replica'
|
|
707
758
|
rows.append([name, kind, version, base])
|
|
@@ -798,7 +849,7 @@ def list_functions() -> Styler:
|
|
|
798
849
|
return pd_df.hide(axis='index')
|
|
799
850
|
|
|
800
851
|
|
|
801
|
-
def tools(*args:
|
|
852
|
+
def tools(*args: func.Function | func.tools.Tool) -> func.tools.Tools:
|
|
802
853
|
"""
|
|
803
854
|
Specifies a collection of UDFs to be used as LLM tools. Pixeltable allows any UDF to be used as an input into an
|
|
804
855
|
LLM tool-calling API. To use one or more UDFs as tools, wrap them in a `pxt.tools` call and pass the return value
|
|
@@ -875,3 +926,14 @@ def configure_logging(
|
|
|
875
926
|
|
|
876
927
|
def array(elements: Iterable) -> exprs.Expr:
|
|
877
928
|
return exprs.Expr.from_array(elements)
|
|
929
|
+
|
|
930
|
+
|
|
931
|
+
class DirContents(NamedTuple):
|
|
932
|
+
"""
|
|
933
|
+
Represents the contents of a Pixeltable directory.
|
|
934
|
+
"""
|
|
935
|
+
|
|
936
|
+
dirs: list[str]
|
|
937
|
+
"""List of directory paths contained in this directory."""
|
|
938
|
+
tables: list[str]
|
|
939
|
+
"""List of table paths contained in this directory."""
|
pixeltable/io/datarows.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from typing import Any, Iterable, Optional
|
|
3
|
+
from typing import Any, Iterable, Optional
|
|
4
4
|
|
|
5
5
|
import pixeltable as pxt
|
|
6
6
|
import pixeltable.type_system as ts
|
|
@@ -61,7 +61,7 @@ def import_rows(
|
|
|
61
61
|
rows: list[dict[str, Any]],
|
|
62
62
|
*,
|
|
63
63
|
schema_overrides: Optional[dict[str, Any]] = None,
|
|
64
|
-
primary_key:
|
|
64
|
+
primary_key: str | list[str] | None = None,
|
|
65
65
|
num_retained_versions: int = 10,
|
|
66
66
|
comment: str = '',
|
|
67
67
|
) -> pxt.Table:
|
|
@@ -105,7 +105,7 @@ def import_json(
|
|
|
105
105
|
filepath_or_url: str,
|
|
106
106
|
*,
|
|
107
107
|
schema_overrides: Optional[dict[str, Any]] = None,
|
|
108
|
-
primary_key:
|
|
108
|
+
primary_key: str | list[str] | None = None,
|
|
109
109
|
num_retained_versions: int = 10,
|
|
110
110
|
comment: str = '',
|
|
111
111
|
**kwargs: Any,
|
pixeltable/io/fiftyone.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import os
|
|
2
|
-
from typing import Any, Iterator, Optional
|
|
2
|
+
from typing import Any, Iterator, Optional
|
|
3
3
|
|
|
4
4
|
import fiftyone as fo # type: ignore[import-untyped]
|
|
5
5
|
import fiftyone.utils.data as foud # type: ignore[import-untyped]
|
|
@@ -28,11 +28,11 @@ class PxtImageDatasetImporter(foud.LabeledImageDatasetImporter):
|
|
|
28
28
|
tbl: pxt.Table,
|
|
29
29
|
image: exprs.Expr,
|
|
30
30
|
image_format: str,
|
|
31
|
-
classifications:
|
|
32
|
-
detections:
|
|
31
|
+
classifications: exprs.Expr | list[exprs.Expr] | dict[str, exprs.Expr] | None = None,
|
|
32
|
+
detections: exprs.Expr | list[exprs.Expr] | dict[str, exprs.Expr] | None = None,
|
|
33
33
|
dataset_dir: Optional[os.PathLike] = None,
|
|
34
34
|
shuffle: bool = False,
|
|
35
|
-
seed:
|
|
35
|
+
seed: int | float | str | bytes | bytearray | None = None,
|
|
36
36
|
max_samples: Optional[int] = None,
|
|
37
37
|
):
|
|
38
38
|
super().__init__(dataset_dir=dataset_dir, shuffle=shuffle, seed=seed, max_samples=max_samples)
|
pixeltable/io/globals.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from typing import TYPE_CHECKING, Any, Literal, Optional
|
|
3
|
+
from typing import TYPE_CHECKING, Any, Literal, Optional
|
|
4
4
|
|
|
5
5
|
import pixeltable as pxt
|
|
6
6
|
import pixeltable.exceptions as excs
|
|
@@ -143,8 +143,8 @@ def export_images_as_fo_dataset(
|
|
|
143
143
|
tbl: pxt.Table,
|
|
144
144
|
images: exprs.Expr,
|
|
145
145
|
image_format: str = 'webp',
|
|
146
|
-
classifications:
|
|
147
|
-
detections:
|
|
146
|
+
classifications: exprs.Expr | list[exprs.Expr] | dict[str, exprs.Expr] | None = None,
|
|
147
|
+
detections: exprs.Expr | list[exprs.Expr] | dict[str, exprs.Expr] | None = None,
|
|
148
148
|
) -> 'fo.Dataset':
|
|
149
149
|
"""
|
|
150
150
|
Export images from a Pixeltable table as a Voxel51 dataset. The data must consist of a single column
|
pixeltable/io/hf_datasets.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import typing
|
|
4
|
-
from typing import Any, Optional
|
|
4
|
+
from typing import Any, Optional
|
|
5
5
|
|
|
6
6
|
import pixeltable as pxt
|
|
7
7
|
import pixeltable.type_system as ts
|
|
@@ -66,7 +66,7 @@ def _to_pixeltable_type(feature_type: Any, nullable: bool) -> Optional[ts.Column
|
|
|
66
66
|
return None
|
|
67
67
|
|
|
68
68
|
|
|
69
|
-
def _get_hf_schema(dataset:
|
|
69
|
+
def _get_hf_schema(dataset: datasets.Dataset | datasets.DatasetDict) -> datasets.Features:
|
|
70
70
|
"""Get the schema of a huggingface dataset as a dictionary."""
|
|
71
71
|
import datasets
|
|
72
72
|
|
|
@@ -91,10 +91,10 @@ def huggingface_schema_to_pxt_schema(
|
|
|
91
91
|
|
|
92
92
|
def import_huggingface_dataset(
|
|
93
93
|
table_path: str,
|
|
94
|
-
dataset:
|
|
94
|
+
dataset: datasets.Dataset | datasets.DatasetDict,
|
|
95
95
|
*,
|
|
96
96
|
schema_overrides: Optional[dict[str, Any]] = None,
|
|
97
|
-
primary_key:
|
|
97
|
+
primary_key: str | list[str] | None = None,
|
|
98
98
|
**kwargs: Any,
|
|
99
99
|
) -> pxt.Table:
|
|
100
100
|
"""Create a new base table from a Huggingface dataset, or dataset dict with multiple splits.
|
pixeltable/io/pandas.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import os
|
|
2
|
-
from typing import Any, Optional
|
|
2
|
+
from typing import Any, Optional
|
|
3
3
|
|
|
4
4
|
import numpy as np
|
|
5
5
|
import pandas as pd
|
|
@@ -17,7 +17,7 @@ def import_pandas(
|
|
|
17
17
|
df: pd.DataFrame,
|
|
18
18
|
*,
|
|
19
19
|
schema_overrides: Optional[dict[str, Any]] = None,
|
|
20
|
-
primary_key:
|
|
20
|
+
primary_key: str | list[str] | None = None,
|
|
21
21
|
num_retained_versions: int = 10,
|
|
22
22
|
comment: str = '',
|
|
23
23
|
) -> pxt.Table:
|
|
@@ -55,9 +55,9 @@ def import_pandas(
|
|
|
55
55
|
|
|
56
56
|
def import_csv(
|
|
57
57
|
tbl_name: str,
|
|
58
|
-
filepath_or_buffer:
|
|
58
|
+
filepath_or_buffer: str | os.PathLike,
|
|
59
59
|
schema_overrides: Optional[dict[str, Any]] = None,
|
|
60
|
-
primary_key:
|
|
60
|
+
primary_key: str | list[str] | None = None,
|
|
61
61
|
num_retained_versions: int = 10,
|
|
62
62
|
comment: str = '',
|
|
63
63
|
**kwargs: Any,
|
|
@@ -84,10 +84,10 @@ def import_csv(
|
|
|
84
84
|
|
|
85
85
|
def import_excel(
|
|
86
86
|
tbl_name: str,
|
|
87
|
-
io:
|
|
87
|
+
io: str | os.PathLike,
|
|
88
88
|
*,
|
|
89
89
|
schema_overrides: Optional[dict[str, Any]] = None,
|
|
90
|
-
primary_key:
|
|
90
|
+
primary_key: str | list[str] | None = None,
|
|
91
91
|
num_retained_versions: int = 10,
|
|
92
92
|
comment: str = '',
|
|
93
93
|
**kwargs: Any,
|
pixeltable/io/parquet.py
CHANGED
|
@@ -7,7 +7,7 @@ import logging
|
|
|
7
7
|
import typing
|
|
8
8
|
from collections import deque
|
|
9
9
|
from pathlib import Path
|
|
10
|
-
from typing import Any, Optional
|
|
10
|
+
from typing import Any, Optional
|
|
11
11
|
|
|
12
12
|
import numpy as np
|
|
13
13
|
import PIL.Image
|
|
@@ -42,7 +42,7 @@ def _write_batch(value_batch: dict[str, deque], schema: pa.Schema, output_path:
|
|
|
42
42
|
|
|
43
43
|
|
|
44
44
|
def export_parquet(
|
|
45
|
-
table_or_df:
|
|
45
|
+
table_or_df: pxt.Table | pxt.DataFrame,
|
|
46
46
|
parquet_path: Path,
|
|
47
47
|
partition_size_bytes: int = 100_000_000,
|
|
48
48
|
inline_images: bool = False,
|
|
@@ -152,7 +152,7 @@ def import_parquet(
|
|
|
152
152
|
*,
|
|
153
153
|
parquet_path: str,
|
|
154
154
|
schema_overrides: Optional[dict[str, Any]] = None,
|
|
155
|
-
primary_key:
|
|
155
|
+
primary_key: str | list[str] | None = None,
|
|
156
156
|
**kwargs: Any,
|
|
157
157
|
) -> pxt.Table:
|
|
158
158
|
"""Creates a new base table from a Parquet file or set of files. Requires pyarrow to be installed.
|
pixeltable/io/table_data_conduit.py
CHANGED
|
@@ -8,7 +8,7 @@ import urllib.parse
|
|
|
8
8
|
import urllib.request
|
|
9
9
|
from dataclasses import dataclass, field, fields
|
|
10
10
|
from pathlib import Path
|
|
11
|
-
from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, Optional,
|
|
11
|
+
from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, Optional, cast
|
|
12
12
|
|
|
13
13
|
import pandas as pd
|
|
14
14
|
from pyarrow.parquet import ParquetDataset
|
|
@@ -325,7 +325,7 @@ class JsonTableDataConduit(TableDataConduit):
|
|
|
325
325
|
|
|
326
326
|
|
|
327
327
|
class HFTableDataConduit(TableDataConduit):
|
|
328
|
-
hf_ds:
|
|
328
|
+
hf_ds: datasets.Dataset | datasets.DatasetDict | None = None
|
|
329
329
|
column_name_for_split: Optional[str] = None
|
|
330
330
|
categorical_features: dict[str, dict[int, str]]
|
|
331
331
|
dataset_dict: dict[str, datasets.Dataset] = None
|
pixeltable/io/utils.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
from keyword import iskeyword as is_python_keyword
|
|
2
|
-
from typing import Any, Optional
|
|
2
|
+
from typing import Any, Optional
|
|
3
3
|
|
|
4
4
|
import pixeltable as pxt
|
|
5
5
|
import pixeltable.exceptions as excs
|
|
@@ -21,7 +21,7 @@ def normalize_pxt_col_name(name: str) -> str:
|
|
|
21
21
|
return id
|
|
22
22
|
|
|
23
23
|
|
|
24
|
-
def normalize_primary_key_parameter(primary_key:
|
|
24
|
+
def normalize_primary_key_parameter(primary_key: str | list[str] | None = None) -> list[str]:
|
|
25
25
|
if primary_key is None:
|
|
26
26
|
primary_key = []
|
|
27
27
|
elif isinstance(primary_key, str):
|
pixeltable/iterators/document.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import dataclasses
|
|
2
2
|
import enum
|
|
3
3
|
import logging
|
|
4
|
-
from typing import Any, ClassVar, Iterable, Iterator, Optional
|
|
4
|
+
from typing import Any, ClassVar, Iterable, Iterator, Optional
|
|
5
5
|
|
|
6
6
|
import ftfy
|
|
7
7
|
|
|
@@ -273,7 +273,7 @@ class DocumentSplitter(ComponentIterator):
|
|
|
273
273
|
yield DocumentSection(text=full_text, metadata=md)
|
|
274
274
|
accumulated_text = []
|
|
275
275
|
|
|
276
|
-
def process_element(el:
|
|
276
|
+
def process_element(el: bs4.element.Tag | bs4.NavigableString) -> Iterator[DocumentSection]:
|
|
277
277
|
# process the element and emit sections as necessary
|
|
278
278
|
nonlocal accumulated_text, headings, sourceline, emit_on_heading, emit_on_paragraph
|
|
279
279
|
|
pixeltable/iterators/video.py
CHANGED
|
@@ -29,12 +29,29 @@ class FrameIterator(ComponentIterator):
|
|
|
29
29
|
extracted). If `fps` is greater than the frame rate of the video, an error will be raised.
|
|
30
30
|
num_frames: Exact number of frames to extract. The frames will be spaced as evenly as possible. If
|
|
31
31
|
`num_frames` is greater than the number of frames in the video, all frames will be extracted.
|
|
32
|
+
all_frame_attrs:
|
|
33
|
+
If True, outputs a `pxt.Json` column `frame_attrs` with the following `pyav`-provided attributes
|
|
34
|
+
(for more information, see `pyav`'s documentation on
|
|
35
|
+
[VideoFrame](https://pyav.org/docs/develop/api/video.html#module-av.video.frame) and
|
|
36
|
+
[Frame](https://pyav.org/docs/develop/api/frame.html)):
|
|
37
|
+
|
|
38
|
+
* `index` (`int`)
|
|
39
|
+
* `pts` (`Optional[int]`)
|
|
40
|
+
* `dts` (`Optional[int]`)
|
|
41
|
+
* `time` (`Optional[float]`)
|
|
42
|
+
* `is_corrupt` (`bool`)
|
|
43
|
+
* `key_frame` (`bool`)
|
|
44
|
+
* `pict_type` (`int`)
|
|
45
|
+
* `interlaced_frame` (`bool`)
|
|
46
|
+
|
|
47
|
+
If False, only outputs frame attributes `frame_idx`, `pos_msec`, and `pos_frame` as separate columns.
|
|
32
48
|
"""
|
|
33
49
|
|
|
34
50
|
# Input parameters
|
|
35
51
|
video_path: Path
|
|
36
52
|
fps: Optional[float]
|
|
37
53
|
num_frames: Optional[int]
|
|
54
|
+
all_frame_attrs: bool
|
|
38
55
|
|
|
39
56
|
# Video info
|
|
40
57
|
container: av.container.input.InputContainer
|
|
@@ -50,7 +67,14 @@ class FrameIterator(ComponentIterator):
|
|
|
50
67
|
# frame index in the video. Otherwise, the corresponding video index is `frames_to_extract[next_pos]`.
|
|
51
68
|
next_pos: int
|
|
52
69
|
|
|
53
|
-
def __init__(
|
|
70
|
+
def __init__(
|
|
71
|
+
self,
|
|
72
|
+
video: str,
|
|
73
|
+
*,
|
|
74
|
+
fps: Optional[float] = None,
|
|
75
|
+
num_frames: Optional[int] = None,
|
|
76
|
+
all_frame_attrs: bool = False,
|
|
77
|
+
):
|
|
54
78
|
if fps is not None and num_frames is not None:
|
|
55
79
|
raise excs.Error('At most one of `fps` or `num_frames` may be specified')
|
|
56
80
|
|
|
@@ -60,6 +84,7 @@ class FrameIterator(ComponentIterator):
|
|
|
60
84
|
self.container = av.open(str(video_path))
|
|
61
85
|
self.fps = fps
|
|
62
86
|
self.num_frames = num_frames
|
|
87
|
+
self.all_frame_attrs = all_frame_attrs
|
|
63
88
|
|
|
64
89
|
self.video_framerate = self.container.streams.video[0].average_rate
|
|
65
90
|
self.video_time_base = self.container.streams.video[0].time_base
|
|
@@ -115,16 +140,17 @@ class FrameIterator(ComponentIterator):
|
|
|
115
140
|
'video': ts.VideoType(nullable=False),
|
|
116
141
|
'fps': ts.FloatType(nullable=True),
|
|
117
142
|
'num_frames': ts.IntType(nullable=True),
|
|
143
|
+
'all_frame_attrs': ts.BoolType(nullable=False),
|
|
118
144
|
}
|
|
119
145
|
|
|
120
146
|
@classmethod
|
|
121
147
|
def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
'
|
|
125
|
-
|
|
126
|
-
'
|
|
127
|
-
}, ['frame']
|
|
148
|
+
attrs: dict[str, ts.ColumnType]
|
|
149
|
+
if kwargs.get('all_frame_attrs'):
|
|
150
|
+
attrs = {'frame_attrs': ts.JsonType()}
|
|
151
|
+
else:
|
|
152
|
+
attrs = {'frame_idx': ts.IntType(), 'pos_msec': ts.FloatType(), 'pos_frame': ts.IntType()}
|
|
153
|
+
return {**attrs, 'frame': ts.ImageType()}, ['frame']
|
|
128
154
|
|
|
129
155
|
def __next__(self) -> dict[str, Any]:
|
|
130
156
|
# Determine the frame index in the video corresponding to the iterator index `next_pos`;
|
|
@@ -164,8 +190,22 @@ class FrameIterator(ComponentIterator):
|
|
|
164
190
|
raise excs.Error(f'Frame {next_video_idx} is missing from the video (video file is corrupt)')
|
|
165
191
|
img = frame.to_image()
|
|
166
192
|
assert isinstance(img, PIL.Image.Image)
|
|
167
|
-
|
|
168
|
-
result
|
|
193
|
+
pts_msec = float(pts * self.video_time_base * 1000)
|
|
194
|
+
result: dict[str, Any] = {'frame': img}
|
|
195
|
+
if self.all_frame_attrs:
|
|
196
|
+
attrs = {
|
|
197
|
+
'index': video_idx,
|
|
198
|
+
'pts': frame.pts,
|
|
199
|
+
'dts': frame.dts,
|
|
200
|
+
'time': frame.time,
|
|
201
|
+
'is_corrupt': frame.is_corrupt,
|
|
202
|
+
'key_frame': frame.key_frame,
|
|
203
|
+
'pict_type': frame.pict_type,
|
|
204
|
+
'interlaced_frame': frame.interlaced_frame,
|
|
205
|
+
}
|
|
206
|
+
result['frame_attrs'] = attrs
|
|
207
|
+
else:
|
|
208
|
+
result.update({'frame_idx': self.next_pos, 'pos_msec': pts_msec, 'pos_frame': video_idx})
|
|
169
209
|
self.next_pos += 1
|
|
170
210
|
return result
|
|
171
211
|
|
pixeltable/share/packager.py
CHANGED
|
@@ -459,42 +459,51 @@ class TableRestorer:
|
|
|
459
459
|
for col_name, col in temp_cols.items()
|
|
460
460
|
if col_name not in system_col_names and col_name not in media_col_names
|
|
461
461
|
]
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
)
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
462
|
+
|
|
463
|
+
q: sql.Executable
|
|
464
|
+
|
|
465
|
+
assert len(value_store_cols) == len(value_temp_cols)
|
|
466
|
+
if len(value_store_cols) > 0:
|
|
467
|
+
mismatch_predicates = [
|
|
468
|
+
store_col != temp_col for store_col, temp_col in zip(value_store_cols, value_temp_cols)
|
|
469
|
+
]
|
|
470
|
+
mismatch_clause = sql.or_(*mismatch_predicates)
|
|
471
|
+
|
|
472
|
+
# This query looks for rows that have matching primary keys (rowid + pos_k + v_min), but differ in at least
|
|
473
|
+
# one value column. Pseudo-SQL:
|
|
474
|
+
#
|
|
475
|
+
# SELECT store_tbl.col_0, ..., store_tbl.col_n, temp_tbl.col_0, ..., temp_tbl.col_n
|
|
476
|
+
# FROM store_tbl, temp_tbl
|
|
477
|
+
# WHERE store_tbl.rowid = temp_tbl.rowid
|
|
478
|
+
# AND store_tbl.pos_0 = temp_tbl.pos_0
|
|
479
|
+
# AND ... AND store_tbl.pos_k = temp_tbl.pos_k
|
|
480
|
+
# AND store_tbl.v_min = temp_tbl.v_min
|
|
481
|
+
# AND (
|
|
482
|
+
# store_tbl.col_0 != temp_tbl.col_0
|
|
483
|
+
# OR store_tbl.col_1 != temp_tbl.col_1
|
|
484
|
+
# OR ... OR store_tbl.col_n != temp_tbl.col_n
|
|
485
|
+
# )
|
|
486
|
+
#
|
|
487
|
+
# The value column comparisons (store_tbl.col_0 != temp_tbl.col_0, etc.) will always be false for rows where
|
|
488
|
+
# either column is NULL; this is what we want, since it may indicate a column that is present in one version
|
|
489
|
+
# but not the other.
|
|
490
|
+
q = sql.select(*value_store_cols, *value_temp_cols).where(pk_clause).where(mismatch_clause)
|
|
491
|
+
_logger.debug(q.compile())
|
|
492
|
+
result = conn.execute(q)
|
|
493
|
+
if result.rowcount > 0:
|
|
494
|
+
_logger.debug(
|
|
495
|
+
f'Data corruption error between {temp_sa_tbl_name!r} and {store_sa_tbl_name!r}: '
|
|
496
|
+
f'{result.rowcount} inconsistent row(s).'
|
|
497
|
+
)
|
|
498
|
+
row = result.first()
|
|
499
|
+
_logger.debug('Example mismatch:')
|
|
500
|
+
_logger.debug(f'{store_sa_tbl_name}: {row[: len(value_store_cols)]}')
|
|
501
|
+
_logger.debug(f'{temp_sa_tbl_name}: {row[len(value_store_cols) :]}')
|
|
502
|
+
raise excs.Error(
|
|
503
|
+
'Data corruption error: '
|
|
504
|
+
'the replica data are inconsistent with data retrieved from a previous replica.'
|
|
505
|
+
)
|
|
506
|
+
|
|
498
507
|
_logger.debug(f'Verified data integrity between {store_sa_tbl_name!r} and {temp_sa_tbl_name!r}.')
|
|
499
508
|
|
|
500
509
|
# Now rectify the v_max values in the temporary table.
|