pixeltable-0.4.6-py3-none-any.whl → pixeltable-0.4.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (53)
  1. pixeltable/__init__.py +4 -2
  2. pixeltable/catalog/__init__.py +1 -1
  3. pixeltable/catalog/catalog.py +3 -3
  4. pixeltable/catalog/column.py +49 -0
  5. pixeltable/catalog/insertable_table.py +0 -7
  6. pixeltable/catalog/schema_object.py +1 -14
  7. pixeltable/catalog/table.py +139 -53
  8. pixeltable/catalog/table_version.py +30 -138
  9. pixeltable/catalog/view.py +2 -1
  10. pixeltable/dataframe.py +2 -3
  11. pixeltable/env.py +43 -5
  12. pixeltable/exec/expr_eval/expr_eval_node.py +2 -2
  13. pixeltable/exec/expr_eval/schedulers.py +36 -15
  14. pixeltable/exprs/array_slice.py +2 -2
  15. pixeltable/exprs/data_row.py +13 -0
  16. pixeltable/exprs/expr.py +9 -9
  17. pixeltable/exprs/function_call.py +2 -2
  18. pixeltable/exprs/globals.py +1 -2
  19. pixeltable/exprs/json_path.py +3 -3
  20. pixeltable/exprs/row_builder.py +14 -16
  21. pixeltable/exprs/string_op.py +3 -3
  22. pixeltable/func/query_template_function.py +2 -2
  23. pixeltable/func/signature.py +30 -3
  24. pixeltable/func/tools.py +2 -2
  25. pixeltable/functions/anthropic.py +75 -25
  26. pixeltable/functions/globals.py +2 -2
  27. pixeltable/functions/llama_cpp.py +9 -1
  28. pixeltable/functions/openai.py +74 -54
  29. pixeltable/functions/video.py +54 -1
  30. pixeltable/functions/vision.py +2 -2
  31. pixeltable/globals.py +74 -12
  32. pixeltable/io/datarows.py +3 -3
  33. pixeltable/io/fiftyone.py +4 -4
  34. pixeltable/io/globals.py +3 -3
  35. pixeltable/io/hf_datasets.py +4 -4
  36. pixeltable/io/pandas.py +6 -6
  37. pixeltable/io/parquet.py +3 -3
  38. pixeltable/io/table_data_conduit.py +2 -2
  39. pixeltable/io/utils.py +2 -2
  40. pixeltable/iterators/document.py +2 -2
  41. pixeltable/iterators/video.py +49 -9
  42. pixeltable/share/packager.py +45 -36
  43. pixeltable/store.py +5 -25
  44. pixeltable/type_system.py +5 -8
  45. pixeltable/utils/__init__.py +2 -2
  46. pixeltable/utils/arrow.py +5 -5
  47. pixeltable/utils/description_helper.py +3 -3
  48. pixeltable/utils/iceberg.py +1 -2
  49. {pixeltable-0.4.6.dist-info → pixeltable-0.4.7.dist-info}/METADATA +70 -19
  50. {pixeltable-0.4.6.dist-info → pixeltable-0.4.7.dist-info}/RECORD +53 -53
  51. {pixeltable-0.4.6.dist-info → pixeltable-0.4.7.dist-info}/WHEEL +0 -0
  52. {pixeltable-0.4.6.dist-info → pixeltable-0.4.7.dist-info}/entry_points.txt +0 -0
  53. {pixeltable-0.4.6.dist-info → pixeltable-0.4.7.dist-info}/licenses/LICENSE +0 -0
@@ -14,7 +14,7 @@ t.select(pxtv.draw_bounding_boxes(t.img, boxes=t.boxes, label=t.labels)).collect
14
14
  import colorsys
15
15
  import hashlib
16
16
  from collections import defaultdict
17
- from typing import Any, Optional, Union
17
+ from typing import Any, Optional
18
18
 
19
19
  import numpy as np
20
20
  import PIL.Image
@@ -352,7 +352,7 @@ def draw_bounding_boxes(
352
352
  from PIL import ImageColor, ImageDraw, ImageFont
353
353
 
354
354
  # set default font if not provided
355
- txt_font: Union[ImageFont.ImageFont, ImageFont.FreeTypeFont] = (
355
+ txt_font: ImageFont.ImageFont | ImageFont.FreeTypeFont = (
356
356
  ImageFont.load_default() if font is None else ImageFont.truetype(font=font, size=font_size or 10)
357
357
  )
358
358
 
pixeltable/globals.py CHANGED
@@ -3,7 +3,7 @@ from __future__ import annotations
3
3
  import logging
4
4
  import os
5
5
  from pathlib import Path
6
- from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, Optional, Union
6
+ from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, NamedTuple, Optional, Union
7
7
 
8
8
  import pandas as pd
9
9
  from pandas.io.formats.style import Styler
@@ -27,8 +27,8 @@ if TYPE_CHECKING:
27
27
  RowData, # list of dictionaries
28
28
  DataFrame, # Pixeltable DataFrame
29
29
  pd.DataFrame, # pandas DataFrame
30
- 'datasets.Dataset',
31
- 'datasets.DatasetDict', # Huggingface datasets
30
+ datasets.Dataset,
31
+ datasets.DatasetDict, # Huggingface datasets
32
32
  ]
33
33
 
34
34
 
@@ -51,7 +51,7 @@ def create_table(
51
51
  source_format: Optional[Literal['csv', 'excel', 'parquet', 'json']] = None,
52
52
  schema_overrides: Optional[dict[str, Any]] = None,
53
53
  on_error: Literal['abort', 'ignore'] = 'abort',
54
- primary_key: Optional[Union[str, list[str]]] = None,
54
+ primary_key: str | list[str] | None = None,
55
55
  num_retained_versions: int = 10,
56
56
  comment: str = '',
57
57
  media_validation: Literal['on_read', 'on_write'] = 'on_write',
@@ -197,7 +197,7 @@ def create_table(
197
197
 
198
198
  def create_view(
199
199
  path: str,
200
- base: Union[catalog.Table, DataFrame],
200
+ base: catalog.Table | DataFrame,
201
201
  *,
202
202
  additional_columns: Optional[dict[str, Any]] = None,
203
203
  is_snapshot: bool = False,
@@ -317,7 +317,7 @@ def create_view(
317
317
 
318
318
  def create_snapshot(
319
319
  path_str: str,
320
- base: Union[catalog.Table, DataFrame],
320
+ base: catalog.Table | DataFrame,
321
321
  *,
322
322
  additional_columns: Optional[dict[str, Any]] = None,
323
323
  iterator: Optional[tuple[type[ComponentIterator], dict[str, Any]]] = None,
@@ -396,7 +396,7 @@ def create_snapshot(
396
396
  )
397
397
 
398
398
 
399
- def create_replica(destination: str, source: Union[str, catalog.Table]) -> Optional[catalog.Table]:
399
+ def create_replica(destination: str, source: str | catalog.Table) -> Optional[catalog.Table]:
400
400
  """
401
401
  Create a replica of a table. Can be used either to create a remote replica of a local table, or to create a local
402
402
  replica of a remote table. A given table can have at most one replica per Pixeltable instance.
@@ -484,7 +484,7 @@ def move(path: str, new_path: str) -> None:
484
484
 
485
485
 
486
486
  def drop_table(
487
- table: Union[str, catalog.Table], force: bool = False, if_not_exists: Literal['error', 'ignore'] = 'error'
487
+ table: str | catalog.Table, force: bool = False, if_not_exists: Literal['error', 'ignore'] = 'error'
488
488
  ) -> None:
489
489
  """Drop a table, view, or snapshot.
490
490
 
@@ -534,6 +534,57 @@ def drop_table(
534
534
  Catalog.get().drop_table(path_obj, force=force, if_not_exists=if_not_exists_)
535
535
 
536
536
 
537
+ def get_dir_contents(dir_path: str = '', recursive: bool = True) -> 'DirContents':
538
+ """Get the contents of a Pixeltable directory.
539
+
540
+ Args:
541
+ dir_path: Path to the directory. Defaults to the root directory.
542
+ recursive: If `False`, returns only those tables and directories that are directly contained in specified
543
+ directory; if `True`, returns all tables and directories that are descendants of the specified directory,
544
+ recursively.
545
+
546
+ Returns:
547
+ A [`DirContents`][pixeltable.DirContents] object representing the contents of the specified directory.
548
+
549
+ Raises:
550
+ Error: If the path does not exist or does not designate a directory.
551
+
552
+ Examples:
553
+ Get contents of top-level directory:
554
+
555
+ >>> pxt.get_dir_contents()
556
+
557
+ Get contents of 'dir1':
558
+
559
+ >>> pxt.get_dir_contents('dir1')
560
+ """
561
+ path_obj = catalog.Path.parse(dir_path, allow_empty_path=True)
562
+ catalog_entries = Catalog.get().get_dir_contents(path_obj, recursive=recursive)
563
+ dirs: list[str] = []
564
+ tables: list[str] = []
565
+ _assemble_dir_contents(dir_path, catalog_entries, dirs, tables)
566
+ dirs.sort()
567
+ tables.sort()
568
+ return DirContents(dirs, tables)
569
+
570
+
571
+ def _assemble_dir_contents(
572
+ dir_path: str, catalog_entries: dict[str, Catalog.DirEntry], dirs: list[str], tables: list[str]
573
+ ) -> None:
574
+ for name, entry in catalog_entries.items():
575
+ if name.startswith('_'):
576
+ continue # Skip system paths
577
+ path = f'{dir_path}.{name}' if len(dir_path) > 0 else name
578
+ if entry.dir is not None:
579
+ dirs.append(path)
580
+ if entry.dir_entries is not None:
581
+ _assemble_dir_contents(path, entry.dir_entries, dirs, tables)
582
+ else:
583
+ assert entry.table is not None
584
+ assert not entry.dir_entries
585
+ tables.append(path)
586
+
587
+
537
588
  def list_tables(dir_path: str = '', recursive: bool = True) -> list[str]:
538
589
  """List the [`Table`][pixeltable.Table]s in a directory.
539
590
 
@@ -667,8 +718,8 @@ def ls(path: str = '') -> pd.DataFrame:
667
718
  This function returns a Pandas DataFrame representing a human-readable listing of the specified directory,
668
719
  including various attributes such as version and base table, as appropriate.
669
720
 
670
- To get a programmatic list of tables and/or directories, use [list_tables()][pixeltable.list_tables] and/or
671
- [list_dirs()][pixeltable.list_dirs] instead.
721
+ To get a programmatic list of the directory's contents, use [get_dir_contents()][pixeltable.get_dir_contents]
722
+ instead.
672
723
  """
673
724
  from pixeltable.catalog import retry_loop
674
725
  from pixeltable.metadata import schema
@@ -701,7 +752,7 @@ def ls(path: str = '') -> pd.DataFrame:
701
752
  kind = 'view'
702
753
  else:
703
754
  kind = 'table'
704
- version = '' if kind == 'snapshot' else md['version']
755
+ version = '' if kind == 'snapshot' else str(md['version'])
705
756
  if md['is_replica']:
706
757
  kind = f'{kind}-replica'
707
758
  rows.append([name, kind, version, base])
@@ -798,7 +849,7 @@ def list_functions() -> Styler:
798
849
  return pd_df.hide(axis='index')
799
850
 
800
851
 
801
- def tools(*args: Union[func.Function, func.tools.Tool]) -> func.tools.Tools:
852
+ def tools(*args: func.Function | func.tools.Tool) -> func.tools.Tools:
802
853
  """
803
854
  Specifies a collection of UDFs to be used as LLM tools. Pixeltable allows any UDF to be used as an input into an
804
855
  LLM tool-calling API. To use one or more UDFs as tools, wrap them in a `pxt.tools` call and pass the return value
@@ -875,3 +926,14 @@ def configure_logging(
875
926
 
876
927
  def array(elements: Iterable) -> exprs.Expr:
877
928
  return exprs.Expr.from_array(elements)
929
+
930
+
931
+ class DirContents(NamedTuple):
932
+ """
933
+ Represents the contents of a Pixeltable directory.
934
+ """
935
+
936
+ dirs: list[str]
937
+ """List of directory paths contained in this directory."""
938
+ tables: list[str]
939
+ """List of table paths contained in this directory."""
pixeltable/io/datarows.py CHANGED
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import Any, Iterable, Optional, Union
3
+ from typing import Any, Iterable, Optional
4
4
 
5
5
  import pixeltable as pxt
6
6
  import pixeltable.type_system as ts
@@ -61,7 +61,7 @@ def import_rows(
61
61
  rows: list[dict[str, Any]],
62
62
  *,
63
63
  schema_overrides: Optional[dict[str, Any]] = None,
64
- primary_key: Optional[Union[str, list[str]]] = None,
64
+ primary_key: str | list[str] | None = None,
65
65
  num_retained_versions: int = 10,
66
66
  comment: str = '',
67
67
  ) -> pxt.Table:
@@ -105,7 +105,7 @@ def import_json(
105
105
  filepath_or_url: str,
106
106
  *,
107
107
  schema_overrides: Optional[dict[str, Any]] = None,
108
- primary_key: Optional[Union[str, list[str]]] = None,
108
+ primary_key: str | list[str] | None = None,
109
109
  num_retained_versions: int = 10,
110
110
  comment: str = '',
111
111
  **kwargs: Any,
pixeltable/io/fiftyone.py CHANGED
@@ -1,5 +1,5 @@
1
1
  import os
2
- from typing import Any, Iterator, Optional, Union
2
+ from typing import Any, Iterator, Optional
3
3
 
4
4
  import fiftyone as fo # type: ignore[import-untyped]
5
5
  import fiftyone.utils.data as foud # type: ignore[import-untyped]
@@ -28,11 +28,11 @@ class PxtImageDatasetImporter(foud.LabeledImageDatasetImporter):
28
28
  tbl: pxt.Table,
29
29
  image: exprs.Expr,
30
30
  image_format: str,
31
- classifications: Union[exprs.Expr, list[exprs.Expr], dict[str, exprs.Expr], None] = None,
32
- detections: Union[exprs.Expr, list[exprs.Expr], dict[str, exprs.Expr], None] = None,
31
+ classifications: exprs.Expr | list[exprs.Expr] | dict[str, exprs.Expr] | None = None,
32
+ detections: exprs.Expr | list[exprs.Expr] | dict[str, exprs.Expr] | None = None,
33
33
  dataset_dir: Optional[os.PathLike] = None,
34
34
  shuffle: bool = False,
35
- seed: Union[int, float, str, bytes, bytearray, None] = None,
35
+ seed: int | float | str | bytes | bytearray | None = None,
36
36
  max_samples: Optional[int] = None,
37
37
  ):
38
38
  super().__init__(dataset_dir=dataset_dir, shuffle=shuffle, seed=seed, max_samples=max_samples)
pixeltable/io/globals.py CHANGED
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import TYPE_CHECKING, Any, Literal, Optional, Union
3
+ from typing import TYPE_CHECKING, Any, Literal, Optional
4
4
 
5
5
  import pixeltable as pxt
6
6
  import pixeltable.exceptions as excs
@@ -143,8 +143,8 @@ def export_images_as_fo_dataset(
143
143
  tbl: pxt.Table,
144
144
  images: exprs.Expr,
145
145
  image_format: str = 'webp',
146
- classifications: Union[exprs.Expr, list[exprs.Expr], dict[str, exprs.Expr], None] = None,
147
- detections: Union[exprs.Expr, list[exprs.Expr], dict[str, exprs.Expr], None] = None,
146
+ classifications: exprs.Expr | list[exprs.Expr] | dict[str, exprs.Expr] | None = None,
147
+ detections: exprs.Expr | list[exprs.Expr] | dict[str, exprs.Expr] | None = None,
148
148
  ) -> 'fo.Dataset':
149
149
  """
150
150
  Export images from a Pixeltable table as a Voxel51 dataset. The data must consist of a single column
@@ -1,7 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import typing
4
- from typing import Any, Optional, Union
4
+ from typing import Any, Optional
5
5
 
6
6
  import pixeltable as pxt
7
7
  import pixeltable.type_system as ts
@@ -66,7 +66,7 @@ def _to_pixeltable_type(feature_type: Any, nullable: bool) -> Optional[ts.Column
66
66
  return None
67
67
 
68
68
 
69
- def _get_hf_schema(dataset: Union[datasets.Dataset, datasets.DatasetDict]) -> datasets.Features:
69
+ def _get_hf_schema(dataset: datasets.Dataset | datasets.DatasetDict) -> datasets.Features:
70
70
  """Get the schema of a huggingface dataset as a dictionary."""
71
71
  import datasets
72
72
 
@@ -91,10 +91,10 @@ def huggingface_schema_to_pxt_schema(
91
91
 
92
92
  def import_huggingface_dataset(
93
93
  table_path: str,
94
- dataset: Union[datasets.Dataset, datasets.DatasetDict],
94
+ dataset: datasets.Dataset | datasets.DatasetDict,
95
95
  *,
96
96
  schema_overrides: Optional[dict[str, Any]] = None,
97
- primary_key: Optional[Union[str, list[str]]] = None,
97
+ primary_key: str | list[str] | None = None,
98
98
  **kwargs: Any,
99
99
  ) -> pxt.Table:
100
100
  """Create a new base table from a Huggingface dataset, or dataset dict with multiple splits.
pixeltable/io/pandas.py CHANGED
@@ -1,5 +1,5 @@
1
1
  import os
2
- from typing import Any, Optional, Union
2
+ from typing import Any, Optional
3
3
 
4
4
  import numpy as np
5
5
  import pandas as pd
@@ -17,7 +17,7 @@ def import_pandas(
17
17
  df: pd.DataFrame,
18
18
  *,
19
19
  schema_overrides: Optional[dict[str, Any]] = None,
20
- primary_key: Optional[Union[str, list[str]]] = None,
20
+ primary_key: str | list[str] | None = None,
21
21
  num_retained_versions: int = 10,
22
22
  comment: str = '',
23
23
  ) -> pxt.Table:
@@ -55,9 +55,9 @@ def import_pandas(
55
55
 
56
56
  def import_csv(
57
57
  tbl_name: str,
58
- filepath_or_buffer: Union[str, os.PathLike],
58
+ filepath_or_buffer: str | os.PathLike,
59
59
  schema_overrides: Optional[dict[str, Any]] = None,
60
- primary_key: Optional[Union[str, list[str]]] = None,
60
+ primary_key: str | list[str] | None = None,
61
61
  num_retained_versions: int = 10,
62
62
  comment: str = '',
63
63
  **kwargs: Any,
@@ -84,10 +84,10 @@ def import_csv(
84
84
 
85
85
  def import_excel(
86
86
  tbl_name: str,
87
- io: Union[str, os.PathLike],
87
+ io: str | os.PathLike,
88
88
  *,
89
89
  schema_overrides: Optional[dict[str, Any]] = None,
90
- primary_key: Optional[Union[str, list[str]]] = None,
90
+ primary_key: str | list[str] | None = None,
91
91
  num_retained_versions: int = 10,
92
92
  comment: str = '',
93
93
  **kwargs: Any,
pixeltable/io/parquet.py CHANGED
@@ -7,7 +7,7 @@ import logging
7
7
  import typing
8
8
  from collections import deque
9
9
  from pathlib import Path
10
- from typing import Any, Optional, Union
10
+ from typing import Any, Optional
11
11
 
12
12
  import numpy as np
13
13
  import PIL.Image
@@ -42,7 +42,7 @@ def _write_batch(value_batch: dict[str, deque], schema: pa.Schema, output_path:
42
42
 
43
43
 
44
44
  def export_parquet(
45
- table_or_df: Union[pxt.Table, pxt.DataFrame],
45
+ table_or_df: pxt.Table | pxt.DataFrame,
46
46
  parquet_path: Path,
47
47
  partition_size_bytes: int = 100_000_000,
48
48
  inline_images: bool = False,
@@ -152,7 +152,7 @@ def import_parquet(
152
152
  *,
153
153
  parquet_path: str,
154
154
  schema_overrides: Optional[dict[str, Any]] = None,
155
- primary_key: Optional[Union[str, list[str]]] = None,
155
+ primary_key: str | list[str] | None = None,
156
156
  **kwargs: Any,
157
157
  ) -> pxt.Table:
158
158
  """Creates a new base table from a Parquet file or set of files. Requires pyarrow to be installed.
@@ -8,7 +8,7 @@ import urllib.parse
8
8
  import urllib.request
9
9
  from dataclasses import dataclass, field, fields
10
10
  from pathlib import Path
11
- from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, Optional, Union, cast
11
+ from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, Optional, cast
12
12
 
13
13
  import pandas as pd
14
14
  from pyarrow.parquet import ParquetDataset
@@ -325,7 +325,7 @@ class JsonTableDataConduit(TableDataConduit):
325
325
 
326
326
 
327
327
  class HFTableDataConduit(TableDataConduit):
328
- hf_ds: Optional[Union[datasets.Dataset, datasets.DatasetDict]] = None
328
+ hf_ds: datasets.Dataset | datasets.DatasetDict | None = None
329
329
  column_name_for_split: Optional[str] = None
330
330
  categorical_features: dict[str, dict[int, str]]
331
331
  dataset_dict: dict[str, datasets.Dataset] = None
pixeltable/io/utils.py CHANGED
@@ -1,5 +1,5 @@
1
1
  from keyword import iskeyword as is_python_keyword
2
- from typing import Any, Optional, Union
2
+ from typing import Any, Optional
3
3
 
4
4
  import pixeltable as pxt
5
5
  import pixeltable.exceptions as excs
@@ -21,7 +21,7 @@ def normalize_pxt_col_name(name: str) -> str:
21
21
  return id
22
22
 
23
23
 
24
- def normalize_primary_key_parameter(primary_key: Optional[Union[str, list[str]]] = None) -> list[str]:
24
+ def normalize_primary_key_parameter(primary_key: str | list[str] | None = None) -> list[str]:
25
25
  if primary_key is None:
26
26
  primary_key = []
27
27
  elif isinstance(primary_key, str):
@@ -1,7 +1,7 @@
1
1
  import dataclasses
2
2
  import enum
3
3
  import logging
4
- from typing import Any, ClassVar, Iterable, Iterator, Optional, Union
4
+ from typing import Any, ClassVar, Iterable, Iterator, Optional
5
5
 
6
6
  import ftfy
7
7
 
@@ -273,7 +273,7 @@ class DocumentSplitter(ComponentIterator):
273
273
  yield DocumentSection(text=full_text, metadata=md)
274
274
  accumulated_text = []
275
275
 
276
- def process_element(el: Union[bs4.element.Tag, bs4.NavigableString]) -> Iterator[DocumentSection]:
276
+ def process_element(el: bs4.element.Tag | bs4.NavigableString) -> Iterator[DocumentSection]:
277
277
  # process the element and emit sections as necessary
278
278
  nonlocal accumulated_text, headings, sourceline, emit_on_heading, emit_on_paragraph
279
279
 
@@ -29,12 +29,29 @@ class FrameIterator(ComponentIterator):
29
29
  extracted). If `fps` is greater than the frame rate of the video, an error will be raised.
30
30
  num_frames: Exact number of frames to extract. The frames will be spaced as evenly as possible. If
31
31
  `num_frames` is greater than the number of frames in the video, all frames will be extracted.
32
+ all_frame_attrs:
33
+ If True, outputs a `pxt.Json` column `frame_attrs` with the following `pyav`-provided attributes
34
+ (for more information, see `pyav`'s documentation on
35
+ [VideoFrame](https://pyav.org/docs/develop/api/video.html#module-av.video.frame) and
36
+ [Frame](https://pyav.org/docs/develop/api/frame.html)):
37
+
38
+ * `index` (`int`)
39
+ * `pts` (`Optional[int]`)
40
+ * `dts` (`Optional[int]`)
41
+ * `time` (`Optional[float]`)
42
+ * `is_corrupt` (`bool`)
43
+ * `key_frame` (`bool`)
44
+ * `pict_type` (`int`)
45
+ * `interlaced_frame` (`bool`)
46
+
47
+ If False, only outputs frame attributes `frame_idx`, `pos_msec`, and `pos_frame` as separate columns.
32
48
  """
33
49
 
34
50
  # Input parameters
35
51
  video_path: Path
36
52
  fps: Optional[float]
37
53
  num_frames: Optional[int]
54
+ all_frame_attrs: bool
38
55
 
39
56
  # Video info
40
57
  container: av.container.input.InputContainer
@@ -50,7 +67,14 @@ class FrameIterator(ComponentIterator):
50
67
  # frame index in the video. Otherwise, the corresponding video index is `frames_to_extract[next_pos]`.
51
68
  next_pos: int
52
69
 
53
- def __init__(self, video: str, *, fps: Optional[float] = None, num_frames: Optional[int] = None):
70
+ def __init__(
71
+ self,
72
+ video: str,
73
+ *,
74
+ fps: Optional[float] = None,
75
+ num_frames: Optional[int] = None,
76
+ all_frame_attrs: bool = False,
77
+ ):
54
78
  if fps is not None and num_frames is not None:
55
79
  raise excs.Error('At most one of `fps` or `num_frames` may be specified')
56
80
 
@@ -60,6 +84,7 @@ class FrameIterator(ComponentIterator):
60
84
  self.container = av.open(str(video_path))
61
85
  self.fps = fps
62
86
  self.num_frames = num_frames
87
+ self.all_frame_attrs = all_frame_attrs
63
88
 
64
89
  self.video_framerate = self.container.streams.video[0].average_rate
65
90
  self.video_time_base = self.container.streams.video[0].time_base
@@ -115,16 +140,17 @@ class FrameIterator(ComponentIterator):
115
140
  'video': ts.VideoType(nullable=False),
116
141
  'fps': ts.FloatType(nullable=True),
117
142
  'num_frames': ts.IntType(nullable=True),
143
+ 'all_frame_attrs': ts.BoolType(nullable=False),
118
144
  }
119
145
 
120
146
  @classmethod
121
147
  def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
122
- return {
123
- 'frame_idx': ts.IntType(),
124
- 'pos_msec': ts.FloatType(),
125
- 'pos_frame': ts.IntType(),
126
- 'frame': ts.ImageType(),
127
- }, ['frame']
148
+ attrs: dict[str, ts.ColumnType]
149
+ if kwargs.get('all_frame_attrs'):
150
+ attrs = {'frame_attrs': ts.JsonType()}
151
+ else:
152
+ attrs = {'frame_idx': ts.IntType(), 'pos_msec': ts.FloatType(), 'pos_frame': ts.IntType()}
153
+ return {**attrs, 'frame': ts.ImageType()}, ['frame']
128
154
 
129
155
  def __next__(self) -> dict[str, Any]:
130
156
  # Determine the frame index in the video corresponding to the iterator index `next_pos`;
@@ -164,8 +190,22 @@ class FrameIterator(ComponentIterator):
164
190
  raise excs.Error(f'Frame {next_video_idx} is missing from the video (video file is corrupt)')
165
191
  img = frame.to_image()
166
192
  assert isinstance(img, PIL.Image.Image)
167
- pos_msec = float(pts * self.video_time_base * 1000)
168
- result = {'frame_idx': self.next_pos, 'pos_msec': pos_msec, 'pos_frame': video_idx, 'frame': img}
193
+ pts_msec = float(pts * self.video_time_base * 1000)
194
+ result: dict[str, Any] = {'frame': img}
195
+ if self.all_frame_attrs:
196
+ attrs = {
197
+ 'index': video_idx,
198
+ 'pts': frame.pts,
199
+ 'dts': frame.dts,
200
+ 'time': frame.time,
201
+ 'is_corrupt': frame.is_corrupt,
202
+ 'key_frame': frame.key_frame,
203
+ 'pict_type': frame.pict_type,
204
+ 'interlaced_frame': frame.interlaced_frame,
205
+ }
206
+ result['frame_attrs'] = attrs
207
+ else:
208
+ result.update({'frame_idx': self.next_pos, 'pos_msec': pts_msec, 'pos_frame': video_idx})
169
209
  self.next_pos += 1
170
210
  return result
171
211
 
@@ -459,42 +459,51 @@ class TableRestorer:
459
459
  for col_name, col in temp_cols.items()
460
460
  if col_name not in system_col_names and col_name not in media_col_names
461
461
  ]
462
- mismatch_predicates = [store_col != temp_col for store_col, temp_col in zip(value_store_cols, value_temp_cols)]
463
- mismatch_clause = sql.or_(*mismatch_predicates)
464
-
465
- # This query looks for rows that have matching primary keys (rowid + pos_k + v_min), but differ in at least
466
- # one value column. Pseudo-SQL:
467
- #
468
- # SELECT store_tbl.col_0, ..., store_tbl.col_n, temp_tbl.col_0, ..., temp_tbl.col_n
469
- # FROM store_tbl, temp_tbl
470
- # WHERE store_tbl.rowid = temp_tbl.rowid
471
- # AND store_tbl.pos_0 = temp_tbl.pos_0
472
- # AND ... AND store_tbl.pos_k = temp_tbl.pos_k
473
- # AND store_tbl.v_min = temp_tbl.v_min
474
- # AND (
475
- # store_tbl.col_0 != temp_tbl.col_0
476
- # OR store_tbl.col_1 != temp_tbl.col_1
477
- # OR ... OR store_tbl.col_n != temp_tbl.col_n
478
- # )
479
- #
480
- # The value column comparisons (store_tbl.col_0 != temp_tbl.col_0, etc.) will always be false for rows where
481
- # either column is NULL; this is what we want, since it may indicate a column that is present in one version
482
- # but not the other.
483
- q = sql.select(*value_store_cols, *value_temp_cols).where(pk_clause).where(mismatch_clause)
484
- _logger.debug(q.compile())
485
- result = conn.execute(q)
486
- if result.rowcount > 0:
487
- _logger.debug(
488
- f'Data corruption error between {temp_sa_tbl_name!r} and {store_sa_tbl_name!r}: '
489
- f'{result.rowcount} inconsistent row(s).'
490
- )
491
- row = result.first()
492
- _logger.debug('Example mismatch:')
493
- _logger.debug(f'{store_sa_tbl_name}: {row[: len(value_store_cols)]}')
494
- _logger.debug(f'{temp_sa_tbl_name}: {row[len(value_store_cols) :]}')
495
- raise excs.Error(
496
- 'Data corruption error: the replica data are inconsistent with data retrieved from a previous replica.'
497
- )
462
+
463
+ q: sql.Executable
464
+
465
+ assert len(value_store_cols) == len(value_temp_cols)
466
+ if len(value_store_cols) > 0:
467
+ mismatch_predicates = [
468
+ store_col != temp_col for store_col, temp_col in zip(value_store_cols, value_temp_cols)
469
+ ]
470
+ mismatch_clause = sql.or_(*mismatch_predicates)
471
+
472
+ # This query looks for rows that have matching primary keys (rowid + pos_k + v_min), but differ in at least
473
+ # one value column. Pseudo-SQL:
474
+ #
475
+ # SELECT store_tbl.col_0, ..., store_tbl.col_n, temp_tbl.col_0, ..., temp_tbl.col_n
476
+ # FROM store_tbl, temp_tbl
477
+ # WHERE store_tbl.rowid = temp_tbl.rowid
478
+ # AND store_tbl.pos_0 = temp_tbl.pos_0
479
+ # AND ... AND store_tbl.pos_k = temp_tbl.pos_k
480
+ # AND store_tbl.v_min = temp_tbl.v_min
481
+ # AND (
482
+ # store_tbl.col_0 != temp_tbl.col_0
483
+ # OR store_tbl.col_1 != temp_tbl.col_1
484
+ # OR ... OR store_tbl.col_n != temp_tbl.col_n
485
+ # )
486
+ #
487
+ # The value column comparisons (store_tbl.col_0 != temp_tbl.col_0, etc.) will always be false for rows where
488
+ # either column is NULL; this is what we want, since it may indicate a column that is present in one version
489
+ # but not the other.
490
+ q = sql.select(*value_store_cols, *value_temp_cols).where(pk_clause).where(mismatch_clause)
491
+ _logger.debug(q.compile())
492
+ result = conn.execute(q)
493
+ if result.rowcount > 0:
494
+ _logger.debug(
495
+ f'Data corruption error between {temp_sa_tbl_name!r} and {store_sa_tbl_name!r}: '
496
+ f'{result.rowcount} inconsistent row(s).'
497
+ )
498
+ row = result.first()
499
+ _logger.debug('Example mismatch:')
500
+ _logger.debug(f'{store_sa_tbl_name}: {row[: len(value_store_cols)]}')
501
+ _logger.debug(f'{temp_sa_tbl_name}: {row[len(value_store_cols) :]}')
502
+ raise excs.Error(
503
+ 'Data corruption error: '
504
+ 'the replica data are inconsistent with data retrieved from a previous replica.'
505
+ )
506
+
498
507
  _logger.debug(f'Verified data integrity between {store_sa_tbl_name!r} and {temp_sa_tbl_name!r}.')
499
508
 
500
509
  # Now rectify the v_max values in the temporary table.