pixeltable 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of pixeltable might be problematic.

Files changed (52)
  1. pixeltable/__version__.py +2 -2
  2. pixeltable/catalog/__init__.py +1 -1
  3. pixeltable/catalog/catalog.py +619 -255
  4. pixeltable/catalog/dir.py +1 -2
  5. pixeltable/catalog/insertable_table.py +9 -9
  6. pixeltable/catalog/path.py +59 -20
  7. pixeltable/catalog/schema_object.py +10 -4
  8. pixeltable/catalog/table.py +51 -53
  9. pixeltable/catalog/table_version.py +216 -156
  10. pixeltable/catalog/table_version_path.py +1 -1
  11. pixeltable/catalog/tbl_ops.py +44 -0
  12. pixeltable/catalog/view.py +63 -65
  13. pixeltable/config.py +12 -4
  14. pixeltable/dataframe.py +75 -6
  15. pixeltable/env.py +46 -17
  16. pixeltable/exec/aggregation_node.py +1 -1
  17. pixeltable/exec/cache_prefetch_node.py +2 -6
  18. pixeltable/exec/component_iteration_node.py +4 -3
  19. pixeltable/exec/data_row_batch.py +10 -51
  20. pixeltable/exec/expr_eval/expr_eval_node.py +2 -2
  21. pixeltable/exec/in_memory_data_node.py +17 -16
  22. pixeltable/exec/sql_node.py +6 -7
  23. pixeltable/exprs/column_ref.py +2 -1
  24. pixeltable/exprs/data_row.py +13 -13
  25. pixeltable/exprs/row_builder.py +16 -4
  26. pixeltable/exprs/string_op.py +1 -1
  27. pixeltable/func/expr_template_function.py +1 -4
  28. pixeltable/functions/date.py +1 -1
  29. pixeltable/functions/gemini.py +4 -4
  30. pixeltable/functions/math.py +1 -1
  31. pixeltable/functions/openai.py +9 -6
  32. pixeltable/functions/timestamp.py +6 -6
  33. pixeltable/functions/video.py +2 -6
  34. pixeltable/globals.py +62 -33
  35. pixeltable/io/datarows.py +2 -1
  36. pixeltable/io/pandas.py +1 -0
  37. pixeltable/io/table_data_conduit.py +12 -13
  38. pixeltable/iterators/audio.py +17 -8
  39. pixeltable/iterators/image.py +5 -2
  40. pixeltable/metadata/schema.py +39 -2
  41. pixeltable/plan.py +5 -14
  42. pixeltable/share/packager.py +13 -13
  43. pixeltable/store.py +31 -7
  44. pixeltable/type_system.py +2 -1
  45. pixeltable/utils/filecache.py +1 -1
  46. pixeltable/utils/http_server.py +2 -3
  47. pixeltable/utils/media_store.py +90 -34
  48. {pixeltable-0.4.3.dist-info → pixeltable-0.4.5.dist-info}/METADATA +1 -1
  49. {pixeltable-0.4.3.dist-info → pixeltable-0.4.5.dist-info}/RECORD +52 -51
  50. {pixeltable-0.4.3.dist-info → pixeltable-0.4.5.dist-info}/LICENSE +0 -0
  51. {pixeltable-0.4.3.dist-info → pixeltable-0.4.5.dist-info}/WHEEL +0 -0
  52. {pixeltable-0.4.3.dist-info → pixeltable-0.4.5.dist-info}/entry_points.txt +0 -0
pixeltable/globals.py CHANGED
@@ -8,7 +8,7 @@ from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, Optional, Un
 import pandas as pd
 from pandas.io.formats.style import Styler
 
-from pixeltable import DataFrame, catalog, exceptions as excs, exprs, func, share
+from pixeltable import DataFrame, catalog, exceptions as excs, exprs, func, share, type_system as ts
 from pixeltable.catalog import Catalog, TableVersionPath
 from pixeltable.catalog.insertable_table import OnErrorParameter
 from pixeltable.config import Config
@@ -44,7 +44,7 @@ def init(config_overrides: Optional[dict[str, Any]] = None) -> None:
 
 
 def create_table(
-    path_str: str,
+    path: str,
     schema: Optional[dict[str, Any]] = None,
     *,
     source: Optional[TableDataSource] = None,
@@ -58,14 +58,24 @@ def create_table(
     if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error',
     extra_args: Optional[dict[str, Any]] = None,  # Additional arguments to data source provider
 ) -> catalog.Table:
-    """Create a new base table.
+    """Create a new base table. Exactly one of `schema` or `source` must be provided.
+
+    If a `schema` is provided, then an empty table will be created with the specified schema.
+
+    If a `source` is provided, then Pixeltable will attempt to infer a data source format and table schema from the
+    contents of the specified data, and the data will be imported from the specified source into the new table. The
+    source format and/or schema can be specified directly via the `source_format` and `schema_overrides` parameters.
 
     Args:
-        path_str: Path to the table.
-        schema: A dictionary that maps column names to column types
-        source: A data source from which a table schema can be inferred and data imported
-        source_format: A hint to the format of the source data
-        schema_overrides: If specified, then columns in `schema_overrides` will be given the specified types
+        path: Pixeltable path (qualified name) of the table, such as `'my_table'` or `'my_dir.my_subdir.my_table'`.
+        schema: Schema for the new table, mapping column names to Pixeltable types.
+        source: A data source (file, URL, DataFrame, or list of rows) to import from.
+        source_format: Must be used in conjunction with a `source`.
+            If specified, then the given format will be used to read the source data. (Otherwise,
+            Pixeltable will attempt to infer the format from the source data.)
+        schema_overrides: Must be used in conjunction with a `source`.
+            If specified, then columns in `schema_overrides` will be given the specified types.
+            (Pixeltable will attempt to infer the types of any columns not specified.)
         on_error: Determines the behavior if an error occurs while evaluating a computed column or detecting an
             invalid media file (such as a corrupt image) for one of the inserted rows.
 
@@ -81,14 +91,15 @@ def create_table(
 
             - `'on_read'`: validate media files at query time
             - `'on_write'`: validate media files during insert/update operations
-        if_exists: Directive regarding how to handle if the path already exists.
-            Must be one of the following:
+        if_exists: Determines the behavior if a table already exists at the specified path location.
 
             - `'error'`: raise an error
            - `'ignore'`: do nothing and return the existing table handle
-            - `'replace'`: if the existing table has no views, drop and replace it with a new one
-            - `'replace_force'`: drop the existing table and all its views, and create a new one
-        extra_args: Additional arguments to pass to the source data provider
+            - `'replace'`: if the existing table has no views or snapshots, drop and replace it with a new one;
+              raise an error if the existing table has views or snapshots
+            - `'replace_force'`: drop the existing table and all its views and snapshots, and create a new one
+        extra_args: Must be used in conjunction with a `source`. If specified, then additional arguments will be
+            passed along to the source data provider.
 
     Returns:
         A handle to the newly created table, or to an already existing table at the path when `if_exists='ignore'`.
@@ -114,7 +125,7 @@ def create_table(
         >>> tbl1 = pxt.get_table('orig_table')
         ... tbl2 = pxt.create_table('new_table', tbl1.where(tbl1.col1 < 10).select(tbl1.col2))
 
-        Create a table if does not already exist, otherwise get the existing table:
+        Create a table if it does not already exist, otherwise get the existing table:
 
         >>> tbl = pxt.create_table('my_table', schema={'col1': pxt.Int, 'col2': pxt.String}, if_exists='ignore')
 
@@ -130,12 +141,12 @@ def create_table(
     from pixeltable.io.utils import normalize_primary_key_parameter
 
     if (schema is None) == (source is None):
-        raise excs.Error('Must provide either a `schema` or a `source`')
+        raise excs.Error('Either a `schema` or a `source` must be provided (but not both)')
 
     if schema is not None and (len(schema) == 0 or not isinstance(schema, dict)):
         raise excs.Error('`schema` must be a non-empty dictionary')
 
-    path_obj = catalog.Path(path_str)
+    path_obj = catalog.Path.parse(path)
     if_exists_ = catalog.IfExistsParam.validated(if_exists, 'if_exists')
     media_validation_ = catalog.MediaValidation.validated(media_validation, 'media_validation')
     primary_key: Optional[list[str]] = normalize_primary_key_parameter(primary_key)
@@ -146,7 +157,14 @@ def create_table(
         tds = UnkTableDataConduit(source, source_format=source_format, extra_fields=extra_args)
         tds.check_source_format()
         data_source = tds.specialize()
-        data_source.src_schema_overrides = schema_overrides
+        src_schema_overrides: dict[str, ts.ColumnType] = {}
+        if schema_overrides is not None:
+            for col_name, py_type in schema_overrides.items():
+                col_type = ts.ColumnType.normalize_type(py_type, nullable_default=True, allow_builtin_types=False)
+                if col_type is None:
+                    raise excs.Error(f'Invalid type for column {col_name!r} in `schema_overrides`: {py_type}')
+                src_schema_overrides[col_name] = col_type
+        data_source.src_schema_overrides = src_schema_overrides
         data_source.src_pk = primary_key
         data_source.infer_schema()
         schema = data_source.pxt_schema
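
Reviewer note on the hunk above: `schema_overrides` values are now normalized to `ts.ColumnType` at the API boundary, and builtin Python types are rejected (`allow_builtin_types=False`). A minimal sketch of the resulting calling convention; the CSV path and column name are hypothetical:

    import pixeltable as pxt

    # Pixeltable types are accepted in schema_overrides...
    tbl = pxt.create_table('sales', source='data.csv', schema_overrides={'amount': pxt.Float})

    # ...whereas a builtin such as `float` would now fail normalization and raise an error.
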
@@ -255,9 +273,7 @@ def create_view(
         tbl_version_path = base._tbl_version_path
         sample_clause = None
     elif isinstance(base, DataFrame):
-        base._validate_mutable('create_view', allow_select=True)
-        if len(base._from_clause.tbls) > 1:
-            raise excs.Error('Cannot create a view of a join')
+        base._validate_mutable_op_sequence('create_view', allow_select=True)
         tbl_version_path = base._from_clause.tbls[0]
         where = base.where_clause
         sample_clause = base.sample_clause
@@ -268,7 +284,7 @@ def create_view(
         raise excs.Error('`base` must be an instance of `Table` or `DataFrame`')
     assert isinstance(base, (catalog.Table, DataFrame))
 
-    path_obj = catalog.Path(path)
+    path_obj = catalog.Path.parse(path)
     if_exists_ = catalog.IfExistsParam.validated(if_exists, 'if_exists')
     media_validation_ = catalog.MediaValidation.validated(media_validation, 'media_validation')
 
@@ -429,8 +445,12 @@ def get_table(path: str) -> catalog.Table:
     Handles to views and snapshots are retrieved in the same way:
 
         >>> tbl = pxt.get_table('my_snapshot')
+
+    Get a handle to a specific version of a table:
+
+        >>> tbl = pxt.get_table('my_table:722')
     """
-    path_obj = catalog.Path(path)
+    path_obj = catalog.Path.parse(path, allow_versioned_path=True)
     tbl = Catalog.get().get_table(path_obj)
     return tbl
 
@@ -456,7 +476,7 @@ def move(path: str, new_path: str) -> None:
     """
     if path == new_path:
         raise excs.Error('move(): source and destination cannot be identical')
-    path_obj, new_path_obj = catalog.Path(path), catalog.Path(new_path)
+    path_obj, new_path_obj = catalog.Path.parse(path), catalog.Path.parse(new_path)
     if path_obj.is_ancestor(new_path_obj):
         raise excs.Error(f'move(): cannot move {path!r} into its own subdirectory')
     cat = Catalog.get()
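
Reviewer note: the recurring `catalog.Path(...)` → `catalog.Path.parse(...)` change in this file concentrates path validation in one classmethod with explicit opt-in flags (`allow_empty_path`, `allow_system_path`, `allow_versioned_path`, per the call sites in this diff). The versioned form surfaces in the public API as shown in the `get_table` docstring above:

    import pixeltable as pxt

    t = pxt.get_table('my_table:722')  # ':722' pins the handle to table version 722
    t2 = pxt.get_table('my_table')     # no version suffix: handle tracks the live table
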
@@ -509,7 +529,7 @@ def drop_table(
     assert isinstance(table, str)
     tbl_path = table
 
-    path_obj = catalog.Path(tbl_path)
+    path_obj = catalog.Path.parse(tbl_path)
     if_not_exists_ = catalog.IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
     Catalog.get().drop_table(path_obj, force=force, if_not_exists=if_not_exists_)
 
@@ -537,9 +557,12 @@ def list_tables(dir_path: str = '', recursive: bool = True) -> list[str]:
 
         >>> pxt.list_tables('dir1')
     """
-    path_obj = catalog.Path(dir_path, empty_is_valid=True)  # validate format
-    cat = Catalog.get()
-    contents = cat.get_dir_contents(path_obj, recursive=recursive)
+    return _list_tables(dir_path, recursive=recursive, allow_system_paths=False)
+
+
+def _list_tables(dir_path: str = '', recursive: bool = True, allow_system_paths: bool = False) -> list[str]:
+    path_obj = catalog.Path.parse(dir_path, allow_empty_path=True, allow_system_path=allow_system_paths)
+    contents = Catalog.get().get_dir_contents(path_obj, recursive=recursive)
     return [str(p) for p in _extract_paths(contents, parent=path_obj, entry_type=catalog.Table)]
 
 
@@ -590,7 +613,7 @@ def create_dir(
 
         >>> pxt.create_dir('parent1.parent2.sub_dir', parents=True)
     """
-    path_obj = catalog.Path(path)
+    path_obj = catalog.Path.parse(path)
     if_exists_ = catalog.IfExistsParam.validated(if_exists, 'if_exists')
     return Catalog.get().create_dir(path_obj, if_exists=if_exists_, parents=parents)
 
@@ -632,7 +655,7 @@ def drop_dir(path: str, force: bool = False, if_not_exists: Literal['error', 'ig
 
         >>> pxt.drop_dir('my_dir', force=True)
     """
-    path_obj = catalog.Path(path)  # validate format
+    path_obj = catalog.Path.parse(path)  # validate format
     if_not_exists_ = catalog.IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
     Catalog.get().drop_dir(path_obj, if_not_exists=if_not_exists_, force=force)
 
@@ -647,13 +670,16 @@ def ls(path: str = '') -> pd.DataFrame:
     To get a programmatic list of tables and/or directories, use [list_tables()][pixeltable.list_tables] and/or
     [list_dirs()][pixeltable.list_dirs] instead.
     """
+    from pixeltable.catalog import retry_loop
     from pixeltable.metadata import schema
 
     cat = Catalog.get()
-    path_obj = catalog.Path(path, empty_is_valid=True)
+    path_obj = catalog.Path.parse(path, allow_empty_path=True)
     dir_entries = cat.get_dir_contents(path_obj)
-    rows: list[list[str]] = []
-    with Catalog.get().begin_xact():
+
+    @retry_loop(for_write=False)
+    def op() -> list[list[str]]:
+        rows: list[list[str]] = []
         for name, entry in dir_entries.items():
             if name.startswith('_'):
                 continue
@@ -679,6 +705,9 @@
             if md['is_replica']:
                 kind = f'{kind}-replica'
             rows.append([name, kind, version, base])
+        return rows
+
+    rows = op()
 
     rows = sorted(rows, key=lambda x: x[0])
     df = pd.DataFrame(
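
Reviewer note: `ls()` drops the explicit `begin_xact()` block in favor of the `retry_loop` decorator, so the whole catalog read re-executes as a unit if the transaction needs to be retried. A minimal sketch of the pattern, assuming `retry_loop(for_write=False)` wraps the decorated function in a read transaction and re-runs it on conflicts:

    from pixeltable.catalog import retry_loop

    @retry_loop(for_write=False)
    def op() -> list[str]:
        # everything here runs inside the (possibly retried) transaction,
        # so it must be safe to execute more than once
        return ['result']

    rows = op()  # decorating happens at definition; calling op() executes the guarded body
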
@@ -734,7 +763,7 @@ def list_dirs(path: str = '', recursive: bool = True) -> list[str]:
         >>> cl.list_dirs('my_dir', recursive=True)
         ['my_dir', 'my_dir.sub_dir1']
     """
-    path_obj = catalog.Path(path, empty_is_valid=True)  # validate format
+    path_obj = catalog.Path.parse(path, allow_empty_path=True)  # validate format
     cat = Catalog.get()
     contents = cat.get_dir_contents(path_obj, recursive=recursive)
     return [str(p) for p in _extract_paths(contents, parent=path_obj, entry_type=catalog.Dir)]
pixeltable/io/datarows.py CHANGED
@@ -8,7 +8,7 @@ from pixeltable import exceptions as excs
 
 
 def _infer_schema_from_rows(
-    rows: Iterable[dict[str, Any]], schema_overrides: dict[str, Any], primary_key: list[str]
+    rows: Iterable[dict[str, Any]], schema_overrides: dict[str, ts.ColumnType], primary_key: list[str]
 ) -> dict[str, ts.ColumnType]:
     schema: dict[str, ts.ColumnType] = {}
     cols_with_nones: set[str] = set()
@@ -20,6 +20,7 @@ def _infer_schema_from_rows(
                 # in which the column names are encountered in the input data, even if `schema_overrides`
                 # is specified.
                 if col_name not in schema:
+                    assert isinstance(schema_overrides[col_name], ts.ColumnType)
                     schema[col_name] = schema_overrides[col_name]
             elif value is not None:
                 # If `key` is not in `schema_overrides`, then we infer its type from the data.
pixeltable/io/pandas.py CHANGED
@@ -132,6 +132,7 @@ def df_infer_schema(
     pd_schema: dict[str, ts.ColumnType] = {}
     for pd_name, pd_dtype in zip(df.columns, df.dtypes):
         if pd_name in schema_overrides:
+            assert isinstance(schema_overrides[pd_name], ts.ColumnType)
             pxt_type = schema_overrides[pd_name]
         else:
             pxt_type = __pd_coltype_to_pxt_type(pd_dtype, df[pd_name], pd_name not in primary_key)
pixeltable/io/table_data_conduit.py CHANGED
@@ -47,13 +47,13 @@ class TableDataConduitFormat(str, enum.Enum):
 
 @dataclass
 class TableDataConduit:
-    source: TableDataSource
+    source: 'TableDataSource'
     source_format: Optional[str] = None
     source_column_map: Optional[dict[str, str]] = None
     if_row_exists: Literal['update', 'ignore', 'error'] = 'error'
-    pxt_schema: Optional[dict[str, Any]] = None
-    src_schema_overrides: Optional[dict[str, Any]] = None
-    src_schema: Optional[dict[str, Any]] = None
+    pxt_schema: Optional[dict[str, ts.ColumnType]] = None
+    src_schema_overrides: Optional[dict[str, ts.ColumnType]] = None
+    src_schema: Optional[dict[str, ts.ColumnType]] = None
     pxt_pk: Optional[list[str]] = None
     src_pk: Optional[list[str]] = None
     valid_rows: Optional[RowData] = None
@@ -87,7 +87,7 @@ class TableDataConduit:
         for name, coltype in self.pxt_schema.items():
             self.pxt_schema[name] = ts.ColumnType.normalize_type(coltype)
 
-    def infer_schema(self) -> dict[str, Any]:
+    def infer_schema(self) -> dict[str, ts.ColumnType]:
         raise NotImplementedError
 
     def valid_row_batch(self) -> Iterator[RowData]:
@@ -137,7 +137,7 @@ class DFTableDataConduit(TableDataConduit):
         t.pxt_df = tds.source
         return t
 
-    def infer_schema(self) -> dict[str, Any]:
+    def infer_schema(self) -> dict[str, ts.ColumnType]:
         self.pxt_schema = self.pxt_df.schema
         self.pxt_pk = self.src_pk
         return self.pxt_schema
@@ -168,7 +168,7 @@ class RowDataTableDataConduit(TableDataConduit):
         t.batch_count = 0
         return t
 
-    def infer_schema(self) -> dict[str, Any]:
+    def infer_schema(self) -> dict[str, ts.ColumnType]:
         from .datarows import _infer_schema_from_rows
 
         if self.source_column_map is None:
@@ -239,7 +239,7 @@ class PandasTableDataConduit(TableDataConduit):
         t.batch_count = 0
         return t
 
-    def infer_schema_part1(self) -> tuple[dict[str, Any], list[str]]:
+    def infer_schema_part1(self) -> tuple[dict[str, ts.ColumnType], list[str]]:
         """Return inferred schema, inferred primary key, and source column map"""
         if self.source_column_map is None:
             if self.src_schema_overrides is None:
@@ -252,7 +252,7 @@ class PandasTableDataConduit(TableDataConduit):
         else:
             raise NotImplementedError()
 
-    def infer_schema(self) -> dict[str, Any]:
+    def infer_schema(self) -> dict[str, ts.ColumnType]:
         self.pxt_schema, self.pxt_pk = self.infer_schema_part1()
         self.normalize_pxt_schema_types()
         _df_check_primary_key_values(self.pd_df, self.src_pk)
@@ -328,7 +328,6 @@ class HFTableDataConduit(TableDataConduit):
     hf_ds: Optional[Union[datasets.Dataset, datasets.DatasetDict]] = None
     column_name_for_split: Optional[str] = None
     categorical_features: dict[str, dict[int, str]]
-    hf_schema: dict[str, Any] = None
     dataset_dict: dict[str, datasets.Dataset] = None
     hf_schema_source: dict[str, Any] = None
 
@@ -356,7 +355,7 @@ class HFTableDataConduit(TableDataConduit):
         except ImportError:
             return False
 
-    def infer_schema_part1(self) -> tuple[dict[str, Any], list[str]]:
+    def infer_schema_part1(self) -> tuple[dict[str, ts.ColumnType], list[str]]:
         from pixeltable.io.hf_datasets import _get_hf_schema, huggingface_schema_to_pxt_schema
 
         if self.source_column_map is None:
@@ -469,7 +468,7 @@ class ParquetTableDataConduit(TableDataConduit):
         t.pq_ds = parquet.ParquetDataset(str(input_path))
         return t
 
-    def infer_schema_part1(self) -> tuple[dict[str, Any], list[str]]:
+    def infer_schema_part1(self) -> tuple[dict[str, ts.ColumnType], list[str]]:
         from pixeltable.utils.arrow import ar_infer_schema
 
         if self.source_column_map is None:
@@ -483,7 +482,7 @@ class ParquetTableDataConduit(TableDataConduit):
         else:
             raise NotImplementedError()
 
-    def infer_schema(self) -> dict[str, Any]:
+    def infer_schema(self) -> dict[str, ts.ColumnType]:
         self.pxt_schema, self.pxt_pk = self.infer_schema_part1()
         self.normalize_pxt_schema_types()
         self.prepare_insert()
pixeltable/iterators/audio.py CHANGED
@@ -1,5 +1,4 @@
 import logging
-import uuid
 from fractions import Fraction
 from pathlib import Path
 from typing import Any, ClassVar, Optional
@@ -55,12 +54,9 @@ class AudioSplitter(ComponentIterator):
     def __init__(
         self, audio: str, chunk_duration_sec: float, *, overlap_sec: float = 0.0, min_chunk_duration_sec: float = 0.0
     ):
-        if chunk_duration_sec <= 0.0:
-            raise excs.Error('chunk_duration_sec must be a positive number')
-        if chunk_duration_sec < min_chunk_duration_sec:
-            raise excs.Error('chunk_duration_sec must be at least min_chunk_duration_sec')
-        if overlap_sec >= chunk_duration_sec:
-            raise excs.Error('overlap_sec must be less than chunk_duration_sec')
+        assert chunk_duration_sec > 0.0
+        assert chunk_duration_sec >= min_chunk_duration_sec
+        assert overlap_sec < chunk_duration_sec
         audio_path = Path(audio)
         assert audio_path.exists() and audio_path.is_file()
         self.audio_path = audio_path
@@ -128,6 +124,19 @@ class AudioSplitter(ComponentIterator):
 
     @classmethod
     def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
+        param_names = ['chunk_duration_sec', 'min_chunk_duration_sec', 'overlap_sec']
+        params = dict(zip(param_names, args))
+        params.update(kwargs)
+
+        chunk_duration_sec = params['chunk_duration_sec']
+        min_chunk_duration_sec = params.get('min_chunk_duration_sec', 0.0)
+        overlap_sec = params.get('overlap_sec', 0.0)
+        if chunk_duration_sec <= 0.0:
+            raise excs.Error('chunk_duration_sec must be a positive number')
+        if chunk_duration_sec < min_chunk_duration_sec:
+            raise excs.Error('chunk_duration_sec must be at least min_chunk_duration_sec')
+        if overlap_sec >= chunk_duration_sec:
+            raise excs.Error('overlap_sec must be less than chunk_duration_sec')
         return {
             'start_time_sec': ts.FloatType(),
             'end_time_sec': ts.FloatType(),
@@ -140,7 +149,7 @@ class AudioSplitter(ComponentIterator):
         target_chunk_start, target_chunk_end = self.chunks_to_extract_in_pts[self.next_pos]
         chunk_start_pts = 0
         chunk_end_pts = 0
-        chunk_file = str(env.Env.get().tmp_dir / f'{uuid.uuid4()}{self.audio_path.suffix}')
+        chunk_file = str(env.Env.get().create_tmp_path(self.audio_path.suffix))
         output_container = av.open(chunk_file, mode='w')
         input_stream = self.container.streams.audio[0]
         codec_name = AudioSplitter.__codec_map.get(input_stream.codec_context.name, input_stream.codec_context.name)
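
Reviewer note: parameter validation moves out of `__init__` (now plain asserts) and into `output_schema()`, which runs once when the iterator is declared rather than per input row; temp-file naming also moves behind `Env.create_tmp_path()`. A sketch of the validation behavior, using keyword arguments only (the positional order per the diff is `chunk_duration_sec`, `min_chunk_duration_sec`, `overlap_sec`):

    from pixeltable.iterators.audio import AudioSplitter

    # validates eagerly before returning the schema tuple:
    AudioSplitter.output_schema(chunk_duration_sec=10.0, overlap_sec=2.0)

    # now raises a pixeltable Error at declaration time instead of failing later:
    AudioSplitter.output_schema(chunk_duration_sec=5.0, overlap_sec=5.0)
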
pixeltable/iterators/image.py CHANGED
@@ -31,8 +31,7 @@ class TileIterator(ComponentIterator):
     __j: int
 
     def __init__(self, image: PIL.Image.Image, *, tile_size: tuple[int, int], overlap: tuple[int, int] = (0, 0)):
-        if overlap[0] >= tile_size[0] or overlap[1] >= tile_size[1]:
-            raise excs.Error(f'overlap dimensions {overlap} are not strictly smaller than tile size {tile_size}')
+        assert overlap[0] < tile_size[0] and overlap[1] < tile_size[1]
 
         self.__image = image
         self.__image.load()
@@ -79,4 +78,8 @@ class TileIterator(ComponentIterator):
 
     @classmethod
     def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ts.ColumnType], list[str]]:
+        tile_size = kwargs.get('tile_size')
+        overlap = kwargs.get('overlap', (0, 0))
+        if overlap[0] >= tile_size[0] or overlap[1] >= tile_size[1]:
+            raise excs.Error(f'overlap dimensions {overlap} are not strictly smaller than tile size {tile_size}')
         return {'tile': ts.ImageType(), 'tile_coord': ts.JsonType(), 'tile_box': ts.JsonType()}, ['tile']
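
Same refactor as `AudioSplitter` above: `TileIterator` now validates its keyword-only parameters in `output_schema()`. A minimal sketch:

    from pixeltable.iterators.image import TileIterator

    TileIterator.output_schema(tile_size=(224, 224), overlap=(32, 32))   # ok
    TileIterator.output_schema(tile_size=(224, 224), overlap=(224, 0))   # raises: overlap not strictly smaller
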
pixeltable/metadata/schema.py CHANGED
@@ -24,7 +24,7 @@ def md_from_dict(data_class_type: type[T], data: Any) -> T:
     """Re-instantiate a dataclass instance that contains nested dataclasses from a dict."""
     if dataclasses.is_dataclass(data_class_type):
         fieldtypes = get_type_hints(data_class_type)
-        return data_class_type(**{f: md_from_dict(fieldtypes[f], data[f]) for f in data})  # type: ignore[return-value]
+        return data_class_type(**{f: md_from_dict(fieldtypes[f], data[f]) for f in data})
 
     origin = typing.get_origin(data_class_type)
     if origin is not None:
@@ -182,6 +182,7 @@ class TableMd:
     # sequence number to track changes in the set of mutable views of this table (ie, this table = the view base)
     # - incremented for each add/drop of a mutable view
     # - only maintained for mutable tables
+    # TODO: replace with mutable_views: list[UUID] to help with debugging
     view_sn: int
 
     # Metadata format for external stores:
@@ -193,6 +194,26 @@ class TableMd:
     view_md: Optional[ViewMd]
     additional_md: dict[str, Any]
 
+    has_pending_ops: bool = False
+
+    @property
+    def is_snapshot(self) -> bool:
+        return self.view_md is not None and self.view_md.is_snapshot
+
+    @property
+    def is_mutable(self) -> bool:
+        return not self.is_snapshot and not self.is_replica
+
+    @property
+    def is_pure_snapshot(self) -> bool:
+        return (
+            self.view_md is not None
+            and self.view_md.is_snapshot
+            and self.view_md.sample_clause is None
+            and self.view_md.predicate is None
+            and len(self.column_md) == 0
+        )
+
 
 class Table(Base):
     """
@@ -215,7 +236,7 @@ class Table(Base):
     lock_dummy: orm.Mapped[int] = orm.mapped_column(BigInteger, nullable=True)
 
 
-@dataclasses.dataclass(frozen=True)
+@dataclasses.dataclass
 class TableVersionMd:
     tbl_id: str  # uuid.UUID
     created_at: float  # time.time()
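
Reviewer note: `TableVersionMd` loses `frozen=True`, presumably so instances can be updated in place by the new pending-op machinery. One side effect worth flagging: an unfrozen dataclass with the default `eq=True` is no longer hashable. A generic illustration (class names here are hypothetical, not from the diff):

    import dataclasses

    @dataclasses.dataclass(frozen=True)
    class Frozen:
        x: int

    @dataclasses.dataclass
    class Unfrozen:
        x: int

    f = Frozen(1)
    # f.x = 2  # would raise dataclasses.FrozenInstanceError

    u = Unfrozen(1)
    u.x = 2    # fine: instances are now mutable
    # {u}      # would raise TypeError: Unfrozen is unhashable (eq=True without frozen=True)
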
@@ -279,6 +300,22 @@ class TableSchemaVersion(Base):
     md: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False)  # TableSchemaVersionMd
 
 
+class PendingTableOp(Base):
+    """
+    Table operation that needs to be completed before the table can be used.
+
+    Operations need to be completed in order of increasing seq_num.
+    """
+
+    __tablename__ = 'pendingtableops'
+
+    tbl_id: orm.Mapped[uuid.UUID] = orm.mapped_column(
+        UUID(as_uuid=True), ForeignKey('tables.id'), primary_key=True, nullable=False
+    )
+    op_sn: orm.Mapped[int] = orm.mapped_column(Integer, primary_key=True, nullable=False)  # catalog.TableOp.op_sn
+    op: orm.Mapped[dict[str, Any]] = orm.mapped_column(JSONB, nullable=False)  # catalog.TableOp
+
+
 @dataclasses.dataclass
 class FunctionMd:
     name: str
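
Reviewer note: the new `pendingtableops` table persists in-flight table operations; the composite primary key `(tbl_id, op_sn)` plus the docstring imply ops are replayed in increasing `op_sn` order, and the new `TableMd.has_pending_ops` flag marks tables that still have entries here. A sketch of how a consumer might read them back, assuming a standard SQLAlchemy session (the helper function is hypothetical):

    import uuid
    import sqlalchemy.orm as orm
    from pixeltable.metadata.schema import PendingTableOp

    def pending_ops(session: orm.Session, tbl_id: uuid.UUID) -> list[PendingTableOp]:
        # ops must be completed in increasing op_sn order
        return (
            session.query(PendingTableOp)
            .filter(PendingTableOp.tbl_id == tbl_id)
            .order_by(PendingTableOp.op_sn)
            .all()
        )
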
pixeltable/plan.py CHANGED
@@ -385,14 +385,7 @@
             TableVersionHandle(tbl.id, tbl.effective_version), rows, row_builder, tbl.next_row_id
         )
 
-        media_input_col_info = [
-            exprs.ColumnSlotIdx(col_ref.col, col_ref.slot_idx)
-            for col_ref in row_builder.input_exprs
-            if isinstance(col_ref, exprs.ColumnRef) and col_ref.col_type.is_media_type()
-        ]
-        if len(media_input_col_info) > 0:
-            # prefetch external files for all input column refs
-            plan = exec.CachePrefetchNode(tbl.id, media_input_col_info, input=plan)
+        plan = cls._insert_prefetch_node(tbl.id, row_builder.input_exprs, input_node=plan)
 
         computed_exprs = row_builder.output_exprs - row_builder.input_exprs
         if len(computed_exprs) > 0:
@@ -789,15 +782,13 @@
 
     @classmethod
     def _insert_prefetch_node(
-        cls, tbl_id: UUID, row_builder: exprs.RowBuilder, input_node: exec.ExecNode
+        cls, tbl_id: UUID, expressions: Iterable[exprs.Expr], input_node: exec.ExecNode
     ) -> exec.ExecNode:
-        """Returns a CachePrefetchNode into the plan if needed, otherwise returns input"""
+        """Return a CachePrefetchNode if needed, otherwise return input"""
         # we prefetch external files for all media ColumnRefs, even those that aren't part of the dependencies
         # of output_exprs: if unstored iterator columns are present, we might need to materialize ColumnRefs that
         # aren't explicitly captured as dependencies
-        media_col_refs = [
-            e for e in list(row_builder.unique_exprs) if isinstance(e, exprs.ColumnRef) and e.col_type.is_media_type()
-        ]
+        media_col_refs = [e for e in expressions if isinstance(e, exprs.ColumnRef) and e.col_type.is_media_type()]
         if len(media_col_refs) == 0:
             return input_node
         # we need to prefetch external files for media column types
@@ -967,7 +958,7 @@
             stratify_exprs=analyzer.stratify_exprs,
         )
 
-        plan = cls._insert_prefetch_node(tbl.tbl_version.id, row_builder, plan)
+        plan = cls._insert_prefetch_node(tbl.tbl_version.id, row_builder.unique_exprs, plan)
 
         if analyzer.group_by_clause is not None:
             # we're doing grouping aggregation; the input of the AggregateNode are the grouping exprs plus the
pixeltable/share/packager.py CHANGED
@@ -1,7 +1,6 @@
 import base64
 import datetime
 import io
-import itertools
 import json
 import logging
 import tarfile
@@ -237,8 +236,7 @@ class TablePackager:
         - Videos are replaced by their first frame and resized as above
         - Documents are replaced by a thumbnail as a base64-encoded webp
         """
-        # First 8 columns
-        preview_cols = dict(itertools.islice(self.table._get_schema().items(), 0, 8))
+        preview_cols = self.table._get_schema()
         select_list = [self.table[col_name] for col_name in preview_cols]
         # First 5 rows
         rows = list(self.table.select(*select_list).head(n=5))
@@ -369,7 +367,7 @@ class TableRestorer:
         with cat.begin_xact(for_write=True):
             # Create (or update) the replica table and its ancestors, along with TableVersion instances for any
             # versions that have not been seen before.
-            cat.create_replica(catalog.Path(self.tbl_path), tbl_md)
+            cat.create_replica(catalog.Path.parse(self.tbl_path), tbl_md)
 
             # Now we need to load data for replica_tbl and its ancestors, except that we skip
             # replica_tbl itself if it's a pure snapshot.
@@ -572,16 +570,18 @@ class TableRestorer:
         for col_name in pydict:
             assert col_name in tv.store_tbl.sa_tbl.columns
             sql_types[col_name] = tv.store_tbl.sa_tbl.columns[col_name].type
-        media_col_ids: dict[str, int] = {}
+        media_cols: dict[str, catalog.Column] = {}
         for col in tv.cols:
             if col.is_stored and col.col_type.is_media_type():
-                media_col_ids[col.store_name()] = col.id
+                assert tv.id == col.tbl.id
+                assert tv.version == col.tbl.version
+                media_cols[col.store_name()] = col
 
         row_count = len(next(iter(pydict.values())))
         rows: list[dict[str, Any]] = []
         for i in range(row_count):
             row = {
-                col_name: self.__from_pa_value(tv, col_vals[i], sql_types[col_name], media_col_ids.get(col_name))
+                col_name: self.__from_pa_value(col_vals[i], sql_types[col_name], media_cols.get(col_name))
                 for col_name, col_vals in pydict.items()
             }
             rows.append(row)
@@ -589,19 +589,19 @@ class TableRestorer:
         return rows
 
     def __from_pa_value(
-        self, tv: catalog.TableVersion, val: Any, sql_type: sql.types.TypeEngine[Any], media_col_id: Optional[int]
+        self, val: Any, sql_type: sql.types.TypeEngine[Any], media_col: Optional[catalog.Column]
     ) -> Any:
         if val is None:
             return None
         if isinstance(sql_type, sql.JSON):
             return json.loads(val)
-        if media_col_id is not None:
-            assert isinstance(val, str)
-            return self.__relocate_media_file(tv, media_col_id, val)
+        if media_col is not None:
+            return self.__relocate_media_file(media_col, val)
         return val
 
-    def __relocate_media_file(self, tv: catalog.TableVersion, media_col_id: int, url: str) -> str:
+    def __relocate_media_file(self, media_col: catalog.Column, url: str) -> str:
         # If this is a pxtmedia:// URL, relocate it
+        assert isinstance(url, str)
         parsed_url = urllib.parse.urlparse(url)
         assert parsed_url.scheme != 'file'  # These should all have been converted to pxtmedia:// URLs
         if parsed_url.scheme == 'pxtmedia':
@@ -610,7 +610,7 @@ class TableRestorer:
             # in self.media_files.
             src_path = self.tmp_dir / 'media' / parsed_url.netloc
             # Move the file to the media store and update the URL.
-            self.media_files[url] = MediaStore.relocate_local_media_file(src_path, tv.id, media_col_id, tv.version)
+            self.media_files[url] = MediaStore.relocate_local_media_file(src_path, media_col)
             return self.media_files[url]
         # For any type of URL other than a local file, just return the URL as-is.
         return url
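
Reviewer note: media relocation is now keyed off the `catalog.Column` object rather than a `(tbl_id, col_id, version)` triple, matching the narrowed `MediaStore.relocate_local_media_file(src_path, media_col)` signature. The URL dispatch rule the restorer implements reduces to a scheme check (the helper name below is hypothetical):

    import urllib.parse

    def needs_relocation(url: str) -> bool:
        # only pxtmedia:// URLs are rewritten into the local media store;
        # file:// URLs were converted upstream, and any other scheme passes through unchanged
        return urllib.parse.urlparse(url).scheme == 'pxtmedia'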