pixeltable-0.4.15-py3-none-any.whl → pixeltable-0.4.17-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.


Files changed (68)
  1. pixeltable/__init__.py +4 -0
  2. pixeltable/catalog/catalog.py +125 -63
  3. pixeltable/catalog/column.py +7 -2
  4. pixeltable/catalog/table.py +1 -0
  5. pixeltable/catalog/table_metadata.py +4 -0
  6. pixeltable/catalog/table_version.py +174 -117
  7. pixeltable/catalog/table_version_handle.py +4 -1
  8. pixeltable/catalog/table_version_path.py +0 -11
  9. pixeltable/catalog/view.py +6 -0
  10. pixeltable/config.py +7 -0
  11. pixeltable/dataframe.py +10 -5
  12. pixeltable/env.py +56 -19
  13. pixeltable/exec/__init__.py +2 -0
  14. pixeltable/exec/cell_materialization_node.py +231 -0
  15. pixeltable/exec/cell_reconstruction_node.py +135 -0
  16. pixeltable/exec/exec_node.py +1 -1
  17. pixeltable/exec/expr_eval/evaluators.py +1 -0
  18. pixeltable/exec/expr_eval/expr_eval_node.py +3 -0
  19. pixeltable/exec/expr_eval/globals.py +2 -0
  20. pixeltable/exec/globals.py +32 -0
  21. pixeltable/exec/object_store_save_node.py +1 -4
  22. pixeltable/exec/row_update_node.py +16 -9
  23. pixeltable/exec/sql_node.py +107 -14
  24. pixeltable/exprs/__init__.py +1 -1
  25. pixeltable/exprs/arithmetic_expr.py +23 -18
  26. pixeltable/exprs/column_property_ref.py +10 -10
  27. pixeltable/exprs/column_ref.py +2 -2
  28. pixeltable/exprs/data_row.py +106 -37
  29. pixeltable/exprs/expr.py +9 -0
  30. pixeltable/exprs/expr_set.py +14 -7
  31. pixeltable/exprs/inline_expr.py +2 -19
  32. pixeltable/exprs/json_path.py +45 -12
  33. pixeltable/exprs/row_builder.py +54 -22
  34. pixeltable/functions/__init__.py +1 -0
  35. pixeltable/functions/bedrock.py +7 -0
  36. pixeltable/functions/deepseek.py +11 -4
  37. pixeltable/functions/llama_cpp.py +7 -0
  38. pixeltable/functions/math.py +1 -1
  39. pixeltable/functions/ollama.py +7 -0
  40. pixeltable/functions/openai.py +4 -4
  41. pixeltable/functions/openrouter.py +143 -0
  42. pixeltable/functions/video.py +110 -28
  43. pixeltable/globals.py +10 -4
  44. pixeltable/io/globals.py +18 -17
  45. pixeltable/io/parquet.py +1 -1
  46. pixeltable/io/table_data_conduit.py +47 -22
  47. pixeltable/iterators/document.py +61 -23
  48. pixeltable/iterators/video.py +126 -53
  49. pixeltable/metadata/__init__.py +1 -1
  50. pixeltable/metadata/converters/convert_40.py +73 -0
  51. pixeltable/metadata/notes.py +1 -0
  52. pixeltable/plan.py +175 -46
  53. pixeltable/share/packager.py +155 -26
  54. pixeltable/store.py +2 -3
  55. pixeltable/type_system.py +5 -3
  56. pixeltable/utils/arrow.py +6 -6
  57. pixeltable/utils/av.py +65 -0
  58. pixeltable/utils/console_output.py +4 -1
  59. pixeltable/utils/exception_handler.py +5 -28
  60. pixeltable/utils/image.py +7 -0
  61. pixeltable/utils/misc.py +5 -0
  62. pixeltable/utils/object_stores.py +16 -1
  63. pixeltable/utils/s3_store.py +44 -11
  64. {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/METADATA +29 -28
  65. {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/RECORD +68 -61
  66. {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/WHEEL +0 -0
  67. {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/entry_points.txt +0 -0
  68. {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/licenses/LICENSE +0 -0
pixeltable/share/packager.py CHANGED
@@ -1,5 +1,4 @@
 import base64
-import datetime
 import io
 import json
 import logging
@@ -13,6 +12,7 @@ from uuid import UUID
 
 import more_itertools
 import numpy as np
+import pgvector.sqlalchemy as sql_vector  # type: ignore[import-untyped]
 import PIL.Image
 import pyarrow as pa
 import pyarrow.parquet as pq
@@ -21,6 +21,7 @@ import sqlalchemy as sql
 import pixeltable as pxt
 from pixeltable import catalog, exceptions as excs, metadata, type_system as ts
 from pixeltable.env import Env
+from pixeltable.exprs.data_row import CellMd
 from pixeltable.metadata import schema
 from pixeltable.utils import sha256sum
 from pixeltable.utils.formatter import Formatter
@@ -109,9 +110,12 @@ class TablePackager:
         assert any(tv.id == base.id for base in self.table._tbl_version_path.get_tbl_versions())
         sql_types = {col.name: col.type for col in tv.store_tbl.sa_tbl.columns}
         media_cols: set[str] = set()
+        cellmd_cols: set[str] = set()
         for col in tv.cols:
             if col.is_stored and col.col_type.is_media_type():
                 media_cols.add(col.store_name())
+            if col.stores_cellmd:
+                cellmd_cols.add(col.cellmd_store_name())
 
         parquet_schema = self.__to_parquet_schema(tv.store_tbl.sa_tbl)
         # TODO: Partition larger tables into multiple parquet files. (The parquet file naming scheme anticipates
@@ -126,10 +130,10 @@
         # excessive memory usage. The pyarrow tables are then amalgamated into the (single) Parquet table on disk.
         # We use snappy compression for the Parquet tables; the entire bundle will be bzip2-compressed later, so
         # faster compression should provide good performance while still reducing temporary storage utilization.
-        parquet_writer = pq.ParquetWriter(parquet_file, parquet_schema, compression='SNAPPY')
+        parquet_writer = pq.ParquetWriter(parquet_file, parquet_schema, compression='snappy')
         filter_tv = self.table._tbl_version_path.tbl_version.get()
         row_iter = tv.store_tbl.dump_rows(tv.version, filter_tv.store_tbl, filter_tv.version)
-        for pa_table in self.__to_pa_tables(row_iter, sql_types, media_cols, parquet_schema):
+        for pa_table in self.__to_pa_tables(row_iter, sql_types, media_cols, cellmd_cols, parquet_schema):
             parquet_writer.write_table(pa_table)
         parquet_writer.close()
 
@@ -138,7 +142,7 @@
     @classmethod
     def __to_parquet_schema(cls, store_tbl: sql.Table) -> pa.Schema:
         entries = [(col_name, cls.__to_parquet_type(col.type)) for col_name, col in store_tbl.columns.items()]
-        return pa.schema(entries)  # type: ignore[arg-type]
+        return pa.schema(entries)
 
     @classmethod
     def __to_parquet_type(cls, col_type: sql.types.TypeEngine[Any]) -> pa.DataType:
@@ -151,13 +155,17 @@
         if isinstance(col_type, sql.Float):
             return pa.float32()
         if isinstance(col_type, sql.TIMESTAMP):
-            return pa.timestamp('us', tz=datetime.timezone.utc)
+            return pa.timestamp('us', tz='UTC')
         if isinstance(col_type, sql.Date):
             return pa.date32()
         if isinstance(col_type, sql.JSON):
             return pa.string()  # JSON will be exported as strings
         if isinstance(col_type, sql.LargeBinary):
             return pa.binary()
+        if isinstance(col_type, sql_vector.Vector):
+            # Parquet/pyarrow do not handle null values properly for fixed_shape_tensor(), so we have to use list_()
+            # here instead.
+            return pa.list_(pa.float32())
         raise AssertionError(f'Unrecognized SQL type: {col_type} (type {type(col_type)})')
 
     def __to_pa_tables(
@@ -165,6 +173,7 @@
         row_iter: Iterator[dict[str, Any]],
         sql_types: dict[str, sql.types.TypeEngine[Any]],
         media_cols: set[str],
+        cellmd_cols: set[str],
         arrow_schema: pa.Schema,
         batch_size: int = 1_000,
     ) -> Iterator[pa.Table]:
@@ -176,14 +185,21 @@
        for rows in more_itertools.batched(row_iter, batch_size):
            cols = {}
            for name, sql_type in sql_types.items():
-                is_media_col = name in media_cols
-                values = [self.__to_pa_value(row.get(name), sql_type, is_media_col) for row in rows]
+                values = [
+                    self.__to_pa_value(row.get(name), sql_type, name in media_cols, name in cellmd_cols) for row in rows
+                ]
                cols[name] = values
            yield pa.Table.from_pydict(cols, schema=arrow_schema)
 
-    def __to_pa_value(self, val: Any, sql_type: sql.types.TypeEngine[Any], is_media_col: bool) -> Any:
+    def __to_pa_value(
+        self, val: Any, sql_type: sql.types.TypeEngine[Any], is_media_col: bool, is_cellmd_col: bool
+    ) -> Any:
         if val is None:
             return None
+        if is_cellmd_col:
+            assert isinstance(val, dict)
+            # Export JSON as strings
+            return json.dumps(self.__process_cellmd(val))
         if isinstance(sql_type, sql.JSON):
             # Export JSON as strings
             return json.dumps(val)
@@ -194,6 +210,10 @@
         return val
 
     def __process_media_url(self, url: str) -> str:
+        """
+        Process a media URL for export. If it's a local file URL (file://), then replace it with a pxtmedia:// URI,
+        copying the file into the tarball if necessary. If it's any other type of URL, return it unchanged.
+        """
         parsed_url = urllib.parse.urlparse(url)
         if parsed_url.scheme == 'file':
             # It's the URL of a local file. Replace it with a pxtmedia:// URI.
@@ -214,6 +234,21 @@
         # For any type of URL other than a local file, just return the URL as-is.
         return url
 
+    def __process_cellmd(self, cellmd: dict[str, Any]) -> dict[str, Any]:
+        """
+        Process a cellmd dictionary for export. This involves replacing any local file references
+        with pxtmedia:// URIs, as described above.
+        """
+        cellmd_ = CellMd.from_dict(cellmd)
+        if cellmd_.file_urls is None:
+            return cellmd  # No changes
+
+        updated_urls: list[str] = []
+        for url in cellmd_.file_urls:
+            updated_urls.append(self.__process_media_url(url))
+        cellmd_.file_urls = updated_urls
+        return cellmd_.as_dict()
+
     def __build_tarball(self) -> Path:
         bundle_path = self.tmp_dir / 'bundle.tar.bz2'
         with tarfile.open(bundle_path, 'w:bz2') as tf:
@@ -409,6 +444,9 @@ class TableRestorer:
         # 2. "rectify" the v_max values in both the temporary table and the existing table (more on this below);
         # 3. Delete any row instances from the temporary table that are already present in the existing table;
         # 4. Copy the remaining rows from the temporary table into the existing table.
+        # 5. Rectify any index columns.
+
+        # STEP 1: Import the parquet data into a temporary table.
 
         # Create a temporary table for the initial data load, containing columns for all columns present in the
         # parquet table. The parquet columns have identical names to those in the store table, so we can use the
@@ -416,7 +454,7 @@
         # e.g., pa.string() may hold either VARCHAR or serialized JSONB).
         temp_cols: dict[str, sql.Column] = {}
         for field in parquet_table.schema:
-            assert field.name in store_sa_tbl.columns
+            assert field.name in store_sa_tbl.columns, f'{field.name} not in {list(store_sa_tbl.columns)}'
             col_type = store_sa_tbl.columns[field.name].type
             temp_cols[field.name] = sql.Column(field.name, col_type)
         temp_sa_tbl_name = f'temp_{uuid.uuid4().hex}'
@@ -432,6 +470,8 @@
             rows = self.__from_pa_pydict(tv, pydict)
             conn.execute(sql.insert(temp_sa_tbl), rows)
 
+        # STEP 2: Rectify v_max values.
+
         # Each row version is identified uniquely by its pk, a tuple (row_id, pos_0, pos_1, ..., pos_k, v_min).
         # Conversely, v_max is not part of the primary key, but is simply a bookkeeping device.
         # In an original table, v_max is always equal to the v_min of the succeeding row instance with the same
@@ -540,6 +580,8 @@
         result = conn.execute(q)
         _logger.debug(f'Rectified {result.rowcount} row(s) in {store_sa_tbl_name!r}.')
 
+        # STEP 3: Delete any row instances from the temporary table that are already present in the existing table.
+
         # Now we need to update rows in the existing table that are also present in the temporary table. This is to
         # account for the scenario where the temporary table has columns that are not present in the existing table.
         # (We can't simply replace the rows with their versions in the temporary table, because the converse scenario
@@ -570,7 +612,9 @@
         result = conn.execute(q)
         _logger.debug(f'Deleted {result.rowcount} row(s) from {temp_sa_tbl_name!r}.')
 
-        # Finally, copy the remaining data (consisting entirely of new row instances) from the temporary table into
+        # STEP 4: Copy the remaining rows from the temporary table into the existing table.
+
+        # Now copy the remaining data (consisting entirely of new row instances) from the temporary table into
         # the actual table.
         q = store_sa_tbl.insert().from_select(
             [store_sa_tbl.c[col_name] for col_name in temp_cols], sql.select(*temp_cols.values())
@@ -579,39 +623,113 @@
         result = conn.execute(q)
         _logger.debug(f'Inserted {result.rowcount} row(s) from {temp_sa_tbl_name!r} into {store_sa_tbl_name!r}.')
 
+        # STEP 5: Rectify any index columns.
+
+        # Finally, rectify any index columns in the table. This involves shuffling data between the index's val and
+        # undo columns to ensure they appropriately reflect the most recent replicated version of the table.
+
+        # Get the most recent replicated version of the table. This might be the version we're currently importing,
+        # but it might be a different version of the table that was previously imported.
+        head_version_md = catalog.Catalog.get()._collect_tbl_history(tv.id, n=1)[0]
+        head_version = head_version_md.version_md.version
+        _logger.debug(f'Head version for index rectification is {head_version}.')
+
+        # Get the index info from the table metadata. Here we use the tbl_md that we just collected from the DB.
+        # This is to ensure we pick up ALL indices, including dropped indices and indices that are present in
+        # a previously replicated version of the table, but not in the one currently being imported.
+        index_md = head_version_md.tbl_md.index_md
+
+        # Now update the table. We can do this for all indices together with just two SQL queries. For each index,
+        # at most one of the val or undo columns will be non-NULL in any given row.
+        # For rows where v_min <= head_version < v_max, we set, for all indices:
+        #     val_col = whichever of (val_col, undo_col) is non-NULL (or NULL if both are, e.g., for a dropped index)
+        #     undo_col = NULL
+        # For rows where head_version < v_min or v_max <= head_version, vice versa.
+        val_sql_clauses: dict[str, sql.ColumnElement] = {}
+        undo_sql_clauses: dict[str, sql.ColumnElement] = {}
+        for index in index_md.values():
+            if index.class_fqn.endswith('.EmbeddingIndex'):
+                val_col_name = f'col_{index.index_val_col_id}'
+                undo_col_name = f'col_{index.index_val_undo_col_id}'
+                # Check that the val column for the index is actually present in the store table. We need to do this
+                # to properly handle the case where the replica represents a table version that was *not* the most
+                # recent version at the time it was published. In that case, it is possible for tbl_md to contain
+                # metadata for indices not known to any version that has been replicated. (However, the converse
+                # *does* hold: all replicated indices must have metadata in tbl_md; and that's what's important.)
+                if val_col_name in store_sa_tbl.c:
+                    assert undo_col_name in store_sa_tbl.c
+                    coalesce = sql.func.coalesce(store_sa_tbl.c[val_col_name], store_sa_tbl.c[undo_col_name])
+                    val_sql_clauses[val_col_name] = coalesce
+                    val_sql_clauses[undo_col_name] = sql.null()
+                    undo_sql_clauses[undo_col_name] = coalesce
+                    undo_sql_clauses[val_col_name] = sql.null()
+
+        if len(val_sql_clauses) > 0:
+            q2 = (
+                store_sa_tbl.update()
+                .values(**val_sql_clauses)
+                .where(sql.and_(tv.store_tbl.v_min_col <= head_version, tv.store_tbl.v_max_col > head_version))
+            )
+            _logger.debug(q2.compile())
+            _ = conn.execute(q2)
+            q2 = (
+                store_sa_tbl.update()
+                .values(**undo_sql_clauses)
+                .where(sql.or_(tv.store_tbl.v_min_col > head_version, tv.store_tbl.v_max_col <= head_version))
+            )
+            _logger.debug(q2.compile())
+            _ = conn.execute(q2)
+            _logger.debug(f'Rectified index columns in {store_sa_tbl_name!r}.')
+        else:
+            _logger.debug(f'No index columns to rectify in {store_sa_tbl_name!r}.')
+
     def __from_pa_pydict(self, tv: catalog.TableVersion, pydict: dict[str, Any]) -> list[dict[str, Any]]:
         # Data conversions from pyarrow to Pixeltable
         sql_types: dict[str, sql.types.TypeEngine[Any]] = {}
         for col_name in pydict:
             assert col_name in tv.store_tbl.sa_tbl.columns
             sql_types[col_name] = tv.store_tbl.sa_tbl.columns[col_name].type
-        media_cols: dict[str, catalog.Column] = {}
-        for col in tv.cols:
-            if col.is_stored and col.col_type.is_media_type():
-                assert tv.id == col.tbl.id
-                assert tv.version == col.tbl.version
-                media_cols[col.store_name()] = col
+        stored_cols: dict[str, catalog.Column] = {col.store_name(): col for col in tv.cols if col.is_stored}
+        stored_cols |= {col.cellmd_store_name(): col for col in tv.cols if col.stores_cellmd}
 
         row_count = len(next(iter(pydict.values())))
-        rows: list[dict[str, Any]] = []
-        for i in range(row_count):
-            row = {
-                col_name: self.__from_pa_value(col_vals[i], sql_types[col_name], media_cols.get(col_name))
-                for col_name, col_vals in pydict.items()
-            }
-            rows.append(row)
+        rows: list[dict[str, Any]] = [{} for _ in range(row_count)]
+        for col_name, col_vals in pydict.items():
+            assert len(col_vals) == row_count
+            col = stored_cols.get(col_name)  # Will be None for system columns
+            is_media_col = col is not None and col.is_stored and col.col_type.is_media_type()
+            is_cellmd_col = col is not None and col.stores_cellmd and col_name == col.cellmd_store_name()
+            assert col is None or is_cellmd_col or col_name == col.store_name()
+
+            for i, val in enumerate(col_vals):
+                rows[i][col_name] = self.__from_pa_value(val, sql_types[col_name], col, is_media_col, is_cellmd_col)
 
         return rows
 
     def __from_pa_value(
-        self, val: Any, sql_type: sql.types.TypeEngine[Any], media_col: Optional[catalog.Column]
+        self,
+        val: Any,
+        sql_type: sql.types.TypeEngine[Any],
+        col: Optional[catalog.Column],
+        is_media_col: bool,
+        is_cellmd_col: bool,
     ) -> Any:
         if val is None:
             return None
+        if isinstance(sql_type, sql_vector.Vector):
+            if isinstance(val, list):
+                val = np.array(val, dtype=np.float32)
+            assert isinstance(val, np.ndarray) and val.dtype == np.float32 and val.ndim == 1
+            return val
+        if is_cellmd_col:
+            assert col is not None
+            assert isinstance(val, str)
+            return self.__restore_cellmd(col, json.loads(val))
         if isinstance(sql_type, sql.JSON):
             return json.loads(val)
-        if media_col is not None:
-            return self.__relocate_media_file(media_col, val)
+        if is_media_col:
+            assert col is not None
+            return self.__relocate_media_file(col, val)
         return val
 
     def __relocate_media_file(self, media_col: catalog.Column, url: str) -> str:
@@ -629,3 +747,14 @@
             return self.media_files[url]
         # For any type of URL other than a local file, just return the URL as-is.
         return url
+
+    def __restore_cellmd(self, col: catalog.Column, cellmd: dict[str, Any]) -> dict[str, Any]:
+        cellmd_ = CellMd.from_dict(cellmd)
+        if cellmd_.file_urls is None:
+            return cellmd  # No changes
+
+        updated_urls: list[str] = []
+        for url in cellmd_.file_urls:
+            updated_urls.append(self.__relocate_media_file(col, url))
+        cellmd_.file_urls = updated_urls
+        return cellmd_.as_dict()
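
Note: __process_cellmd (export) and __restore_cellmd (import) follow the same round-trip: deserialize the cellmd dict, rewrite any file URLs, and re-serialize. A minimal standalone sketch of that pattern, using a hypothetical simplified stand-in rather than the real pixeltable.exprs.data_row.CellMd:

    from dataclasses import dataclass
    from typing import Any, Callable, Optional

    @dataclass
    class CellMdSketch:  # hypothetical stand-in for CellMd
        file_urls: Optional[list[str]] = None

        @classmethod
        def from_dict(cls, d: dict[str, Any]) -> 'CellMdSketch':
            return cls(file_urls=d.get('file_urls'))

        def as_dict(self) -> dict[str, Any]:
            return {'file_urls': self.file_urls}

    def rewrite_cellmd(cellmd: dict[str, Any], rewrite_url: Callable[[str], str]) -> dict[str, Any]:
        md = CellMdSketch.from_dict(cellmd)
        if md.file_urls is None:
            return cellmd  # nothing to rewrite
        md.file_urls = [rewrite_url(u) for u in md.file_urls]
        return md.as_dict()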
pixeltable/store.py CHANGED
@@ -321,7 +321,7 @@ class StoreBase:
             table_row, num_row_exc = row_builder.create_store_table_row(row, cols_with_excs, pk)
             num_excs += num_row_exc
 
-            if show_progress:
+            if show_progress and Env.get().verbosity >= 1:
                 if progress_bar is None:
                     warnings.simplefilter('ignore', category=TqdmWarning)
                     progress_bar = tqdm(
@@ -434,8 +434,7 @@
             *[c1 == c2 for c1, c2 in zip(self.rowid_columns(), filter_view.rowid_columns())],
         )
         stmt = (
-            sql.select('*')  # TODO: Use a more specific list of columns?
-            .select_from(self.sa_tbl)
+            sql.select(self.sa_tbl)
            .where(self.v_min_col <= version)
            .where(self.v_max_col > version)
            .where(sql.exists().where(filter_predicate))
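
Note on the select change above: in SQLAlchemy 2.x, passing a Table to select() produces an explicit column list for that table, replacing the older select('*').select_from(...) form. A small standalone illustration (toy table, not pixeltable's actual schema):

    import sqlalchemy as sql

    md = sql.MetaData()
    t = sql.Table('t', md, sql.Column('id', sql.Integer), sql.Column('v_min', sql.BigInteger))
    stmt = sql.select(t).where(t.c.v_min <= 5)  # selects t.id, t.v_min explicitly
    print(stmt)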
pixeltable/type_system.py CHANGED
@@ -25,6 +25,7 @@ import sqlalchemy as sql
 from typing_extensions import _AnnotatedAlias
 
 import pixeltable.exceptions as excs
+from pixeltable.env import Env
 from pixeltable.utils import parse_local_file_path
 
 
@@ -673,8 +674,9 @@ class TimestampType(ColumnType):
     def _create_literal(self, val: Any) -> Any:
         if isinstance(val, str):
             return datetime.datetime.fromisoformat(val)
-        if isinstance(val, datetime.datetime):
-            return val
+        # Place naive timestamps in the default time zone
+        if isinstance(val, datetime.datetime) and val.tzinfo is None:
+            return val.replace(tzinfo=Env.get().default_time_zone)
         return val
 
 
@@ -760,7 +762,7 @@ class JsonType(ColumnType):
 
     @classmethod
     def __is_valid_json(cls, val: Any) -> bool:
-        if val is None or isinstance(val, (str, int, float, bool)):
+        if val is None or isinstance(val, (str, int, float, bool, np.ndarray, PIL.Image.Image)):
             return True
         if isinstance(val, (list, tuple)):
             return all(cls.__is_valid_json(v) for v in val)
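
Note: the _create_literal change above localizes naive datetime literals to the configured default time zone instead of passing them through unchanged. A small standalone illustration of that behavior (the time zone here is an arbitrary example, not pixeltable's default):

    import datetime
    from zoneinfo import ZoneInfo

    default_tz = ZoneInfo('America/Los_Angeles')  # stand-in for Env.get().default_time_zone
    naive = datetime.datetime(2024, 1, 1, 12, 0)
    aware = naive.replace(tzinfo=default_tz) if naive.tzinfo is None else naive
    print(aware.isoformat())  # 2024-01-01T12:00:00-08:00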
pixeltable/utils/arrow.py CHANGED
@@ -16,7 +16,7 @@ if TYPE_CHECKING:
 PA_TO_PXT_TYPES: dict[pa.DataType, ts.ColumnType] = {
     pa.string(): ts.StringType(nullable=True),
     pa.large_string(): ts.StringType(nullable=True),
-    pa.timestamp('us', tz=datetime.timezone.utc): ts.TimestampType(nullable=True),
+    pa.timestamp('us', tz='UTC'): ts.TimestampType(nullable=True),
     pa.bool_(): ts.BoolType(nullable=True),
     pa.int8(): ts.IntType(nullable=True),
     pa.int16(): ts.IntType(nullable=True),
@@ -35,7 +35,7 @@ PA_TO_PXT_TYPES: dict[pa.DataType, ts.ColumnType] = {
 
 PXT_TO_PA_TYPES: dict[type[ts.ColumnType], pa.DataType] = {
     ts.StringType: pa.string(),
-    ts.TimestampType: pa.timestamp('us', tz=datetime.timezone.utc),  # postgres timestamp is microseconds
+    ts.TimestampType: pa.timestamp('us', tz='UTC'),  # postgres timestamp is microseconds
     ts.DateType: pa.date32(),  # This could be date64
     ts.BoolType: pa.bool_(),
     ts.IntType: pa.int64(),
@@ -61,7 +61,7 @@ def to_pixeltable_type(arrow_type: pa.DataType, nullable: bool) -> Optional[ts.C
         dtype = to_pixeltable_type(arrow_type.value_type, nullable)
         if dtype is None:
             return None
-        return ts.ArrayType(shape=arrow_type.shape, dtype=dtype, nullable=nullable)
+        return ts.ArrayType(shape=tuple(arrow_type.shape), dtype=dtype, nullable=nullable)
     else:
         return None
 
@@ -92,7 +92,7 @@ def to_pxt_schema(
 
 
 def to_arrow_schema(pixeltable_schema: dict[str, Any]) -> pa.Schema:
-    return pa.schema((name, to_arrow_type(typ)) for name, typ in pixeltable_schema.items())  # type: ignore[misc]
+    return pa.schema((name, to_arrow_type(typ)) for name, typ in pixeltable_schema.items())
 
 
 def _to_record_batch(column_vals: dict[str, list[Any]], schema: pa.Schema) -> pa.RecordBatch:
@@ -106,7 +106,7 @@ def _to_record_batch(column_vals: dict[str, list[Any]], schema: pa.Schema) -> pa
         else:
             pa_array = cast(pa.Array, pa.array(column_vals[field.name]))
         pa_arrays.append(pa_array)
-    return pa.RecordBatch.from_arrays(pa_arrays, schema=schema)  # type: ignore
+    return pa.RecordBatch.from_arrays(pa_arrays, schema=schema)
 
 
 def to_record_batches(df: 'pxt.DataFrame', batch_size_bytes: int) -> Iterator[pa.RecordBatch]:
@@ -192,7 +192,7 @@ def to_pydict(batch: pa.Table | pa.RecordBatch) -> dict[str, list | np.ndarray]:
         col = batch.column(k)
         if isinstance(col.type, pa.FixedShapeTensorType):
             # treat array columns as numpy arrays to easily preserve numpy type
-            out[name] = col.to_numpy(zero_copy_only=False)  # type: ignore[call-arg]
+            out[name] = col.to_numpy(zero_copy_only=False)
         else:
             # for the rest, use pydict to preserve python types
             out[name] = col.to_pylist()
pixeltable/utils/av.py CHANGED
@@ -3,6 +3,8 @@ from typing import Any
 import av
 import av.stream
 
+from pixeltable.env import Env
+
 
 def get_metadata(path: str) -> dict:
     with av.open(path) as container:
@@ -109,3 +111,66 @@ def ffmpeg_clip_cmd(input_path: str, output_path: str, start_time: float, durati
         ]
     )
     return cmd
+
+
+def ffmpeg_segment_cmd(
+    input_path: str,
+    output_pattern: str,
+    segment_duration: float | None = None,
+    segment_times: list[float] | None = None,
+    video_encoder: str | None = None,
+    video_encoder_args: dict[str, Any] | None = None,
+) -> list[str]:
+    """Commandline for frame-accurate segmentation"""
+    assert (segment_duration is None) != (segment_times is None)
+    if video_encoder is None:
+        video_encoder = Env.get().default_video_encoder
+
+    cmd = [
+        'ffmpeg',
+        '-i',
+        input_path,
+        '-f',
+        'segment',  # Use segment muxer
+    ]
+
+    if segment_duration is not None:
+        cmd.extend(
+            [
+                '-segment_time',
+                str(segment_duration),  # Target segment duration
+                '-break_non_keyframes',
+                '1',  # need to break at non-keyframes to get frame-accurate segments
+                '-force_key_frames',
+                f'expr:gte(t,n_forced*{segment_duration})',  # Force keyframe at each segment boundary
+            ]
+        )
+    else:
+        assert segment_times is not None
+        times_str = ','.join([str(t) for t in segment_times])
+        cmd.extend(['-segment_times', times_str, '-force_key_frames', times_str])
+
+    cmd.extend(
+        [
+            '-reset_timestamps',
+            '1',  # Reset timestamps for each segment
+            '-map',
+            '0',  # Copy all streams from input
+            '-c:a',
+            'copy',  # don't re-encode audio
+            '-c:v',
+            video_encoder,  # re-encode video
+        ]
+    )
+    if video_encoder_args is not None:
+        for k, v in video_encoder_args.items():
+            cmd.extend([f'-{k}', str(v)])
+
+    cmd.extend(
+        [
+            '-loglevel',
+            'error',  # Only show errors
+            output_pattern,
+        ]
+    )
+    return cmd
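
Note: a hypothetical usage sketch for the new ffmpeg_segment_cmd helper; the paths, encoder and encoder args below are placeholders, and ffmpeg must be on PATH:

    import subprocess

    cmd = ffmpeg_segment_cmd(
        input_path='input.mp4',
        output_pattern='segment_%03d.mp4',   # segment muxer output pattern
        segment_duration=10.0,               # ~10s frame-accurate segments
        video_encoder='libx264',             # otherwise the configured default encoder is used
        video_encoder_args={'crf': '23'},    # forwarded as '-crf 23'
    )
    subprocess.run(cmd, check=True)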
pixeltable/utils/console_output.py CHANGED
@@ -1,6 +1,8 @@
 import logging
 from typing import TextIO
 
+from pixeltable import exceptions as excs
+
 
 def map_level(verbosity: int) -> int:
     """
@@ -19,7 +21,8 @@ def map_level(verbosity: int) -> int:
         return logging.INFO
     if verbosity == 2:
         return logging.DEBUG
-    return logging.INFO
+
+    raise excs.Error(f'Invalid verbosity level: {verbosity}')
 
 
 class ConsoleOutputHandler(logging.StreamHandler):
pixeltable/utils/exception_handler.py CHANGED
@@ -1,32 +1,9 @@
 import logging
-import sys
 from typing import Any, Callable, Optional, TypeVar
 
 R = TypeVar('R')
 
-
-def _is_in_exception() -> bool:
-    """
-    Check if code is currently executing within an exception context.
-    """
-    current_exception = sys.exc_info()[1]
-    return current_exception is not None
-
-
-def run_cleanup_on_exception(cleanup_func: Callable[..., R], *args: Any, **kwargs: Any) -> Optional[R]:
-    """
-    Runs cleanup only when running in exception context.
-
-    The function `run_cleanup_on_exception()` should be used to clean up resources when an operation fails.
-    This is typically done using a try, except, and finally block, with the resource cleanup logic placed within
-    the except block. However, this pattern may not handle KeyboardInterrupt exceptions.
-    To ensure that resources are always cleaned up at least once when an exception or KeyboardInterrupt occurs,
-    create an idempotent function for cleaning up resources and pass it to the `run_cleanup_on_exception()` function
-    from the finally block.
-    """
-    if _is_in_exception():
-        return run_cleanup(cleanup_func, *args, raise_error=False, **kwargs)
-    return None
+logger = logging.getLogger('pixeltable')
 
 
 def run_cleanup(cleanup_func: Callable[..., R], *args: Any, raise_error: bool = True, **kwargs: Any) -> Optional[R]:
@@ -40,20 +17,20 @@ def run_cleanup(cleanup_func: Callable[..., R], *args: Any, raise_error: bool =
         raise_error: raise an exception if an error occurs during cleanup.
     """
     try:
-        logging.debug(f'Running cleanup function: {cleanup_func.__name__!r}')
+        logger.debug(f'Running cleanup function: {cleanup_func.__name__!r}')
         return cleanup_func(*args, **kwargs)
     except KeyboardInterrupt as interrupt:
         # Save original exception and re-attempt cleanup
         original_exception = interrupt
-        logging.debug(f'Cleanup {cleanup_func.__name__!r} interrupted, retrying')
+        logger.debug(f'Cleanup {cleanup_func.__name__!r} interrupted, retrying')
         try:
             return cleanup_func(*args, **kwargs)
         except Exception as e:
             # Suppress this exception
-            logging.error(f'Cleanup {cleanup_func.__name__!r} failed with exception {e}')
+            logger.error(f'Cleanup {cleanup_func.__name__!r} failed with exception {e.__class__}: {e}')
             raise KeyboardInterrupt from original_exception
     except Exception as e:
-        logging.error(f'Cleanup {cleanup_func.__name__!r} failed with exception {e}')
+        logger.error(f'Cleanup {cleanup_func.__name__!r} failed with exception {e.__class__}: {e}')
         if raise_error:
             raise e
         return None
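
Note: a minimal usage sketch for run_cleanup (defined above in pixeltable/utils/exception_handler.py), pairing it with an idempotent cleanup function called from a finally block; the work and cleanup functions here are hypothetical:

    def remove_temp_files() -> None:
        # Idempotent cleanup; safe to call more than once.
        ...

    def do_work() -> None:
        raise RuntimeError('simulated failure')

    try:
        do_work()
    except RuntimeError:
        pass
    finally:
        # raise_error=False suppresses cleanup errors so they never mask the original failure
        run_cleanup(remove_temp_files, raise_error=False)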
pixeltable/utils/image.py ADDED
@@ -0,0 +1,7 @@
+import PIL.Image
+
+
+def default_format(img: PIL.Image.Image) -> str:
+    # Default to JPEG unless the image has a transparency layer (which isn't supported by JPEG).
+    # In that case, use WebP instead.
+    return 'webp' if img.has_transparency_data else 'jpeg'
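
Note: a hypothetical usage sketch for default_format (from the new pixeltable/utils/image.py above); the image is synthetic and WebP encoding requires a Pillow build with WebP support:

    import io
    import PIL.Image

    img = PIL.Image.new('RGBA', (64, 64), (255, 0, 0, 128))  # has an alpha channel
    fmt = default_format(img)  # 'webp', since JPEG cannot store transparency
    buf = io.BytesIO()
    img.save(buf, format=fmt)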
pixeltable/utils/misc.py ADDED
@@ -0,0 +1,5 @@
+from typing import Any
+
+
+def non_none_dict_factory(d: list[tuple[str, Any]]) -> dict:
+    return {k: v for (k, v) in d if v is not None}
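
Note: non_none_dict_factory matches the dict_factory hook of dataclasses.asdict(); a plausible use is dropping None-valued fields when serializing a dataclass (the Example class below is hypothetical):

    import dataclasses
    from typing import Optional

    @dataclasses.dataclass
    class Example:
        name: str
        comment: Optional[str] = None

    d = dataclasses.asdict(Example(name='a'), dict_factory=non_none_dict_factory)
    assert d == {'name': 'a'}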
pixeltable/utils/object_stores.py CHANGED
@@ -22,6 +22,7 @@ class StorageTarget(enum.Enum):
     LOCAL_STORE = 'os'  # Local file system
     S3_STORE = 's3'  # Amazon S3
     R2_STORE = 'r2'  # Cloudflare R2
+    B2_STORE = 'b2'  # Backblaze B2
     GCS_STORE = 'gs'  # Google Cloud Storage
     AZURE_STORE = 'az'  # Azure Blob Storage
     HTTP_STORE = 'http'  # HTTP/HTTPS
@@ -63,6 +64,7 @@ class StorageObjectAddress(NamedTuple):
         StorageTarget.LOCAL_STORE,
         StorageTarget.S3_STORE,
         StorageTarget.R2_STORE,
+        StorageTarget.B2_STORE,
         StorageTarget.GCS_STORE,
         StorageTarget.AZURE_STORE,
         StorageTarget.HTTP_STORE,
@@ -218,15 +220,23 @@ class ObjectPath:
         # Standard HTTP(S) URL format
         # https://account.blob.core.windows.net/container/<optional path>/<optional object>
         # https://account.r2.cloudflarestorage.com/container/<optional path>/<optional object>
+        # https://s3.us-west-004.backblazeb2.com/container/<optional path>/<optional object>
         # and possibly others
         key = parsed.path
         if 'cloudflare' in parsed.netloc:
             storage_target = StorageTarget.R2_STORE
+        elif 'backblazeb2' in parsed.netloc:
+            storage_target = StorageTarget.B2_STORE
         elif 'windows' in parsed.netloc:
             storage_target = StorageTarget.AZURE_STORE
         else:
             storage_target = StorageTarget.HTTP_STORE
-        if storage_target in [StorageTarget.S3_STORE, StorageTarget.AZURE_STORE, StorageTarget.R2_STORE]:
+        if storage_target in (
+            StorageTarget.S3_STORE,
+            StorageTarget.AZURE_STORE,
+            StorageTarget.R2_STORE,
+            StorageTarget.B2_STORE,
+        ):
             account_name = parsed.netloc.split('.', 1)[0]
             account_extension = parsed.netloc.split('.', 1)[1]
             path_parts = key.lstrip('/').split('/', 1)
@@ -370,6 +380,11 @@ class ObjectOps:
             env.Env.get().require_package('boto3')
             from pixeltable.utils.s3_store import S3Store
 
+            return S3Store(soa)
+        if soa.storage_target == StorageTarget.B2_STORE:
+            env.Env.get().require_package('boto3')
+            from pixeltable.utils.s3_store import S3Store
+
             return S3Store(soa)
         if soa.storage_target == StorageTarget.GCS_STORE and soa.scheme == 'gs':
             env.Env.get().require_package('google.cloud.storage')