pixeltable 0.4.15__py3-none-any.whl → 0.4.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +4 -0
- pixeltable/catalog/catalog.py +125 -63
- pixeltable/catalog/column.py +7 -2
- pixeltable/catalog/table.py +1 -0
- pixeltable/catalog/table_metadata.py +4 -0
- pixeltable/catalog/table_version.py +174 -117
- pixeltable/catalog/table_version_handle.py +4 -1
- pixeltable/catalog/table_version_path.py +0 -11
- pixeltable/catalog/view.py +6 -0
- pixeltable/config.py +7 -0
- pixeltable/dataframe.py +10 -5
- pixeltable/env.py +56 -19
- pixeltable/exec/__init__.py +2 -0
- pixeltable/exec/cell_materialization_node.py +231 -0
- pixeltable/exec/cell_reconstruction_node.py +135 -0
- pixeltable/exec/exec_node.py +1 -1
- pixeltable/exec/expr_eval/evaluators.py +1 -0
- pixeltable/exec/expr_eval/expr_eval_node.py +3 -0
- pixeltable/exec/expr_eval/globals.py +2 -0
- pixeltable/exec/globals.py +32 -0
- pixeltable/exec/object_store_save_node.py +1 -4
- pixeltable/exec/row_update_node.py +16 -9
- pixeltable/exec/sql_node.py +107 -14
- pixeltable/exprs/__init__.py +1 -1
- pixeltable/exprs/arithmetic_expr.py +23 -18
- pixeltable/exprs/column_property_ref.py +10 -10
- pixeltable/exprs/column_ref.py +2 -2
- pixeltable/exprs/data_row.py +106 -37
- pixeltable/exprs/expr.py +9 -0
- pixeltable/exprs/expr_set.py +14 -7
- pixeltable/exprs/inline_expr.py +2 -19
- pixeltable/exprs/json_path.py +45 -12
- pixeltable/exprs/row_builder.py +54 -22
- pixeltable/functions/__init__.py +1 -0
- pixeltable/functions/bedrock.py +7 -0
- pixeltable/functions/deepseek.py +11 -4
- pixeltable/functions/llama_cpp.py +7 -0
- pixeltable/functions/math.py +1 -1
- pixeltable/functions/ollama.py +7 -0
- pixeltable/functions/openai.py +4 -4
- pixeltable/functions/openrouter.py +143 -0
- pixeltable/functions/video.py +110 -28
- pixeltable/globals.py +10 -4
- pixeltable/io/globals.py +18 -17
- pixeltable/io/parquet.py +1 -1
- pixeltable/io/table_data_conduit.py +47 -22
- pixeltable/iterators/document.py +61 -23
- pixeltable/iterators/video.py +126 -53
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_40.py +73 -0
- pixeltable/metadata/notes.py +1 -0
- pixeltable/plan.py +175 -46
- pixeltable/share/packager.py +155 -26
- pixeltable/store.py +2 -3
- pixeltable/type_system.py +5 -3
- pixeltable/utils/arrow.py +6 -6
- pixeltable/utils/av.py +65 -0
- pixeltable/utils/console_output.py +4 -1
- pixeltable/utils/exception_handler.py +5 -28
- pixeltable/utils/image.py +7 -0
- pixeltable/utils/misc.py +5 -0
- pixeltable/utils/object_stores.py +16 -1
- pixeltable/utils/s3_store.py +44 -11
- {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/METADATA +29 -28
- {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/RECORD +68 -61
- {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/entry_points.txt +0 -0
- {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/licenses/LICENSE +0 -0
pixeltable/share/packager.py
CHANGED
@@ -1,5 +1,4 @@
 import base64
-import datetime
 import io
 import json
 import logging
@@ -13,6 +12,7 @@ from uuid import UUID
 
 import more_itertools
 import numpy as np
+import pgvector.sqlalchemy as sql_vector  # type: ignore[import-untyped]
 import PIL.Image
 import pyarrow as pa
 import pyarrow.parquet as pq
@@ -21,6 +21,7 @@ import sqlalchemy as sql
 import pixeltable as pxt
 from pixeltable import catalog, exceptions as excs, metadata, type_system as ts
 from pixeltable.env import Env
+from pixeltable.exprs.data_row import CellMd
 from pixeltable.metadata import schema
 from pixeltable.utils import sha256sum
 from pixeltable.utils.formatter import Formatter
@@ -109,9 +110,12 @@ class TablePackager:
         assert any(tv.id == base.id for base in self.table._tbl_version_path.get_tbl_versions())
         sql_types = {col.name: col.type for col in tv.store_tbl.sa_tbl.columns}
         media_cols: set[str] = set()
+        cellmd_cols: set[str] = set()
         for col in tv.cols:
             if col.is_stored and col.col_type.is_media_type():
                 media_cols.add(col.store_name())
+            if col.stores_cellmd:
+                cellmd_cols.add(col.cellmd_store_name())
 
         parquet_schema = self.__to_parquet_schema(tv.store_tbl.sa_tbl)
         # TODO: Partition larger tables into multiple parquet files. (The parquet file naming scheme anticipates
@@ -126,10 +130,10 @@ class TablePackager:
         # excessive memory usage. The pyarrow tables are then amalgamated into the (single) Parquet table on disk.
         # We use snappy compression for the Parquet tables; the entire bundle will be bzip2-compressed later, so
         # faster compression should provide good performance while still reducing temporary storage utilization.
-        parquet_writer = pq.ParquetWriter(parquet_file, parquet_schema, compression='
+        parquet_writer = pq.ParquetWriter(parquet_file, parquet_schema, compression='snappy')
         filter_tv = self.table._tbl_version_path.tbl_version.get()
         row_iter = tv.store_tbl.dump_rows(tv.version, filter_tv.store_tbl, filter_tv.version)
-        for pa_table in self.__to_pa_tables(row_iter, sql_types, media_cols, parquet_schema):
+        for pa_table in self.__to_pa_tables(row_iter, sql_types, media_cols, cellmd_cols, parquet_schema):
             parquet_writer.write_table(pa_table)
         parquet_writer.close()
 
@@ -138,7 +142,7 @@ class TablePackager:
     @classmethod
     def __to_parquet_schema(cls, store_tbl: sql.Table) -> pa.Schema:
         entries = [(col_name, cls.__to_parquet_type(col.type)) for col_name, col in store_tbl.columns.items()]
-        return pa.schema(entries)
+        return pa.schema(entries)
 
     @classmethod
     def __to_parquet_type(cls, col_type: sql.types.TypeEngine[Any]) -> pa.DataType:
@@ -151,13 +155,17 @@ class TablePackager:
         if isinstance(col_type, sql.Float):
             return pa.float32()
         if isinstance(col_type, sql.TIMESTAMP):
-            return pa.timestamp('us', tz=
+            return pa.timestamp('us', tz='UTC')
         if isinstance(col_type, sql.Date):
             return pa.date32()
         if isinstance(col_type, sql.JSON):
             return pa.string()  # JSON will be exported as strings
         if isinstance(col_type, sql.LargeBinary):
             return pa.binary()
+        if isinstance(col_type, sql_vector.Vector):
+            # Parquet/pyarrow do not handle null values properly for fixed_shape_tensor(), so we have to use list_()
+            # here instead.
+            return pa.list_(pa.float32())
         raise AssertionError(f'Unrecognized SQL type: {col_type} (type {type(col_type)})')
 
     def __to_pa_tables(
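The comment in the new Vector branch is the reason embeddings are exported as pa.list_(pa.float32()) rather than a fixed-shape tensor type: list columns round-trip NULL cells cleanly. A minimal standalone sketch of that behavior (the file path is only illustrative):

import pyarrow as pa
import pyarrow.parquet as pq

# An embedding column containing a NULL cell, stored as variable-length float32 lists.
schema = pa.schema([('embedding', pa.list_(pa.float32()))])
tbl = pa.table({'embedding': [[0.1, 0.2], None, [0.3, 0.4]]}, schema=schema)
pq.write_table(tbl, '/tmp/embeddings.parquet', compression='snappy')
print(pq.read_table('/tmp/embeddings.parquet').column('embedding').to_pylist())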
@@ -165,6 +173,7 @@ class TablePackager:
         row_iter: Iterator[dict[str, Any]],
         sql_types: dict[str, sql.types.TypeEngine[Any]],
         media_cols: set[str],
+        cellmd_cols: set[str],
         arrow_schema: pa.Schema,
         batch_size: int = 1_000,
     ) -> Iterator[pa.Table]:
@@ -176,14 +185,21 @@ class TablePackager:
         for rows in more_itertools.batched(row_iter, batch_size):
             cols = {}
             for name, sql_type in sql_types.items():
-
-
+                values = [
+                    self.__to_pa_value(row.get(name), sql_type, name in media_cols, name in cellmd_cols) for row in rows
+                ]
                 cols[name] = values
             yield pa.Table.from_pydict(cols, schema=arrow_schema)
 
-    def __to_pa_value(
+    def __to_pa_value(
+        self, val: Any, sql_type: sql.types.TypeEngine[Any], is_media_col: bool, is_cellmd_col: bool
+    ) -> Any:
         if val is None:
             return None
+        if is_cellmd_col:
+            assert isinstance(val, dict)
+            # Export JSON as strings
+            return json.dumps(self.__process_cellmd(val))
         if isinstance(sql_type, sql.JSON):
             # Export JSON as strings
             return json.dumps(val)
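The batching pattern above can be sketched in isolation: rows are grouped into fixed-size batches, pivoted into per-column lists, and each batch becomes one pyarrow table. The schema and column names below are made up for illustration.

import more_itertools
import pyarrow as pa

schema = pa.schema([('rowid', pa.int64()), ('col_0', pa.string())])
rows = ({'rowid': i, 'col_0': f'value {i}'} for i in range(5))
for batch in more_itertools.batched(rows, 2):
    cols = {name: [row[name] for row in batch] for name in ('rowid', 'col_0')}
    tbl = pa.Table.from_pydict(cols, schema=schema)
    print(tbl.num_rows)  # 2, 2, 1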
@@ -194,6 +210,10 @@ class TablePackager:
         return val
 
     def __process_media_url(self, url: str) -> str:
+        """
+        Process a media URL for export. If it's a local file URL (file://), then replace it with a pxtmedia:// URI,
+        copying the file into the tarball if necessary. If it's any other type of URL, return it unchanged.
+        """
         parsed_url = urllib.parse.urlparse(url)
         if parsed_url.scheme == 'file':
             # It's the URL of a local file. Replace it with a pxtmedia:// URI.
@@ -214,6 +234,21 @@ class TablePackager:
         # For any type of URL other than a local file, just return the URL as-is.
         return url
 
+    def __process_cellmd(self, cellmd: dict[str, Any]) -> dict[str, Any]:
+        """
+        Process a cellmd dictionary for export. This involves replacing any local file references
+        with pxtmedia:// URIs, as described above.
+        """
+        cellmd_ = CellMd.from_dict(cellmd)
+        if cellmd_.file_urls is None:
+            return cellmd  # No changes
+
+        updated_urls: list[str] = []
+        for url in cellmd_.file_urls:
+            updated_urls.append(self.__process_media_url(url))
+        cellmd_.file_urls = updated_urls
+        return cellmd_.as_dict()
+
     def __build_tarball(self) -> Path:
         bundle_path = self.tmp_dir / 'bundle.tar.bz2'
         with tarfile.open(bundle_path, 'w:bz2') as tf:
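To show the effect of __process_cellmd without relying on Pixeltable internals, here is a hedged standalone sketch of the same rewrite: file:// entries in a cell-metadata dict are swapped for pxtmedia:// references, while remote URLs pass through unchanged. The archive path 'media/0001.jpg' is a made-up placeholder for the tarball entry.

import urllib.parse

def rewrite_url(url: str) -> str:
    parsed = urllib.parse.urlparse(url)
    if parsed.scheme == 'file':
        return 'pxtmedia://media/0001.jpg'  # placeholder for the file copied into the tarball
    return url

cellmd = {'file_urls': ['file:///home/user/.pixeltable/media/0001.jpg', 's3://bucket/key.jpg']}
cellmd['file_urls'] = [rewrite_url(u) for u in cellmd['file_urls']]
print(cellmd['file_urls'])  # ['pxtmedia://media/0001.jpg', 's3://bucket/key.jpg']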
@@ -409,6 +444,9 @@ class TableRestorer:
         # 2. "rectify" the v_max values in both the temporary table and the existing table (more on this below);
         # 3. Delete any row instances from the temporary table that are already present in the existing table;
         # 4. Copy the remaining rows from the temporary table into the existing table.
+        # 5. Rectify any index columns.
+
+        # STEP 1: Import the parquet data into a temporary table.
 
         # Create a temporary table for the initial data load, containing columns for all columns present in the
         # parquet table. The parquet columns have identical names to those in the store table, so we can use the
@@ -416,7 +454,7 @@ class TableRestorer:
         # e.g., pa.string() may hold either VARCHAR or serialized JSONB).
         temp_cols: dict[str, sql.Column] = {}
         for field in parquet_table.schema:
-            assert field.name in store_sa_tbl.columns
+            assert field.name in store_sa_tbl.columns, f'{field.name} not in {list(store_sa_tbl.columns)}'
             col_type = store_sa_tbl.columns[field.name].type
             temp_cols[field.name] = sql.Column(field.name, col_type)
         temp_sa_tbl_name = f'temp_{uuid.uuid4().hex}'
@@ -432,6 +470,8 @@ class TableRestorer:
             rows = self.__from_pa_pydict(tv, pydict)
             conn.execute(sql.insert(temp_sa_tbl), rows)
 
+        # STEP 2: Rectify v_max values.
+
         # Each row version is identified uniquely by its pk, a tuple (row_id, pos_0, pos_1, ..., pos_k, v_min).
         # Conversely, v_max is not part of the primary key, but is simply a bookkeeping device.
         # In an original table, v_max is always equal to the v_min of the succeeding row instance with the same
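As a worked illustration of the versioning scheme these comments describe (a toy sketch, not Pixeltable code; the MAX sentinel below is an assumption): each stored row instance is keyed by (rowid, ..., v_min), and an instance is visible at a given version exactly when v_min <= version < v_max.

MAX = 2**63 - 1  # stand-in for the "still current" sentinel

history = [
    {'rowid': 7, 'v_min': 2, 'v_max': 5, 'val': 'a'},    # created at version 2, superseded at version 5
    {'rowid': 7, 'v_min': 5, 'v_max': MAX, 'val': 'b'},  # current instance
]

def visible_at(version: int) -> list[dict]:
    return [r for r in history if r['v_min'] <= version < r['v_max']]

assert visible_at(3)[0]['val'] == 'a'
assert visible_at(6)[0]['val'] == 'b'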
@@ -540,6 +580,8 @@ class TableRestorer:
         result = conn.execute(q)
         _logger.debug(f'Rectified {result.rowcount} row(s) in {store_sa_tbl_name!r}.')
 
+        # STEP 3: Delete any row instances from the temporary table that are already present in the existing table.
+
         # Now we need to update rows in the existing table that are also present in the temporary table. This is to
         # account for the scenario where the temporary table has columns that are not present in the existing table.
         # (We can't simply replace the rows with their versions in the temporary table, because the converse scenario
@@ -570,7 +612,9 @@ class TableRestorer:
         result = conn.execute(q)
         _logger.debug(f'Deleted {result.rowcount} row(s) from {temp_sa_tbl_name!r}.')
 
-        #
+        # STEP 4: Copy the remaining rows from the temporary table into the existing table.
+
+        # Now copy the remaining data (consisting entirely of new row instances) from the temporary table into
         # the actual table.
         q = store_sa_tbl.insert().from_select(
             [store_sa_tbl.c[col_name] for col_name in temp_cols], sql.select(*temp_cols.values())
@@ -579,39 +623,113 @@ class TableRestorer:
         result = conn.execute(q)
         _logger.debug(f'Inserted {result.rowcount} row(s) from {temp_sa_tbl_name!r} into {store_sa_tbl_name!r}.')
 
+        # STEP 5: Rectify any index columns.
+
+        # Finally, rectify any index columns in the table. This involves shuffling data between the index's val and
+        # undo columns to ensure they appropriately reflect the most recent replicated version of the table.
+
+        # Get the most recent replicated version of the table. This might be the version we're currently importing,
+        # but it might be a different version of the table that was previously imported.
+        head_version_md = catalog.Catalog.get()._collect_tbl_history(tv.id, n=1)[0]
+        head_version = head_version_md.version_md.version
+        _logger.debug(f'Head version for index rectification is {head_version}.')
+
+        # Get the index info from the table metadata. Here we use the tbl_md that we just collected from the DB.
+        # This is to ensure we pick up ALL indices, including dropped indices and indices that are present in
+        # a previously replicated version of the table, but not in the one currently being imported.
+        index_md = head_version_md.tbl_md.index_md
+
+        # Now update the table. We can do this for all indices together with just two SQL queries. For each index,
+        # at most one of the val or undo columns will be non-NULL in any given row.
+        # For rows where v_min <= head_version < v_max, we set, for all indices:
+        #     val_col = whichever of (val_col, undo_col) is non-NULL (or NULL if both are, e.g., for a dropped index)
+        #     undo_col = NULL
+        # For rows where head_version < v_min or v_max <= head_version, vice versa.
+        val_sql_clauses: dict[str, sql.ColumnElement] = {}
+        undo_sql_clauses: dict[str, sql.ColumnElement] = {}
+        for index in index_md.values():
+            if index.class_fqn.endswith('.EmbeddingIndex'):
+                val_col_name = f'col_{index.index_val_col_id}'
+                undo_col_name = f'col_{index.index_val_undo_col_id}'
+                # Check that the val column for the index is actually present in the store table. We need to do this
+                # to properly handle the case where the replica represents a table version that was *not* the most
+                # recent version at the time it was published. In that case, it is possible for tbl_md to contain
+                # metadata for indices not known to any version that has been replicated. (However, the converse
+                # *does* hold: all replicated indices must have metadata in tbl_md; and that's what's important.)
+                if val_col_name in store_sa_tbl.c:
+                    assert undo_col_name in store_sa_tbl.c
+                    coalesce = sql.func.coalesce(store_sa_tbl.c[val_col_name], store_sa_tbl.c[undo_col_name])
+                    val_sql_clauses[val_col_name] = coalesce
+                    val_sql_clauses[undo_col_name] = sql.null()
+                    undo_sql_clauses[undo_col_name] = coalesce
+                    undo_sql_clauses[val_col_name] = sql.null()
+
+        if len(val_sql_clauses) > 0:
+            q2 = (
+                store_sa_tbl.update()
+                .values(**val_sql_clauses)
+                .where(sql.and_(tv.store_tbl.v_min_col <= head_version, tv.store_tbl.v_max_col > head_version))
+            )
+            _logger.debug(q2.compile())
+            _ = conn.execute(q2)
+            q2 = (
+                store_sa_tbl.update()
+                .values(**undo_sql_clauses)
+                .where(sql.or_(tv.store_tbl.v_min_col > head_version, tv.store_tbl.v_max_col <= head_version))
+            )
+            _logger.debug(q2.compile())
+            _ = conn.execute(q2)
+            _logger.debug(f'Rectified index columns in {store_sa_tbl_name!r}.')
+        else:
+            _logger.debug(f'No index columns to rectify in {store_sa_tbl_name!r}.')
+
     def __from_pa_pydict(self, tv: catalog.TableVersion, pydict: dict[str, Any]) -> list[dict[str, Any]]:
         # Data conversions from pyarrow to Pixeltable
         sql_types: dict[str, sql.types.TypeEngine[Any]] = {}
         for col_name in pydict:
             assert col_name in tv.store_tbl.sa_tbl.columns
             sql_types[col_name] = tv.store_tbl.sa_tbl.columns[col_name].type
-
-        for col in tv.cols
-            if col.is_stored and col.col_type.is_media_type():
-                assert tv.id == col.tbl.id
-                assert tv.version == col.tbl.version
-                media_cols[col.store_name()] = col
+        stored_cols: dict[str, catalog.Column] = {col.store_name(): col for col in tv.cols if col.is_stored}
+        stored_cols |= {col.cellmd_store_name(): col for col in tv.cols if col.stores_cellmd}
 
         row_count = len(next(iter(pydict.values())))
-        rows: list[dict[str, Any]] = []
-        for
-
-
-
-
-
+        rows: list[dict[str, Any]] = [{} for _ in range(row_count)]
+        for col_name, col_vals in pydict.items():
+            assert len(col_vals) == row_count
+            col = stored_cols.get(col_name)  # Will be None for system columns
+            is_media_col = col is not None and col.is_stored and col.col_type.is_media_type()
+            is_cellmd_col = col is not None and col.stores_cellmd and col_name == col.cellmd_store_name()
+            assert col is None or is_cellmd_col or col_name == col.store_name()
+
+            for i, val in enumerate(col_vals):
+                rows[i][col_name] = self.__from_pa_value(val, sql_types[col_name], col, is_media_col, is_cellmd_col)
 
         return rows
 
     def __from_pa_value(
-        self,
+        self,
+        val: Any,
+        sql_type: sql.types.TypeEngine[Any],
+        col: Optional[catalog.Column],
+        is_media_col: bool,
+        is_cellmd_col: bool,
     ) -> Any:
         if val is None:
             return None
+        if isinstance(sql_type, sql_vector.Vector):
+            if isinstance(val, list):
+                val = np.array(val, dtype=np.float32)
+            assert isinstance(val, np.ndarray) and val.dtype == np.float32 and val.ndim == 1
+            return val
+        if is_cellmd_col:
+            assert col is not None
+            assert isinstance(val, str)
+            return self.__restore_cellmd(col, json.loads(val))
         if isinstance(sql_type, sql.JSON):
             return json.loads(val)
-        if
-
+        if is_media_col:
+            assert col is not None
+            return self.__relocate_media_file(col, val)
         return val
 
     def __relocate_media_file(self, media_col: catalog.Column, url: str) -> str:
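Schematically, the two updates built above compile to SQL along these lines. The table and column names (store_tbl, col_10, col_11, v_min, v_max) are made up for a single embedding index; the real statements are generated by SQLAlchemy over all indices at once.

# Rows visible at the head version: index data moves into the val column.
rectify_visible = '''
    UPDATE store_tbl
    SET col_10 = COALESCE(col_10, col_11), col_11 = NULL
    WHERE v_min <= :head_version AND v_max > :head_version
'''
# All other rows: index data moves into the undo column.
rectify_hidden = '''
    UPDATE store_tbl
    SET col_11 = COALESCE(col_10, col_11), col_10 = NULL
    WHERE v_min > :head_version OR v_max <= :head_version
'''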
@@ -629,3 +747,14 @@ class TableRestorer:
             return self.media_files[url]
         # For any type of URL other than a local file, just return the URL as-is.
         return url
+
+    def __restore_cellmd(self, col: catalog.Column, cellmd: dict[str, Any]) -> dict[str, Any]:
+        cellmd_ = CellMd.from_dict(cellmd)
+        if cellmd_.file_urls is None:
+            return cellmd  # No changes
+
+        updated_urls: list[str] = []
+        for url in cellmd_.file_urls:
+            updated_urls.append(self.__relocate_media_file(col, url))
+        cellmd_.file_urls = updated_urls
+        return cellmd_.as_dict()
pixeltable/store.py
CHANGED
@@ -321,7 +321,7 @@ class StoreBase:
             table_row, num_row_exc = row_builder.create_store_table_row(row, cols_with_excs, pk)
             num_excs += num_row_exc
 
-            if show_progress:
+            if show_progress and Env.get().verbosity >= 1:
                 if progress_bar is None:
                     warnings.simplefilter('ignore', category=TqdmWarning)
                     progress_bar = tqdm(
@@ -434,8 +434,7 @@ class StoreBase:
             *[c1 == c2 for c1, c2 in zip(self.rowid_columns(), filter_view.rowid_columns())],
         )
         stmt = (
-            sql.select(
-            .select_from(self.sa_tbl)
+            sql.select(self.sa_tbl)
            .where(self.v_min_col <= version)
            .where(self.v_max_col > version)
            .where(sql.exists().where(filter_predicate))
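The second hunk switches to the SQLAlchemy 2.x form sql.select(table), which selects all of the table's columns directly instead of chaining .select_from(). A self-contained sketch with a made-up table:

import sqlalchemy as sql

md = sql.MetaData()
tbl = sql.Table('t', md, sql.Column('id', sql.Integer), sql.Column('v_min', sql.BigInteger))
stmt = sql.select(tbl).where(tbl.c.v_min <= 5)
print(stmt)  # SELECT t.id, t.v_min FROM t WHERE t.v_min <= :v_min_1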
pixeltable/type_system.py
CHANGED
@@ -25,6 +25,7 @@ import sqlalchemy as sql
 from typing_extensions import _AnnotatedAlias
 
 import pixeltable.exceptions as excs
+from pixeltable.env import Env
 from pixeltable.utils import parse_local_file_path
 
 
@@ -673,8 +674,9 @@ class TimestampType(ColumnType):
     def _create_literal(self, val: Any) -> Any:
         if isinstance(val, str):
             return datetime.datetime.fromisoformat(val)
-
-
+        # Place naive timestamps in the default time zone
+        if isinstance(val, datetime.datetime) and val.tzinfo is None:
+            return val.replace(tzinfo=Env.get().default_time_zone)
         return val
 
 
@@ -760,7 +762,7 @@ class JsonType(ColumnType):
 
     @classmethod
     def __is_valid_json(cls, val: Any) -> bool:
-        if val is None or isinstance(val, (str, int, float, bool)):
+        if val is None or isinstance(val, (str, int, float, bool, np.ndarray, PIL.Image.Image)):
            return True
        if isinstance(val, (list, tuple)):
            return all(cls.__is_valid_json(v) for v in val)
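A small sketch of the new TimestampType literal behavior above, assuming the configured default time zone is America/Los_Angeles (the actual zone comes from Env.get().default_time_zone):

from datetime import datetime
from zoneinfo import ZoneInfo

default_tz = ZoneInfo('America/Los_Angeles')  # stand-in for Env.get().default_time_zone

val = datetime(2024, 7, 1, 12, 0)  # naive timestamp
if val.tzinfo is None:
    val = val.replace(tzinfo=default_tz)
print(val.isoformat())  # 2024-07-01T12:00:00-07:00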
pixeltable/utils/arrow.py
CHANGED
@@ -16,7 +16,7 @@ if TYPE_CHECKING:
 PA_TO_PXT_TYPES: dict[pa.DataType, ts.ColumnType] = {
     pa.string(): ts.StringType(nullable=True),
     pa.large_string(): ts.StringType(nullable=True),
-    pa.timestamp('us', tz=
+    pa.timestamp('us', tz='UTC'): ts.TimestampType(nullable=True),
     pa.bool_(): ts.BoolType(nullable=True),
     pa.int8(): ts.IntType(nullable=True),
     pa.int16(): ts.IntType(nullable=True),
@@ -35,7 +35,7 @@ PA_TO_PXT_TYPES: dict[pa.DataType, ts.ColumnType] = {
 
 PXT_TO_PA_TYPES: dict[type[ts.ColumnType], pa.DataType] = {
     ts.StringType: pa.string(),
-    ts.TimestampType: pa.timestamp('us', tz=
+    ts.TimestampType: pa.timestamp('us', tz='UTC'),  # postgres timestamp is microseconds
     ts.DateType: pa.date32(),  # This could be date64
     ts.BoolType: pa.bool_(),
     ts.IntType: pa.int64(),
@@ -61,7 +61,7 @@ def to_pixeltable_type(arrow_type: pa.DataType, nullable: bool) -> Optional[ts.C
         dtype = to_pixeltable_type(arrow_type.value_type, nullable)
         if dtype is None:
             return None
-        return ts.ArrayType(shape=arrow_type.shape, dtype=dtype, nullable=nullable)
+        return ts.ArrayType(shape=tuple(arrow_type.shape), dtype=dtype, nullable=nullable)
     else:
         return None
 
@@ -92,7 +92,7 @@ def to_pxt_schema(
 
 
 def to_arrow_schema(pixeltable_schema: dict[str, Any]) -> pa.Schema:
-    return pa.schema((name, to_arrow_type(typ)) for name, typ in pixeltable_schema.items())
+    return pa.schema((name, to_arrow_type(typ)) for name, typ in pixeltable_schema.items())
 
 
 def _to_record_batch(column_vals: dict[str, list[Any]], schema: pa.Schema) -> pa.RecordBatch:
@@ -106,7 +106,7 @@ def _to_record_batch(column_vals: dict[str, list[Any]], schema: pa.Schema) -> pa
         else:
             pa_array = cast(pa.Array, pa.array(column_vals[field.name]))
         pa_arrays.append(pa_array)
-    return pa.RecordBatch.from_arrays(pa_arrays, schema=schema)
+    return pa.RecordBatch.from_arrays(pa_arrays, schema=schema)
 
 
 def to_record_batches(df: 'pxt.DataFrame', batch_size_bytes: int) -> Iterator[pa.RecordBatch]:
@@ -192,7 +192,7 @@ def to_pydict(batch: pa.Table | pa.RecordBatch) -> dict[str, list | np.ndarray]:
         col = batch.column(k)
         if isinstance(col.type, pa.FixedShapeTensorType):
             # treat array columns as numpy arrays to easily preserve numpy type
-            out[name] = col.to_numpy(zero_copy_only=False)
+            out[name] = col.to_numpy(zero_copy_only=False)
         else:
             # for the rest, use pydict to preserve python types
             out[name] = col.to_pylist()
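The tuple(...) wrapper in the to_pixeltable_type change above matters because pyarrow reports a fixed-shape tensor's shape as a Python list, and the conversion normalizes it to the tuple form ts.ArrayType uses. A quick check:

import pyarrow as pa

t = pa.fixed_shape_tensor(pa.float32(), [224, 224, 3])
print(t.shape)         # [224, 224, 3]  (a list)
print(tuple(t.shape))  # (224, 224, 3)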
pixeltable/utils/av.py
CHANGED
@@ -3,6 +3,8 @@ from typing import Any
 import av
 import av.stream
 
+from pixeltable.env import Env
+
 
 def get_metadata(path: str) -> dict:
     with av.open(path) as container:
@@ -109,3 +111,66 @@ def ffmpeg_clip_cmd(input_path: str, output_path: str, start_time: float, durati
         ]
     )
     return cmd
+
+
+def ffmpeg_segment_cmd(
+    input_path: str,
+    output_pattern: str,
+    segment_duration: float | None = None,
+    segment_times: list[float] | None = None,
+    video_encoder: str | None = None,
+    video_encoder_args: dict[str, Any] | None = None,
+) -> list[str]:
+    """Commandline for frame-accurate segmentation"""
+    assert (segment_duration is None) != (segment_times is None)
+    if video_encoder is None:
+        video_encoder = Env.get().default_video_encoder
+
+    cmd = [
+        'ffmpeg',
+        '-i',
+        input_path,
+        '-f',
+        'segment',  # Use segment muxer
+    ]
+
+    if segment_duration is not None:
+        cmd.extend(
+            [
+                '-segment_time',
+                str(segment_duration),  # Target segment duration
+                '-break_non_keyframes',
+                '1',  # need to break at non-keyframes to get frame-accurate segments
+                '-force_key_frames',
+                f'expr:gte(t,n_forced*{segment_duration})',  # Force keyframe at each segment boundary
+            ]
+        )
+    else:
+        assert segment_times is not None
+        times_str = ','.join([str(t) for t in segment_times])
+        cmd.extend(['-segment_times', times_str, '-force_key_frames', times_str])
+
+    cmd.extend(
+        [
+            '-reset_timestamps',
+            '1',  # Reset timestamps for each segment
+            '-map',
+            '0',  # Copy all streams from input
+            '-c:a',
+            'copy',  # don't re-encode audio
+            '-c:v',
+            video_encoder,  # re-encode video
+        ]
+    )
+    if video_encoder_args is not None:
+        for k, v in video_encoder_args.items():
+            cmd.extend([f'-{k}', str(v)])
+
+    cmd.extend(
+        [
+            '-loglevel',
+            'error',  # Only show errors
+            output_pattern,
+        ]
+    )
+    return cmd
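For example, splitting a clip into roughly 10-second, frame-accurate segments with the new helper might look like this. The file names are illustrative, and running the command assumes ffmpeg is on PATH:

import subprocess

from pixeltable.utils.av import ffmpeg_segment_cmd

cmd = ffmpeg_segment_cmd(
    'in.mp4', 'out_%03d.mp4', segment_duration=10.0,
    video_encoder='libx264', video_encoder_args={'crf': 23},
)
print(' '.join(cmd))
# ffmpeg -i in.mp4 -f segment -segment_time 10.0 -break_non_keyframes 1
#   -force_key_frames expr:gte(t,n_forced*10.0) -reset_timestamps 1 -map 0
#   -c:a copy -c:v libx264 -crf 23 -loglevel error out_%03d.mp4
subprocess.run(cmd, check=True)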
pixeltable/utils/console_output.py
CHANGED
@@ -1,6 +1,8 @@
 import logging
 from typing import TextIO
 
+from pixeltable import exceptions as excs
+
 
 def map_level(verbosity: int) -> int:
     """
@@ -19,7 +21,8 @@ def map_level(verbosity: int) -> int:
         return logging.INFO
     if verbosity == 2:
         return logging.DEBUG
-
+
+    raise excs.Error(f'Invalid verbosity level: {verbosity}')
 
 
 class ConsoleOutputHandler(logging.StreamHandler):
pixeltable/utils/exception_handler.py
CHANGED
@@ -1,32 +1,9 @@
 import logging
-import sys
 from typing import Any, Callable, Optional, TypeVar
 
 R = TypeVar('R')
 
-
-def _is_in_exception() -> bool:
-    """
-    Check if code is currently executing within an exception context.
-    """
-    current_exception = sys.exc_info()[1]
-    return current_exception is not None
-
-
-def run_cleanup_on_exception(cleanup_func: Callable[..., R], *args: Any, **kwargs: Any) -> Optional[R]:
-    """
-    Runs cleanup only when running in exception context.
-
-    The function `run_cleanup_on_exception()` should be used to clean up resources when an operation fails.
-    This is typically done using a try, except, and finally block, with the resource cleanup logic placed within
-    the except block. However, this pattern may not handle KeyboardInterrupt exceptions.
-    To ensure that resources are always cleaned up at least once when an exception or KeyboardInterrupt occurs,
-    create an idempotent function for cleaning up resources and pass it to the `run_cleanup_on_exception()` function
-    from the finally block.
-    """
-    if _is_in_exception():
-        return run_cleanup(cleanup_func, *args, raise_error=False, **kwargs)
-    return None
+logger = logging.getLogger('pixeltable')
 
 
 def run_cleanup(cleanup_func: Callable[..., R], *args: Any, raise_error: bool = True, **kwargs: Any) -> Optional[R]:
@@ -40,20 +17,20 @@ def run_cleanup(cleanup_func: Callable[..., R], *args: Any, raise_error: bool =
         raise_error: raise an exception if an error occurs during cleanup.
     """
     try:
-
+        logger.debug(f'Running cleanup function: {cleanup_func.__name__!r}')
         return cleanup_func(*args, **kwargs)
     except KeyboardInterrupt as interrupt:
         # Save original exception and re-attempt cleanup
         original_exception = interrupt
-
+        logger.debug(f'Cleanup {cleanup_func.__name__!r} interrupted, retrying')
         try:
             return cleanup_func(*args, **kwargs)
         except Exception as e:
             # Suppress this exception
-
+            logger.error(f'Cleanup {cleanup_func.__name__!r} failed with exception {e.__class__}: {e}')
             raise KeyboardInterrupt from original_exception
     except Exception as e:
-
+        logger.error(f'Cleanup {cleanup_func.__name__!r} failed with exception {e.__class__}: {e}')
         if raise_error:
             raise e
         return None
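With run_cleanup_on_exception removed, callers use run_cleanup directly. A hedged usage sketch; drop_temp_table is a made-up stand-in for idempotent cleanup logic:

from pixeltable.utils.exception_handler import run_cleanup

def drop_temp_table() -> None:
    print('dropping temp table')  # stand-in for real, idempotent cleanup work

# With raise_error=False, a failure inside the cleanup function is logged via the
# new module-level logger and suppressed rather than propagated.
run_cleanup(drop_temp_table, raise_error=False)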
pixeltable/utils/object_stores.py
CHANGED
@@ -22,6 +22,7 @@ class StorageTarget(enum.Enum):
     LOCAL_STORE = 'os'  # Local file system
     S3_STORE = 's3'  # Amazon S3
     R2_STORE = 'r2'  # Cloudflare R2
+    B2_STORE = 'b2'  # Backblaze B2
     GCS_STORE = 'gs'  # Google Cloud Storage
     AZURE_STORE = 'az'  # Azure Blob Storage
     HTTP_STORE = 'http'  # HTTP/HTTPS
@@ -63,6 +64,7 @@ class StorageObjectAddress(NamedTuple):
         StorageTarget.LOCAL_STORE,
         StorageTarget.S3_STORE,
         StorageTarget.R2_STORE,
+        StorageTarget.B2_STORE,
         StorageTarget.GCS_STORE,
         StorageTarget.AZURE_STORE,
         StorageTarget.HTTP_STORE,
@@ -218,15 +220,23 @@ class ObjectPath:
         # Standard HTTP(S) URL format
         # https://account.blob.core.windows.net/container/<optional path>/<optional object>
         # https://account.r2.cloudflarestorage.com/container/<optional path>/<optional object>
+        # https://s3.us-west-004.backblazeb2.com/container/<optional path>/<optional object>
         # and possibly others
         key = parsed.path
         if 'cloudflare' in parsed.netloc:
             storage_target = StorageTarget.R2_STORE
+        elif 'backblazeb2' in parsed.netloc:
+            storage_target = StorageTarget.B2_STORE
         elif 'windows' in parsed.netloc:
             storage_target = StorageTarget.AZURE_STORE
         else:
             storage_target = StorageTarget.HTTP_STORE
-        if storage_target in
+        if storage_target in (
+            StorageTarget.S3_STORE,
+            StorageTarget.AZURE_STORE,
+            StorageTarget.R2_STORE,
+            StorageTarget.B2_STORE,
+        ):
             account_name = parsed.netloc.split('.', 1)[0]
             account_extension = parsed.netloc.split('.', 1)[1]
             path_parts = key.lstrip('/').split('/', 1)
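A standalone sketch of the hostname dispatch shown above (not the ObjectPath API itself), demonstrating that Backblaze B2's S3-compatible endpoints are now recognized:

import urllib.parse

url = 'https://s3.us-west-004.backblazeb2.com/my-bucket/videos/clip.mp4'
netloc = urllib.parse.urlparse(url).netloc
if 'cloudflare' in netloc:
    target = 'r2'
elif 'backblazeb2' in netloc:
    target = 'b2'
elif 'windows' in netloc:
    target = 'az'
else:
    target = 'http'
print(target)  # b2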
@@ -370,6 +380,11 @@ class ObjectOps:
             env.Env.get().require_package('boto3')
             from pixeltable.utils.s3_store import S3Store
 
+            return S3Store(soa)
+        if soa.storage_target == StorageTarget.B2_STORE:
+            env.Env.get().require_package('boto3')
+            from pixeltable.utils.s3_store import S3Store
+
             return S3Store(soa)
         if soa.storage_target == StorageTarget.GCS_STORE and soa.scheme == 'gs':
             env.Env.get().require_package('google.cloud.storage')