pixeltable-0.4.1-py3-none-any.whl → pixeltable-0.4.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +1 -0
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +3 -10
- pixeltable/catalog/catalog.py +139 -59
- pixeltable/catalog/column.py +32 -23
- pixeltable/catalog/globals.py +2 -45
- pixeltable/catalog/insertable_table.py +5 -2
- pixeltable/catalog/path.py +6 -0
- pixeltable/catalog/table.py +173 -23
- pixeltable/catalog/table_version.py +156 -92
- pixeltable/catalog/table_version_handle.py +26 -1
- pixeltable/catalog/update_status.py +179 -0
- pixeltable/catalog/view.py +12 -3
- pixeltable/config.py +76 -12
- pixeltable/dataframe.py +1 -1
- pixeltable/env.py +29 -0
- pixeltable/exec/exec_node.py +7 -24
- pixeltable/exec/expr_eval/schedulers.py +134 -7
- pixeltable/exprs/column_property_ref.py +23 -20
- pixeltable/exprs/column_ref.py +24 -18
- pixeltable/exprs/data_row.py +9 -0
- pixeltable/exprs/function_call.py +2 -2
- pixeltable/exprs/row_builder.py +46 -14
- pixeltable/exprs/rowid_ref.py +0 -4
- pixeltable/func/function.py +3 -3
- pixeltable/functions/audio.py +36 -9
- pixeltable/functions/video.py +57 -10
- pixeltable/globals.py +61 -1
- pixeltable/io/__init__.py +1 -1
- pixeltable/io/external_store.py +39 -64
- pixeltable/io/globals.py +4 -4
- pixeltable/io/hf_datasets.py +10 -2
- pixeltable/io/label_studio.py +52 -48
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_38.py +39 -0
- pixeltable/metadata/converters/convert_39.py +125 -0
- pixeltable/metadata/converters/util.py +3 -0
- pixeltable/metadata/notes.py +2 -0
- pixeltable/metadata/schema.py +14 -2
- pixeltable/metadata/utils.py +78 -0
- pixeltable/plan.py +26 -18
- pixeltable/share/packager.py +20 -38
- pixeltable/store.py +121 -142
- pixeltable/type_system.py +2 -2
- pixeltable/utils/coroutine.py +6 -23
- pixeltable/utils/media_store.py +39 -0
- {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/METADATA +1 -1
- {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/RECORD +51 -47
- {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/LICENSE +0 -0
- {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/entry_points.txt +0 -0
|
@@ -26,6 +26,7 @@ class ColumnPropertyRef(Expr):
|
|
|
26
26
|
ERRORMSG = 1
|
|
27
27
|
FILEURL = 2
|
|
28
28
|
LOCALPATH = 3
|
|
29
|
+
CELLMD = 4 # JSON metadata for the cell, e.g. errortype, errormsg for media columns
|
|
29
30
|
|
|
30
31
|
def __init__(self, col_ref: ColumnRef, prop: Property):
|
|
31
32
|
super().__init__(ts.StringType(nullable=True))
|
|
@@ -51,42 +52,39 @@ class ColumnPropertyRef(Expr):
|
|
|
51
52
|
def __repr__(self) -> str:
|
|
52
53
|
return f'{self._col_ref}.{self.prop.name.lower()}'
|
|
53
54
|
|
|
54
|
-
def
|
|
55
|
-
return self.prop in (self.Property.ERRORTYPE, self.Property.ERRORMSG)
|
|
55
|
+
def is_cellmd_prop(self) -> bool:
|
|
56
|
+
return self.prop in (self.Property.ERRORTYPE, self.Property.ERRORMSG, self.Property.CELLMD)
|
|
56
57
|
|
|
57
58
|
def sql_expr(self, sql_elements: SqlElementCache) -> Optional[sql.ColumnElement]:
|
|
58
|
-
if not self._col_ref.
|
|
59
|
+
if not self._col_ref.col_handle.get().is_stored:
|
|
59
60
|
return None
|
|
60
|
-
|
|
61
|
-
# we need to reestablish that we have the correct Column instance, there could have been a metadata
|
|
62
|
-
# reload since init()
|
|
63
|
-
# TODO: add an explicit prepare phase (ie, Expr.prepare()) that gives every subclass instance a chance to
|
|
64
|
-
# perform runtime checks and update state
|
|
65
|
-
tv = self._col_ref.tbl_version.get()
|
|
66
|
-
assert tv.is_validated
|
|
67
|
-
# we can assume at this point during query execution that the column exists
|
|
68
|
-
assert self._col_ref.col_id in tv.cols_by_id
|
|
69
|
-
col = tv.cols_by_id[self._col_ref.col_id]
|
|
61
|
+
col = self._col_ref.col_handle.get()
|
|
70
62
|
|
|
71
63
|
# the errortype/-msg properties of a read-validated media column need to be extracted from the DataRow
|
|
72
64
|
if (
|
|
73
65
|
col.col_type.is_media_type()
|
|
74
66
|
and col.media_validation == catalog.MediaValidation.ON_READ
|
|
75
|
-
and self.
|
|
67
|
+
and self.is_cellmd_prop()
|
|
76
68
|
):
|
|
77
69
|
return None
|
|
78
70
|
|
|
79
71
|
if self.prop == self.Property.ERRORTYPE:
|
|
80
|
-
|
|
81
|
-
return col.sa_errortype_col
|
|
72
|
+
return col.sa_cellmd_col.op('->>')('errortype')
|
|
82
73
|
if self.prop == self.Property.ERRORMSG:
|
|
83
|
-
|
|
84
|
-
|
|
74
|
+
return col.sa_cellmd_col.op('->>')('errormsg')
|
|
75
|
+
if self.prop == self.Property.CELLMD:
|
|
76
|
+
assert col.sa_cellmd_col is not None
|
|
77
|
+
return col.sa_cellmd_col
|
|
85
78
|
if self.prop == self.Property.FILEURL:
|
|
86
79
|
# the file url is stored as the column value
|
|
87
80
|
return sql_elements.get(self._col_ref)
|
|
88
81
|
return None
|
|
89
82
|
|
|
83
|
+
@classmethod
|
|
84
|
+
def create_cellmd_exc(cls, exc: Exception) -> dict[str, str]:
|
|
85
|
+
"""Create a cellmd value from an exception."""
|
|
86
|
+
return {'errortype': type(exc).__name__, 'errormsg': str(exc)}
|
|
87
|
+
|
|
90
88
|
def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
|
|
91
89
|
if self.prop == self.Property.FILEURL:
|
|
92
90
|
assert data_row.has_val[self._col_ref.slot_idx]
|
|
@@ -96,14 +94,19 @@ class ColumnPropertyRef(Expr):
|
|
|
96
94
|
assert data_row.has_val[self._col_ref.slot_idx]
|
|
97
95
|
data_row[self.slot_idx] = data_row.file_paths[self._col_ref.slot_idx]
|
|
98
96
|
return
|
|
99
|
-
elif self.
|
|
97
|
+
elif self.is_cellmd_prop():
|
|
100
98
|
exc = data_row.get_exc(self._col_ref.slot_idx)
|
|
101
99
|
if exc is None:
|
|
102
100
|
data_row[self.slot_idx] = None
|
|
103
101
|
elif self.prop == self.Property.ERRORTYPE:
|
|
104
102
|
data_row[self.slot_idx] = type(exc).__name__
|
|
105
|
-
|
|
103
|
+
elif self.prop == self.Property.ERRORMSG:
|
|
106
104
|
data_row[self.slot_idx] = str(exc)
|
|
105
|
+
elif self.prop == self.Property.CELLMD:
|
|
106
|
+
data_row[self.slot_idx] = self.create_cellmd_exc(exc)
|
|
107
|
+
else:
|
|
108
|
+
raise AssertionError(f'Unknown property {self.prop}')
|
|
109
|
+
return
|
|
107
110
|
else:
|
|
108
111
|
raise AssertionError()
|
|
109
112
|
|
pixeltable/exprs/column_ref.py
CHANGED
|
@@ -10,6 +10,7 @@ import pixeltable as pxt
|
|
|
10
10
|
from pixeltable import catalog, exceptions as excs, iterators as iters
|
|
11
11
|
|
|
12
12
|
from ..utils.description_helper import DescriptionHelper
|
|
13
|
+
from ..utils.filecache import FileCache
|
|
13
14
|
from .data_row import DataRow
|
|
14
15
|
from .expr import Expr
|
|
15
16
|
from .row_builder import RowBuilder
|
|
@@ -41,7 +42,8 @@ class ColumnRef(Expr):
|
|
|
41
42
|
insert them into the EvalCtxs as needed
|
|
42
43
|
"""
|
|
43
44
|
|
|
44
|
-
col: catalog.Column
|
|
45
|
+
col: catalog.Column # TODO: merge with col_handle
|
|
46
|
+
col_handle: catalog.ColumnHandle
|
|
45
47
|
reference_tbl: Optional[catalog.TableVersionPath]
|
|
46
48
|
is_unstored_iter_col: bool
|
|
47
49
|
iter_arg_ctx: Optional[RowBuilder.EvalCtx]
|
|
@@ -52,10 +54,6 @@ class ColumnRef(Expr):
|
|
|
52
54
|
id: int
|
|
53
55
|
perform_validation: bool # if True, performs media validation
|
|
54
56
|
|
|
55
|
-
# needed by sql_expr() to re-resolve Column instance after a metadata reload
|
|
56
|
-
tbl_version: catalog.TableVersionHandle
|
|
57
|
-
col_id: int
|
|
58
|
-
|
|
59
57
|
def __init__(
|
|
60
58
|
self,
|
|
61
59
|
col: catalog.Column,
|
|
@@ -66,8 +64,7 @@ class ColumnRef(Expr):
|
|
|
66
64
|
assert col.tbl is not None
|
|
67
65
|
self.col = col
|
|
68
66
|
self.reference_tbl = reference_tbl
|
|
69
|
-
self.
|
|
70
|
-
self.col_id = col.id
|
|
67
|
+
self.col_handle = catalog.ColumnHandle(col.tbl.handle, col.id)
|
|
71
68
|
|
|
72
69
|
self.is_unstored_iter_col = col.tbl.is_component_view and col.tbl.is_iterator_column(col) and not col.is_stored
|
|
73
70
|
self.iter_arg_ctx = None
|
|
@@ -118,11 +115,15 @@ class ColumnRef(Expr):
|
|
|
118
115
|
from .column_property_ref import ColumnPropertyRef
|
|
119
116
|
|
|
120
117
|
# resolve column properties
|
|
118
|
+
if name == ColumnPropertyRef.Property.CELLMD.name.lower():
|
|
119
|
+
# This is not user accessible, but used internally to store cell metadata
|
|
120
|
+
return super().__getattr__(name)
|
|
121
|
+
|
|
121
122
|
if (
|
|
122
123
|
name == ColumnPropertyRef.Property.ERRORTYPE.name.lower()
|
|
123
124
|
or name == ColumnPropertyRef.Property.ERRORMSG.name.lower()
|
|
124
125
|
):
|
|
125
|
-
property_is_present = self.col.
|
|
126
|
+
property_is_present = self.col.stores_cellmd
|
|
126
127
|
if not property_is_present:
|
|
127
128
|
raise excs.Error(f'{name} only valid for a stored computed or media column: {self}')
|
|
128
129
|
return ColumnPropertyRef(self, ColumnPropertyRef.Property[name.upper()])
|
|
@@ -170,6 +171,20 @@ class ColumnRef(Expr):
|
|
|
170
171
|
idx_info = embedding_idx_info
|
|
171
172
|
return idx_info
|
|
172
173
|
|
|
174
|
+
def recompute(self, *, cascade: bool = True, errors_only: bool = False) -> catalog.UpdateStatus:
|
|
175
|
+
cat = catalog.Catalog.get()
|
|
176
|
+
# lock_mutable_tree=True: we need to be able to see whether any transitive view has column dependents
|
|
177
|
+
with cat.begin_xact(tbl=self.reference_tbl, for_write=True, lock_mutable_tree=True):
|
|
178
|
+
tbl_version = self.col_handle.tbl_version.get()
|
|
179
|
+
if tbl_version.id != self.reference_tbl.tbl_id:
|
|
180
|
+
raise excs.Error('Cannot recompute column of a base.')
|
|
181
|
+
if tbl_version.is_snapshot:
|
|
182
|
+
raise excs.Error('Cannot recompute column of a snapshot.')
|
|
183
|
+
col_name = self.col_handle.get().name
|
|
184
|
+
status = tbl_version.recompute_columns([col_name], errors_only=errors_only, cascade=cascade)
|
|
185
|
+
FileCache.get().emit_eviction_warnings()
|
|
186
|
+
return status
|
|
187
|
+
|
|
173
188
|
def similarity(self, item: Any, *, idx: Optional[str] = None) -> Expr:
|
|
174
189
|
from .similarity_expr import SimilarityExpr
|
|
175
190
|
|
|
@@ -241,16 +256,7 @@ class ColumnRef(Expr):
|
|
|
241
256
|
def sql_expr(self, _: SqlElementCache) -> Optional[sql.ColumnElement]:
|
|
242
257
|
if self.perform_validation:
|
|
243
258
|
return None
|
|
244
|
-
|
|
245
|
-
# reload since init()
|
|
246
|
-
# TODO: add an explicit prepare phase (ie, Expr.prepare()) that gives every subclass instance a chance to
|
|
247
|
-
# perform runtime checks and update state
|
|
248
|
-
tv = self.tbl_version.get()
|
|
249
|
-
assert tv.is_validated
|
|
250
|
-
# we can assume at this point during query execution that the column exists
|
|
251
|
-
assert self.col_id in tv.cols_by_id
|
|
252
|
-
self.col = tv.cols_by_id[self.col_id]
|
|
253
|
-
assert self.col.tbl is tv
|
|
259
|
+
self.col = self.col_handle.get()
|
|
254
260
|
return self.col.sa_col
|
|
255
261
|
|
|
256
262
|
def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
|
pixeltable/exprs/data_row.py
CHANGED
|
@@ -42,6 +42,10 @@ class DataRow:
|
|
|
42
42
|
has_val: np.ndarray # of bool
|
|
43
43
|
excs: np.ndarray # of object
|
|
44
44
|
|
|
45
|
+
# If `may_have_exc` is False, then we guarantee that no slot has an exception set. This is used to optimize
|
|
46
|
+
# exception handling under normal operation.
|
|
47
|
+
_may_have_exc: bool
|
|
48
|
+
|
|
45
49
|
# expr evaluation state; indexed by slot idx
|
|
46
50
|
missing_slots: np.ndarray # of bool; number of missing dependencies
|
|
47
51
|
missing_dependents: np.ndarray # of int16; number of missing dependents
|
|
@@ -90,6 +94,7 @@ class DataRow:
|
|
|
90
94
|
self.vals = np.full(num_slots, None, dtype=object)
|
|
91
95
|
self.has_val = np.zeros(num_slots, dtype=bool)
|
|
92
96
|
self.excs = np.full(num_slots, None, dtype=object)
|
|
97
|
+
self._may_have_exc = False
|
|
93
98
|
self.missing_slots = np.zeros(num_slots, dtype=bool)
|
|
94
99
|
self.missing_dependents = np.zeros(num_slots, dtype=np.int16)
|
|
95
100
|
self.is_scheduled = np.zeros(num_slots, dtype=bool)
|
|
@@ -136,6 +141,9 @@ class DataRow:
|
|
|
136
141
|
"""
|
|
137
142
|
Returns True if an exception has been set for the given slot index, or for any slot index if slot_idx is None
|
|
138
143
|
"""
|
|
144
|
+
if not self._may_have_exc:
|
|
145
|
+
return False
|
|
146
|
+
|
|
139
147
|
if slot_idx is not None:
|
|
140
148
|
return self.excs[slot_idx] is not None
|
|
141
149
|
return (self.excs != None).any()
|
|
@@ -154,6 +162,7 @@ class DataRow:
|
|
|
154
162
|
def set_exc(self, slot_idx: int, exc: Exception) -> None:
|
|
155
163
|
assert self.excs[slot_idx] is None
|
|
156
164
|
self.excs[slot_idx] = exc
|
|
165
|
+
self._may_have_exc = True
|
|
157
166
|
|
|
158
167
|
# an exception means the value is None
|
|
159
168
|
self.has_val[slot_idx] = True
|
|
@@ -446,11 +446,11 @@ class FunctionCall(Expr):
|
|
|
446
446
|
dedent(
|
|
447
447
|
f"""
|
|
448
448
|
The UDF '{fn.self_path}' cannot be located, because
|
|
449
|
-
{{
|
|
449
|
+
{{error_msg}}
|
|
450
450
|
"""
|
|
451
451
|
)
|
|
452
452
|
.strip()
|
|
453
|
-
.format(
|
|
453
|
+
.format(error_msg=fn.error_msg)
|
|
454
454
|
)
|
|
455
455
|
return cls(fn, args, kwargs, return_type, is_method_call=is_method_call, validation_error=validation_error)
|
|
456
456
|
|
pixeltable/exprs/row_builder.py
CHANGED
|
@@ -63,6 +63,7 @@ class RowBuilder:
|
|
|
63
63
|
|
|
64
64
|
input_exprs: ExprSet
|
|
65
65
|
|
|
66
|
+
tbl: Optional[catalog.TableVersion] # reference table of the RowBuilder; used to identify pk columns for writes
|
|
66
67
|
table_columns: list[ColumnSlotIdx]
|
|
67
68
|
default_eval_ctx: EvalCtx
|
|
68
69
|
unstored_iter_args: dict[UUID, Expr]
|
|
@@ -93,7 +94,13 @@ class RowBuilder:
|
|
|
93
94
|
target_slot_idxs: list[int] # slot idxs of target exprs; might contain duplicates
|
|
94
95
|
target_exprs: list[Expr] # exprs corresponding to target_slot_idxs
|
|
95
96
|
|
|
96
|
-
def __init__(
|
|
97
|
+
def __init__(
|
|
98
|
+
self,
|
|
99
|
+
output_exprs: Sequence[Expr],
|
|
100
|
+
columns: Sequence[catalog.Column],
|
|
101
|
+
input_exprs: Iterable[Expr],
|
|
102
|
+
tbl: Optional[catalog.TableVersion] = None,
|
|
103
|
+
):
|
|
97
104
|
"""
|
|
98
105
|
Args:
|
|
99
106
|
output_exprs: list of Exprs to be evaluated
|
|
@@ -125,6 +132,7 @@ class RowBuilder:
|
|
|
125
132
|
# * further references to that column (eg, computed cols) need to resolve to the validating ColumnRef
|
|
126
133
|
from .column_ref import ColumnRef
|
|
127
134
|
|
|
135
|
+
self.tbl = tbl
|
|
128
136
|
self.table_columns: list[ColumnSlotIdx] = []
|
|
129
137
|
self.input_exprs = ExprSet()
|
|
130
138
|
validating_colrefs: dict[Expr, Expr] = {} # key: non-validating colref, value: corresp. validating colref
|
|
@@ -201,7 +209,7 @@ class RowBuilder:
|
|
|
201
209
|
# this is input and therefore doesn't depend on other exprs
|
|
202
210
|
continue
|
|
203
211
|
# error properties don't have exceptions themselves
|
|
204
|
-
if isinstance(expr, ColumnPropertyRef) and expr.
|
|
212
|
+
if isinstance(expr, ColumnPropertyRef) and expr.is_cellmd_prop():
|
|
205
213
|
continue
|
|
206
214
|
dependency_idxs = [d.slot_idx for d in expr.dependencies()]
|
|
207
215
|
self.dependencies[expr.slot_idx, dependency_idxs] = True
|
|
@@ -229,6 +237,7 @@ class RowBuilder:
|
|
|
229
237
|
|
|
230
238
|
def add_table_column(self, col: catalog.Column, slot_idx: int) -> None:
|
|
231
239
|
"""Record a column that is part of the table row"""
|
|
240
|
+
assert self.tbl is not None
|
|
232
241
|
self.table_columns.append(ColumnSlotIdx(col, slot_idx))
|
|
233
242
|
|
|
234
243
|
def output_slot_idxs(self) -> list[ColumnSlotIdx]:
|
|
@@ -427,33 +436,56 @@ class RowBuilder:
|
|
|
427
436
|
expr, f'expression {expr}', data_row.get_exc(expr.slot_idx), exc_tb, input_vals, 0
|
|
428
437
|
) from exc
|
|
429
438
|
|
|
430
|
-
def create_table_row(
|
|
439
|
+
def create_table_row(
|
|
440
|
+
self, data_row: DataRow, cols_with_excs: Optional[set[int]], pk: tuple[int, ...]
|
|
441
|
+
) -> tuple[list[Any], int]:
|
|
431
442
|
"""Create a table row from the slots that have an output column assigned
|
|
432
443
|
|
|
433
|
-
Return tuple[
|
|
444
|
+
Return tuple[list of row values in `self.table_columns` order, # of exceptions]
|
|
434
445
|
This excludes system columns.
|
|
435
446
|
"""
|
|
447
|
+
from pixeltable.exprs.column_property_ref import ColumnPropertyRef
|
|
448
|
+
|
|
436
449
|
num_excs = 0
|
|
437
|
-
table_row:
|
|
450
|
+
table_row: list[Any] = list(pk)
|
|
438
451
|
for info in self.table_columns:
|
|
439
452
|
col, slot_idx = info.col, info.slot_idx
|
|
440
453
|
if data_row.has_exc(slot_idx):
|
|
441
|
-
# exceptions get stored in the errortype/-msg columns
|
|
442
454
|
exc = data_row.get_exc(slot_idx)
|
|
443
455
|
num_excs += 1
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
table_row
|
|
447
|
-
|
|
456
|
+
if cols_with_excs is not None:
|
|
457
|
+
cols_with_excs.add(col.id)
|
|
458
|
+
table_row.append(None)
|
|
459
|
+
if col.stores_cellmd:
|
|
460
|
+
# exceptions get stored in the errortype/-msg properties of the cellmd column
|
|
461
|
+
table_row.append(ColumnPropertyRef.create_cellmd_exc(exc))
|
|
448
462
|
else:
|
|
449
463
|
if col.col_type.is_image_type() and data_row.file_urls[slot_idx] is None:
|
|
450
464
|
# we have yet to store this image
|
|
451
465
|
filepath = str(MediaStore.prepare_media_path(col.tbl.id, col.id, col.tbl.version))
|
|
452
466
|
data_row.flush_img(slot_idx, filepath)
|
|
453
467
|
val = data_row.get_stored_val(slot_idx, col.get_sa_col_type())
|
|
454
|
-
table_row
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
table_row[col.errormsg_store_name()] = None
|
|
468
|
+
table_row.append(val)
|
|
469
|
+
if col.stores_cellmd:
|
|
470
|
+
table_row.append(None) # placeholder for cellmd column
|
|
458
471
|
|
|
459
472
|
return table_row, num_excs
|
|
473
|
+
|
|
474
|
+
def store_column_names(self) -> tuple[list[str], dict[int, catalog.Column]]:
|
|
475
|
+
"""
|
|
476
|
+
Returns the list of store column names corresponding to the table_columns of this RowBuilder.
|
|
477
|
+
The second tuple element of the return value is a dictionary containing all media columns in the
|
|
478
|
+
table; it's the mapping {list_index: column}.
|
|
479
|
+
"""
|
|
480
|
+
assert self.tbl is not None, self.table_columns
|
|
481
|
+
store_col_names: list[str] = [pk_col.name for pk_col in self.tbl.store_tbl.pk_columns()]
|
|
482
|
+
media_cols: dict[int, catalog.Column] = {}
|
|
483
|
+
|
|
484
|
+
for col in self.table_columns:
|
|
485
|
+
if col.col.col_type.is_media_type():
|
|
486
|
+
media_cols[len(store_col_names)] = col.col
|
|
487
|
+
store_col_names.append(col.col.store_name())
|
|
488
|
+
if col.col.stores_cellmd:
|
|
489
|
+
store_col_names.append(col.col.cellmd_store_name())
|
|
490
|
+
|
|
491
|
+
return store_col_names, media_cols
|
pixeltable/exprs/rowid_ref.py
CHANGED
|
@@ -105,10 +105,6 @@ class RowidRef(Expr):
|
|
|
105
105
|
assert self.rowid_component_idx <= len(rowid_cols), (
|
|
106
106
|
f'{self.rowid_component_idx} not consistent with {rowid_cols}'
|
|
107
107
|
)
|
|
108
|
-
# _logger.debug(
|
|
109
|
-
# f'RowidRef.sql_expr: tbl={tbl.id}{tbl.effective_version} sa_tbl={id(tbl.store_tbl.sa_tbl):x} '
|
|
110
|
-
# f'tv={id(tbl):x}'
|
|
111
|
-
# )
|
|
112
108
|
return rowid_cols[self.rowid_component_idx]
|
|
113
109
|
|
|
114
110
|
def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
|
pixeltable/func/function.py
CHANGED
|
@@ -504,12 +504,12 @@ class Function(ABC):
|
|
|
504
504
|
|
|
505
505
|
class InvalidFunction(Function):
|
|
506
506
|
fn_dict: dict[str, Any]
|
|
507
|
-
|
|
507
|
+
error_msg: str
|
|
508
508
|
|
|
509
|
-
def __init__(self, self_path: str, fn_dict: dict[str, Any],
|
|
509
|
+
def __init__(self, self_path: str, fn_dict: dict[str, Any], error_msg: str):
|
|
510
510
|
super().__init__([], self_path)
|
|
511
511
|
self.fn_dict = fn_dict
|
|
512
|
-
self.
|
|
512
|
+
self.error_msg = error_msg
|
|
513
513
|
|
|
514
514
|
def _as_dict(self) -> dict:
|
|
515
515
|
"""
|
pixeltable/functions/audio.py
CHANGED
|
@@ -1,14 +1,5 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Pixeltable [UDFs](https://pixeltable.readme.io/docs/user-defined-functions-udfs) for `AudioType`.
|
|
3
|
-
|
|
4
|
-
Example:
|
|
5
|
-
```python
|
|
6
|
-
import pixeltable as pxt
|
|
7
|
-
import pixeltable.functions as pxtf
|
|
8
|
-
|
|
9
|
-
t = pxt.get_table(...)
|
|
10
|
-
t.select(pxtf.audio.get_metadata()).collect()
|
|
11
|
-
```
|
|
12
3
|
"""
|
|
13
4
|
|
|
14
5
|
import pixeltable as pxt
|
|
@@ -19,6 +10,42 @@ from pixeltable.utils.code import local_public_names
|
|
|
19
10
|
def get_metadata(audio: pxt.Audio) -> dict:
|
|
20
11
|
"""
|
|
21
12
|
Gets various metadata associated with an audio file and returns it as a dictionary.
|
|
13
|
+
|
|
14
|
+
Args:
|
|
15
|
+
audio: The audio to get metadata for.
|
|
16
|
+
|
|
17
|
+
Returns:
|
|
18
|
+
A `dict` such as the following:
|
|
19
|
+
|
|
20
|
+
```json
|
|
21
|
+
{
|
|
22
|
+
'size': 2568827,
|
|
23
|
+
'streams': [
|
|
24
|
+
{
|
|
25
|
+
'type': 'audio',
|
|
26
|
+
'frames': 0,
|
|
27
|
+
'duration': 2646000,
|
|
28
|
+
'metadata': {},
|
|
29
|
+
'time_base': 2.2675736961451248e-05,
|
|
30
|
+
'codec_context': {
|
|
31
|
+
'name': 'flac',
|
|
32
|
+
'profile': None,
|
|
33
|
+
'channels': 1,
|
|
34
|
+
'codec_tag': '\\x00\\x00\\x00\\x00',
|
|
35
|
+
},
|
|
36
|
+
'duration_seconds': 60.0,
|
|
37
|
+
}
|
|
38
|
+
],
|
|
39
|
+
'bit_rate': 342510,
|
|
40
|
+
'metadata': {'encoder': 'Lavf61.1.100'},
|
|
41
|
+
'bit_exact': False,
|
|
42
|
+
}
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Examples:
|
|
46
|
+
Extract metadata for files in the `audio_col` column of the table `tbl`:
|
|
47
|
+
|
|
48
|
+
>>> tbl.select(tbl.audio_col.get_metadata()).collect()
|
|
22
49
|
"""
|
|
23
50
|
return pxt.functions.video._get_metadata(audio)
|
|
24
51
|
|
pixeltable/functions/video.py
CHANGED
|
@@ -1,14 +1,5 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Pixeltable [UDFs](https://pixeltable.readme.io/docs/user-defined-functions-udfs) for `VideoType`.
|
|
3
|
-
|
|
4
|
-
Example:
|
|
5
|
-
```python
|
|
6
|
-
import pixeltable as pxt
|
|
7
|
-
import pixeltable.functions as pxtf
|
|
8
|
-
|
|
9
|
-
t = pxt.get_table(...)
|
|
10
|
-
t.select(pxtf.video.extract_audio(t.video_col)).collect()
|
|
11
|
-
```
|
|
12
3
|
"""
|
|
13
4
|
|
|
14
5
|
import tempfile
|
|
@@ -92,12 +83,22 @@ def extract_audio(
|
|
|
92
83
|
video_path: pxt.Video, stream_idx: int = 0, format: str = 'wav', codec: Optional[str] = None
|
|
93
84
|
) -> pxt.Audio:
|
|
94
85
|
"""
|
|
95
|
-
Extract an audio stream from a video
|
|
86
|
+
Extract an audio stream from a video.
|
|
96
87
|
|
|
97
88
|
Args:
|
|
98
89
|
stream_idx: Index of the audio stream to extract.
|
|
99
90
|
format: The target audio format. (`'wav'`, `'mp3'`, `'flac'`).
|
|
100
91
|
codec: The codec to use for the audio stream. If not provided, a default codec will be used.
|
|
92
|
+
|
|
93
|
+
Returns:
|
|
94
|
+
The extracted audio.
|
|
95
|
+
|
|
96
|
+
Examples:
|
|
97
|
+
Add a computed column to a table `tbl` that extracts audio from an existing column `video_col`:
|
|
98
|
+
|
|
99
|
+
>>> tbl.add_computed_column(
|
|
100
|
+
... extracted_audio=tbl.video_col.extract_audio(format='flac')
|
|
101
|
+
... )
|
|
101
102
|
"""
|
|
102
103
|
if format not in _format_defaults:
|
|
103
104
|
raise ValueError(f'extract_audio(): unsupported audio format: {format}')
|
|
@@ -124,6 +125,52 @@ def extract_audio(
|
|
|
124
125
|
def get_metadata(video: pxt.Video) -> dict:
|
|
125
126
|
"""
|
|
126
127
|
Gets various metadata associated with a video file and returns it as a dictionary.
|
|
128
|
+
|
|
129
|
+
Args:
|
|
130
|
+
video: The video to get metadata for.
|
|
131
|
+
|
|
132
|
+
Returns:
|
|
133
|
+
A `dict` such as the following:
|
|
134
|
+
|
|
135
|
+
```json
|
|
136
|
+
{
|
|
137
|
+
'bit_exact': False,
|
|
138
|
+
'bit_rate': 967260,
|
|
139
|
+
'size': 2234371,
|
|
140
|
+
'metadata': {
|
|
141
|
+
'encoder': 'Lavf60.16.100',
|
|
142
|
+
'major_brand': 'isom',
|
|
143
|
+
'minor_version': '512',
|
|
144
|
+
'compatible_brands': 'isomiso2avc1mp41',
|
|
145
|
+
},
|
|
146
|
+
'streams': [
|
|
147
|
+
{
|
|
148
|
+
'type': 'video',
|
|
149
|
+
'width': 640,
|
|
150
|
+
'height': 360,
|
|
151
|
+
'frames': 462,
|
|
152
|
+
'time_base': 1.0 / 12800,
|
|
153
|
+
'duration': 236544,
|
|
154
|
+
'duration_seconds': 236544.0 / 12800,
|
|
155
|
+
'average_rate': 25.0,
|
|
156
|
+
'base_rate': 25.0,
|
|
157
|
+
'guessed_rate': 25.0,
|
|
158
|
+
'metadata': {
|
|
159
|
+
'language': 'und',
|
|
160
|
+
'handler_name': 'L-SMASH Video Handler',
|
|
161
|
+
'vendor_id': '[0][0][0][0]',
|
|
162
|
+
'encoder': 'Lavc60.31.102 libx264',
|
|
163
|
+
},
|
|
164
|
+
'codec_context': {'name': 'h264', 'codec_tag': 'avc1', 'profile': 'High', 'pix_fmt': 'yuv420p'},
|
|
165
|
+
}
|
|
166
|
+
],
|
|
167
|
+
}
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
Examples:
|
|
171
|
+
Extract metadata for files in the `video_col` column of the table `tbl`:
|
|
172
|
+
|
|
173
|
+
>>> tbl.select(tbl.video_col.get_metadata()).collect()
|
|
127
174
|
"""
|
|
128
175
|
return _get_metadata(video)
|
|
129
176
|
|
pixeltable/globals.py
CHANGED
|
@@ -11,6 +11,7 @@ from pandas.io.formats.style import Styler
|
|
|
11
11
|
from pixeltable import DataFrame, catalog, exceptions as excs, exprs, func, share
|
|
12
12
|
from pixeltable.catalog import Catalog, TableVersionPath
|
|
13
13
|
from pixeltable.catalog.insertable_table import OnErrorParameter
|
|
14
|
+
from pixeltable.config import Config
|
|
14
15
|
from pixeltable.env import Env
|
|
15
16
|
from pixeltable.iterators import ComponentIterator
|
|
16
17
|
|
|
@@ -34,8 +35,11 @@ if TYPE_CHECKING:
|
|
|
34
35
|
_logger = logging.getLogger('pixeltable')
|
|
35
36
|
|
|
36
37
|
|
|
37
|
-
def init() -> None:
|
|
38
|
+
def init(config_overrides: Optional[dict[str, Any]] = None) -> None:
|
|
38
39
|
"""Initializes the Pixeltable environment."""
|
|
40
|
+
if config_overrides is None:
|
|
41
|
+
config_overrides = {}
|
|
42
|
+
Config.init(config_overrides)
|
|
39
43
|
_ = Catalog.get()
|
|
40
44
|
|
|
41
45
|
|
|
@@ -633,6 +637,62 @@ def drop_dir(path: str, force: bool = False, if_not_exists: Literal['error', 'ig
|
|
|
633
637
|
Catalog.get().drop_dir(path_obj, if_not_exists=if_not_exists_, force=force)
|
|
634
638
|
|
|
635
639
|
|
|
640
|
+
def ls(path: str = '') -> pd.DataFrame:
|
|
641
|
+
"""
|
|
642
|
+
List the contents of a Pixeltable directory.
|
|
643
|
+
|
|
644
|
+
This function returns a Pandas DataFrame representing a human-readable listing of the specified directory,
|
|
645
|
+
including various attributes such as version and base table, as appropriate.
|
|
646
|
+
|
|
647
|
+
To get a programmatic list of tables and/or directories, use [list_tables()][pixeltable.list_tables] and/or
|
|
648
|
+
[list_dirs()][pixeltable.list_dirs] instead.
|
|
649
|
+
"""
|
|
650
|
+
from pixeltable.metadata import schema
|
|
651
|
+
|
|
652
|
+
cat = Catalog.get()
|
|
653
|
+
path_obj = catalog.Path(path, empty_is_valid=True)
|
|
654
|
+
dir_entries = cat.get_dir_contents(path_obj)
|
|
655
|
+
rows: list[list[str]] = []
|
|
656
|
+
with Catalog.get().begin_xact():
|
|
657
|
+
for name, entry in dir_entries.items():
|
|
658
|
+
if name.startswith('_'):
|
|
659
|
+
continue
|
|
660
|
+
if entry.dir is not None:
|
|
661
|
+
kind = 'dir'
|
|
662
|
+
version = ''
|
|
663
|
+
base = ''
|
|
664
|
+
else:
|
|
665
|
+
assert entry.table is not None
|
|
666
|
+
assert isinstance(entry.table, schema.Table)
|
|
667
|
+
tbl = cat.get_table_by_id(entry.table.id)
|
|
668
|
+
md = tbl.get_metadata()
|
|
669
|
+
base = md['base'] or ''
|
|
670
|
+
if base.startswith('_'):
|
|
671
|
+
base = '<anonymous base table>'
|
|
672
|
+
if md['is_snapshot']:
|
|
673
|
+
kind = 'snapshot'
|
|
674
|
+
elif md['is_view']:
|
|
675
|
+
kind = 'view'
|
|
676
|
+
else:
|
|
677
|
+
kind = 'table'
|
|
678
|
+
version = '' if kind == 'snapshot' else md['version']
|
|
679
|
+
if md['is_replica']:
|
|
680
|
+
kind = f'{kind}-replica'
|
|
681
|
+
rows.append([name, kind, version, base])
|
|
682
|
+
|
|
683
|
+
rows = sorted(rows, key=lambda x: x[0])
|
|
684
|
+
df = pd.DataFrame(
|
|
685
|
+
{
|
|
686
|
+
'Name': [row[0] for row in rows],
|
|
687
|
+
'Kind': [row[1] for row in rows],
|
|
688
|
+
'Version': [row[2] for row in rows],
|
|
689
|
+
'Base': [row[3] for row in rows],
|
|
690
|
+
},
|
|
691
|
+
index=([''] * len(rows)),
|
|
692
|
+
)
|
|
693
|
+
return df
|
|
694
|
+
|
|
695
|
+
|
|
636
696
|
def _extract_paths(
|
|
637
697
|
dir_entries: dict[str, Catalog.DirEntry],
|
|
638
698
|
parent: catalog.Path,
|
pixeltable/io/__init__.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# ruff: noqa: F401
|
|
2
2
|
|
|
3
3
|
from .datarows import import_json, import_rows
|
|
4
|
-
from .external_store import ExternalStore
|
|
4
|
+
from .external_store import ExternalStore
|
|
5
5
|
from .globals import create_label_studio_project, export_images_as_fo_dataset
|
|
6
6
|
from .hf_datasets import import_huggingface_dataset
|
|
7
7
|
from .pandas import import_csv, import_excel, import_pandas
|