pixeltable 0.4.1__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (51)
  1. pixeltable/__init__.py +1 -0
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +3 -10
  4. pixeltable/catalog/catalog.py +139 -59
  5. pixeltable/catalog/column.py +32 -23
  6. pixeltable/catalog/globals.py +2 -45
  7. pixeltable/catalog/insertable_table.py +5 -2
  8. pixeltable/catalog/path.py +6 -0
  9. pixeltable/catalog/table.py +173 -23
  10. pixeltable/catalog/table_version.py +156 -92
  11. pixeltable/catalog/table_version_handle.py +26 -1
  12. pixeltable/catalog/update_status.py +179 -0
  13. pixeltable/catalog/view.py +12 -3
  14. pixeltable/config.py +76 -12
  15. pixeltable/dataframe.py +1 -1
  16. pixeltable/env.py +29 -0
  17. pixeltable/exec/exec_node.py +7 -24
  18. pixeltable/exec/expr_eval/schedulers.py +134 -7
  19. pixeltable/exprs/column_property_ref.py +23 -20
  20. pixeltable/exprs/column_ref.py +24 -18
  21. pixeltable/exprs/data_row.py +9 -0
  22. pixeltable/exprs/function_call.py +2 -2
  23. pixeltable/exprs/row_builder.py +46 -14
  24. pixeltable/exprs/rowid_ref.py +0 -4
  25. pixeltable/func/function.py +3 -3
  26. pixeltable/functions/audio.py +36 -9
  27. pixeltable/functions/video.py +57 -10
  28. pixeltable/globals.py +61 -1
  29. pixeltable/io/__init__.py +1 -1
  30. pixeltable/io/external_store.py +39 -64
  31. pixeltable/io/globals.py +4 -4
  32. pixeltable/io/hf_datasets.py +10 -2
  33. pixeltable/io/label_studio.py +52 -48
  34. pixeltable/metadata/__init__.py +1 -1
  35. pixeltable/metadata/converters/convert_38.py +39 -0
  36. pixeltable/metadata/converters/convert_39.py +125 -0
  37. pixeltable/metadata/converters/util.py +3 -0
  38. pixeltable/metadata/notes.py +2 -0
  39. pixeltable/metadata/schema.py +14 -2
  40. pixeltable/metadata/utils.py +78 -0
  41. pixeltable/plan.py +26 -18
  42. pixeltable/share/packager.py +20 -38
  43. pixeltable/store.py +121 -142
  44. pixeltable/type_system.py +2 -2
  45. pixeltable/utils/coroutine.py +6 -23
  46. pixeltable/utils/media_store.py +39 -0
  47. {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/METADATA +1 -1
  48. {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/RECORD +51 -47
  49. {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/LICENSE +0 -0
  50. {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/WHEEL +0 -0
  51. {pixeltable-0.4.1.dist-info → pixeltable-0.4.3.dist-info}/entry_points.txt +0 -0
@@ -26,6 +26,7 @@ class ColumnPropertyRef(Expr):
26
26
  ERRORMSG = 1
27
27
  FILEURL = 2
28
28
  LOCALPATH = 3
29
+ CELLMD = 4 # JSON metadata for the cell, e.g. errortype, errormsg for media columns
29
30
 
30
31
  def __init__(self, col_ref: ColumnRef, prop: Property):
31
32
  super().__init__(ts.StringType(nullable=True))
@@ -51,42 +52,39 @@ class ColumnPropertyRef(Expr):
51
52
  def __repr__(self) -> str:
52
53
  return f'{self._col_ref}.{self.prop.name.lower()}'
53
54
 
54
- def is_error_prop(self) -> bool:
55
- return self.prop in (self.Property.ERRORTYPE, self.Property.ERRORMSG)
55
+ def is_cellmd_prop(self) -> bool:
56
+ return self.prop in (self.Property.ERRORTYPE, self.Property.ERRORMSG, self.Property.CELLMD)
56
57
 
57
58
  def sql_expr(self, sql_elements: SqlElementCache) -> Optional[sql.ColumnElement]:
58
- if not self._col_ref.col.is_stored:
59
+ if not self._col_ref.col_handle.get().is_stored:
59
60
  return None
60
-
61
- # we need to reestablish that we have the correct Column instance, there could have been a metadata
62
- # reload since init()
63
- # TODO: add an explicit prepare phase (ie, Expr.prepare()) that gives every subclass instance a chance to
64
- # perform runtime checks and update state
65
- tv = self._col_ref.tbl_version.get()
66
- assert tv.is_validated
67
- # we can assume at this point during query execution that the column exists
68
- assert self._col_ref.col_id in tv.cols_by_id
69
- col = tv.cols_by_id[self._col_ref.col_id]
61
+ col = self._col_ref.col_handle.get()
70
62
 
71
63
  # the errortype/-msg properties of a read-validated media column need to be extracted from the DataRow
72
64
  if (
73
65
  col.col_type.is_media_type()
74
66
  and col.media_validation == catalog.MediaValidation.ON_READ
75
- and self.is_error_prop()
67
+ and self.is_cellmd_prop()
76
68
  ):
77
69
  return None
78
70
 
79
71
  if self.prop == self.Property.ERRORTYPE:
80
- assert col.sa_errortype_col is not None
81
- return col.sa_errortype_col
72
+ return col.sa_cellmd_col.op('->>')('errortype')
82
73
  if self.prop == self.Property.ERRORMSG:
83
- assert col.sa_errormsg_col is not None
84
- return col.sa_errormsg_col
74
+ return col.sa_cellmd_col.op('->>')('errormsg')
75
+ if self.prop == self.Property.CELLMD:
76
+ assert col.sa_cellmd_col is not None
77
+ return col.sa_cellmd_col
85
78
  if self.prop == self.Property.FILEURL:
86
79
  # the file url is stored as the column value
87
80
  return sql_elements.get(self._col_ref)
88
81
  return None
89
82
 
83
+ @classmethod
84
+ def create_cellmd_exc(cls, exc: Exception) -> dict[str, str]:
85
+ """Create a cellmd value from an exception."""
86
+ return {'errortype': type(exc).__name__, 'errormsg': str(exc)}
87
+
90
88
  def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
91
89
  if self.prop == self.Property.FILEURL:
92
90
  assert data_row.has_val[self._col_ref.slot_idx]
@@ -96,14 +94,19 @@ class ColumnPropertyRef(Expr):
96
94
  assert data_row.has_val[self._col_ref.slot_idx]
97
95
  data_row[self.slot_idx] = data_row.file_paths[self._col_ref.slot_idx]
98
96
  return
99
- elif self.is_error_prop():
97
+ elif self.is_cellmd_prop():
100
98
  exc = data_row.get_exc(self._col_ref.slot_idx)
101
99
  if exc is None:
102
100
  data_row[self.slot_idx] = None
103
101
  elif self.prop == self.Property.ERRORTYPE:
104
102
  data_row[self.slot_idx] = type(exc).__name__
105
- else:
103
+ elif self.prop == self.Property.ERRORMSG:
106
104
  data_row[self.slot_idx] = str(exc)
105
+ elif self.prop == self.Property.CELLMD:
106
+ data_row[self.slot_idx] = self.create_cellmd_exc(exc)
107
+ else:
108
+ raise AssertionError(f'Unknown property {self.prop}')
109
+ return
107
110
  else:
108
111
  raise AssertionError()
109
112
 
@@ -10,6 +10,7 @@ import pixeltable as pxt
10
10
  from pixeltable import catalog, exceptions as excs, iterators as iters
11
11
 
12
12
  from ..utils.description_helper import DescriptionHelper
13
+ from ..utils.filecache import FileCache
13
14
  from .data_row import DataRow
14
15
  from .expr import Expr
15
16
  from .row_builder import RowBuilder
@@ -41,7 +42,8 @@ class ColumnRef(Expr):
41
42
  insert them into the EvalCtxs as needed
42
43
  """
43
44
 
44
- col: catalog.Column
45
+ col: catalog.Column # TODO: merge with col_handle
46
+ col_handle: catalog.ColumnHandle
45
47
  reference_tbl: Optional[catalog.TableVersionPath]
46
48
  is_unstored_iter_col: bool
47
49
  iter_arg_ctx: Optional[RowBuilder.EvalCtx]
@@ -52,10 +54,6 @@ class ColumnRef(Expr):
52
54
  id: int
53
55
  perform_validation: bool # if True, performs media validation
54
56
 
55
- # needed by sql_expr() to re-resolve Column instance after a metadata reload
56
- tbl_version: catalog.TableVersionHandle
57
- col_id: int
58
-
59
57
  def __init__(
60
58
  self,
61
59
  col: catalog.Column,
@@ -66,8 +64,7 @@ class ColumnRef(Expr):
66
64
  assert col.tbl is not None
67
65
  self.col = col
68
66
  self.reference_tbl = reference_tbl
69
- self.tbl_version = catalog.TableVersionHandle(col.tbl.id, col.tbl.effective_version)
70
- self.col_id = col.id
67
+ self.col_handle = catalog.ColumnHandle(col.tbl.handle, col.id)
71
68
 
72
69
  self.is_unstored_iter_col = col.tbl.is_component_view and col.tbl.is_iterator_column(col) and not col.is_stored
73
70
  self.iter_arg_ctx = None
@@ -118,11 +115,15 @@ class ColumnRef(Expr):
118
115
  from .column_property_ref import ColumnPropertyRef
119
116
 
120
117
  # resolve column properties
118
+ if name == ColumnPropertyRef.Property.CELLMD.name.lower():
119
+ # This is not user accessible, but used internally to store cell metadata
120
+ return super().__getattr__(name)
121
+
121
122
  if (
122
123
  name == ColumnPropertyRef.Property.ERRORTYPE.name.lower()
123
124
  or name == ColumnPropertyRef.Property.ERRORMSG.name.lower()
124
125
  ):
125
- property_is_present = self.col.is_stored and (self.col.is_computed or self.col_type.is_media_type())
126
+ property_is_present = self.col.stores_cellmd
126
127
  if not property_is_present:
127
128
  raise excs.Error(f'{name} only valid for a stored computed or media column: {self}')
128
129
  return ColumnPropertyRef(self, ColumnPropertyRef.Property[name.upper()])
@@ -170,6 +171,20 @@ class ColumnRef(Expr):
170
171
  idx_info = embedding_idx_info
171
172
  return idx_info
172
173
 
174
+ def recompute(self, *, cascade: bool = True, errors_only: bool = False) -> catalog.UpdateStatus:
175
+ cat = catalog.Catalog.get()
176
+ # lock_mutable_tree=True: we need to be able to see whether any transitive view has column dependents
177
+ with cat.begin_xact(tbl=self.reference_tbl, for_write=True, lock_mutable_tree=True):
178
+ tbl_version = self.col_handle.tbl_version.get()
179
+ if tbl_version.id != self.reference_tbl.tbl_id:
180
+ raise excs.Error('Cannot recompute column of a base.')
181
+ if tbl_version.is_snapshot:
182
+ raise excs.Error('Cannot recompute column of a snapshot.')
183
+ col_name = self.col_handle.get().name
184
+ status = tbl_version.recompute_columns([col_name], errors_only=errors_only, cascade=cascade)
185
+ FileCache.get().emit_eviction_warnings()
186
+ return status
187
+
173
188
  def similarity(self, item: Any, *, idx: Optional[str] = None) -> Expr:
174
189
  from .similarity_expr import SimilarityExpr
175
190
 
@@ -241,16 +256,7 @@ class ColumnRef(Expr):
241
256
  def sql_expr(self, _: SqlElementCache) -> Optional[sql.ColumnElement]:
242
257
  if self.perform_validation:
243
258
  return None
244
- # we need to reestablish that we have the correct Column instance, there could have been a metadata
245
- # reload since init()
246
- # TODO: add an explicit prepare phase (ie, Expr.prepare()) that gives every subclass instance a chance to
247
- # perform runtime checks and update state
248
- tv = self.tbl_version.get()
249
- assert tv.is_validated
250
- # we can assume at this point during query execution that the column exists
251
- assert self.col_id in tv.cols_by_id
252
- self.col = tv.cols_by_id[self.col_id]
253
- assert self.col.tbl is tv
259
+ self.col = self.col_handle.get()
254
260
  return self.col.sa_col
255
261
 
256
262
  def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
@@ -42,6 +42,10 @@ class DataRow:
42
42
  has_val: np.ndarray # of bool
43
43
  excs: np.ndarray # of object
44
44
 
45
+ # If `_may_have_exc` is False, then we guarantee that no slot has an exception set. This is used to optimize
46
+ # exception handling under normal operation.
47
+ _may_have_exc: bool
48
+
45
49
  # expr evaluation state; indexed by slot idx
46
50
  missing_slots: np.ndarray # of bool; number of missing dependencies
47
51
  missing_dependents: np.ndarray # of int16; number of missing dependents
@@ -90,6 +94,7 @@ class DataRow:
90
94
  self.vals = np.full(num_slots, None, dtype=object)
91
95
  self.has_val = np.zeros(num_slots, dtype=bool)
92
96
  self.excs = np.full(num_slots, None, dtype=object)
97
+ self._may_have_exc = False
93
98
  self.missing_slots = np.zeros(num_slots, dtype=bool)
94
99
  self.missing_dependents = np.zeros(num_slots, dtype=np.int16)
95
100
  self.is_scheduled = np.zeros(num_slots, dtype=bool)
@@ -136,6 +141,9 @@ class DataRow:
136
141
  """
137
142
  Returns True if an exception has been set for the given slot index, or for any slot index if slot_idx is None
138
143
  """
144
+ if not self._may_have_exc:
145
+ return False
146
+
139
147
  if slot_idx is not None:
140
148
  return self.excs[slot_idx] is not None
141
149
  return (self.excs != None).any()
@@ -154,6 +162,7 @@ class DataRow:
154
162
  def set_exc(self, slot_idx: int, exc: Exception) -> None:
155
163
  assert self.excs[slot_idx] is None
156
164
  self.excs[slot_idx] = exc
165
+ self._may_have_exc = True
157
166
 
158
167
  # an exception means the value is None
159
168
  self.has_val[slot_idx] = True
@@ -446,11 +446,11 @@ class FunctionCall(Expr):
446
446
  dedent(
447
447
  f"""
448
448
  The UDF '{fn.self_path}' cannot be located, because
449
- {{errormsg}}
449
+ {{error_msg}}
450
450
  """
451
451
  )
452
452
  .strip()
453
- .format(errormsg=fn.errormsg)
453
+ .format(error_msg=fn.error_msg)
454
454
  )
455
455
  return cls(fn, args, kwargs, return_type, is_method_call=is_method_call, validation_error=validation_error)
456
456
 
@@ -63,6 +63,7 @@ class RowBuilder:
63
63
 
64
64
  input_exprs: ExprSet
65
65
 
66
+ tbl: Optional[catalog.TableVersion] # reference table of the RowBuilder; used to identify pk columns for writes
66
67
  table_columns: list[ColumnSlotIdx]
67
68
  default_eval_ctx: EvalCtx
68
69
  unstored_iter_args: dict[UUID, Expr]
@@ -93,7 +94,13 @@ class RowBuilder:
93
94
  target_slot_idxs: list[int] # slot idxs of target exprs; might contain duplicates
94
95
  target_exprs: list[Expr] # exprs corresponding to target_slot_idxs
95
96
 
96
- def __init__(self, output_exprs: Sequence[Expr], columns: Sequence[catalog.Column], input_exprs: Iterable[Expr]):
97
+ def __init__(
98
+ self,
99
+ output_exprs: Sequence[Expr],
100
+ columns: Sequence[catalog.Column],
101
+ input_exprs: Iterable[Expr],
102
+ tbl: Optional[catalog.TableVersion] = None,
103
+ ):
97
104
  """
98
105
  Args:
99
106
  output_exprs: list of Exprs to be evaluated
@@ -125,6 +132,7 @@ class RowBuilder:
125
132
  # * further references to that column (eg, computed cols) need to resolve to the validating ColumnRef
126
133
  from .column_ref import ColumnRef
127
134
 
135
+ self.tbl = tbl
128
136
  self.table_columns: list[ColumnSlotIdx] = []
129
137
  self.input_exprs = ExprSet()
130
138
  validating_colrefs: dict[Expr, Expr] = {} # key: non-validating colref, value: corresp. validating colref
@@ -201,7 +209,7 @@ class RowBuilder:
201
209
  # this is input and therefore doesn't depend on other exprs
202
210
  continue
203
211
  # error properties don't have exceptions themselves
204
- if isinstance(expr, ColumnPropertyRef) and expr.is_error_prop():
212
+ if isinstance(expr, ColumnPropertyRef) and expr.is_cellmd_prop():
205
213
  continue
206
214
  dependency_idxs = [d.slot_idx for d in expr.dependencies()]
207
215
  self.dependencies[expr.slot_idx, dependency_idxs] = True
@@ -229,6 +237,7 @@ class RowBuilder:
229
237
 
230
238
  def add_table_column(self, col: catalog.Column, slot_idx: int) -> None:
231
239
  """Record a column that is part of the table row"""
240
+ assert self.tbl is not None
232
241
  self.table_columns.append(ColumnSlotIdx(col, slot_idx))
233
242
 
234
243
  def output_slot_idxs(self) -> list[ColumnSlotIdx]:
@@ -427,33 +436,56 @@ class RowBuilder:
427
436
  expr, f'expression {expr}', data_row.get_exc(expr.slot_idx), exc_tb, input_vals, 0
428
437
  ) from exc
429
438
 
430
- def create_table_row(self, data_row: DataRow, exc_col_ids: set[int]) -> tuple[dict[str, Any], int]:
439
+ def create_table_row(
440
+ self, data_row: DataRow, cols_with_excs: Optional[set[int]], pk: tuple[int, ...]
441
+ ) -> tuple[list[Any], int]:
431
442
  """Create a table row from the slots that have an output column assigned
432
443
 
433
- Return tuple[dict that represents a stored row (can be passed to sql.insert()), # of exceptions]
444
+ Return tuple[list of row values in `self.table_columns` order, # of exceptions]
434
445
  This excludes system columns.
435
446
  """
447
+ from pixeltable.exprs.column_property_ref import ColumnPropertyRef
448
+
436
449
  num_excs = 0
437
- table_row: dict[str, Any] = {}
450
+ table_row: list[Any] = list(pk)
438
451
  for info in self.table_columns:
439
452
  col, slot_idx = info.col, info.slot_idx
440
453
  if data_row.has_exc(slot_idx):
441
- # exceptions get stored in the errortype/-msg columns
442
454
  exc = data_row.get_exc(slot_idx)
443
455
  num_excs += 1
444
- exc_col_ids.add(col.id)
445
- table_row[col.store_name()] = None
446
- table_row[col.errortype_store_name()] = type(exc).__name__
447
- table_row[col.errormsg_store_name()] = str(exc)
456
+ if cols_with_excs is not None:
457
+ cols_with_excs.add(col.id)
458
+ table_row.append(None)
459
+ if col.stores_cellmd:
460
+ # exceptions get stored in the errortype/-msg properties of the cellmd column
461
+ table_row.append(ColumnPropertyRef.create_cellmd_exc(exc))
448
462
  else:
449
463
  if col.col_type.is_image_type() and data_row.file_urls[slot_idx] is None:
450
464
  # we have yet to store this image
451
465
  filepath = str(MediaStore.prepare_media_path(col.tbl.id, col.id, col.tbl.version))
452
466
  data_row.flush_img(slot_idx, filepath)
453
467
  val = data_row.get_stored_val(slot_idx, col.get_sa_col_type())
454
- table_row[col.store_name()] = val
455
- # we unfortunately need to set these, even if there are no errors
456
- table_row[col.errortype_store_name()] = None
457
- table_row[col.errormsg_store_name()] = None
468
+ table_row.append(val)
469
+ if col.stores_cellmd:
470
+ table_row.append(None) # placeholder for cellmd column
458
471
 
459
472
  return table_row, num_excs
473
+
474
+ def store_column_names(self) -> tuple[list[str], dict[int, catalog.Column]]:
475
+ """
476
+ Returns the list of store column names corresponding to the table_columns of this RowBuilder.
477
+ The second tuple element of the return value is a dictionary containing all media columns in the
478
+ table; it's the mapping {list_index: column}.
479
+ """
480
+ assert self.tbl is not None, self.table_columns
481
+ store_col_names: list[str] = [pk_col.name for pk_col in self.tbl.store_tbl.pk_columns()]
482
+ media_cols: dict[int, catalog.Column] = {}
483
+
484
+ for col in self.table_columns:
485
+ if col.col.col_type.is_media_type():
486
+ media_cols[len(store_col_names)] = col.col
487
+ store_col_names.append(col.col.store_name())
488
+ if col.col.stores_cellmd:
489
+ store_col_names.append(col.col.cellmd_store_name())
490
+
491
+ return store_col_names, media_cols
@@ -105,10 +105,6 @@ class RowidRef(Expr):
105
105
  assert self.rowid_component_idx <= len(rowid_cols), (
106
106
  f'{self.rowid_component_idx} not consistent with {rowid_cols}'
107
107
  )
108
- # _logger.debug(
109
- # f'RowidRef.sql_expr: tbl={tbl.id}{tbl.effective_version} sa_tbl={id(tbl.store_tbl.sa_tbl):x} '
110
- # f'tv={id(tbl):x}'
111
- # )
112
108
  return rowid_cols[self.rowid_component_idx]
113
109
 
114
110
  def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
@@ -504,12 +504,12 @@ class Function(ABC):
504
504
 
505
505
  class InvalidFunction(Function):
506
506
  fn_dict: dict[str, Any]
507
- errormsg: str
507
+ error_msg: str
508
508
 
509
- def __init__(self, self_path: str, fn_dict: dict[str, Any], errormsg: str):
509
+ def __init__(self, self_path: str, fn_dict: dict[str, Any], error_msg: str):
510
510
  super().__init__([], self_path)
511
511
  self.fn_dict = fn_dict
512
- self.errormsg = errormsg
512
+ self.error_msg = error_msg
513
513
 
514
514
  def _as_dict(self) -> dict:
515
515
  """
@@ -1,14 +1,5 @@
1
1
  """
2
2
  Pixeltable [UDFs](https://pixeltable.readme.io/docs/user-defined-functions-udfs) for `AudioType`.
3
-
4
- Example:
5
- ```python
6
- import pixeltable as pxt
7
- import pixeltable.functions as pxtf
8
-
9
- t = pxt.get_table(...)
10
- t.select(pxtf.audio.get_metadata()).collect()
11
- ```
12
3
  """
13
4
 
14
5
  import pixeltable as pxt
@@ -19,6 +10,42 @@ from pixeltable.utils.code import local_public_names
19
10
  def get_metadata(audio: pxt.Audio) -> dict:
20
11
  """
21
12
  Gets various metadata associated with an audio file and returns it as a dictionary.
13
+
14
+ Args:
15
+ audio: The audio to get metadata for.
16
+
17
+ Returns:
18
+ A `dict` such as the following:
19
+
20
+ ```python
21
+ {
22
+ 'size': 2568827,
23
+ 'streams': [
24
+ {
25
+ 'type': 'audio',
26
+ 'frames': 0,
27
+ 'duration': 2646000,
28
+ 'metadata': {},
29
+ 'time_base': 2.2675736961451248e-05,
30
+ 'codec_context': {
31
+ 'name': 'flac',
32
+ 'profile': None,
33
+ 'channels': 1,
34
+ 'codec_tag': '\\x00\\x00\\x00\\x00',
35
+ },
36
+ 'duration_seconds': 60.0,
37
+ }
38
+ ],
39
+ 'bit_rate': 342510,
40
+ 'metadata': {'encoder': 'Lavf61.1.100'},
41
+ 'bit_exact': False,
42
+ }
43
+ ```
44
+
45
+ Examples:
46
+ Extract metadata for files in the `audio_col` column of the table `tbl`:
47
+
48
+ >>> tbl.select(tbl.audio_col.get_metadata()).collect()
22
49
  """
23
50
  return pxt.functions.video._get_metadata(audio)
24
51
 
@@ -1,14 +1,5 @@
1
1
  """
2
2
  Pixeltable [UDFs](https://pixeltable.readme.io/docs/user-defined-functions-udfs) for `VideoType`.
3
-
4
- Example:
5
- ```python
6
- import pixeltable as pxt
7
- import pixeltable.functions as pxtf
8
-
9
- t = pxt.get_table(...)
10
- t.select(pxtf.video.extract_audio(t.video_col)).collect()
11
- ```
12
3
  """
13
4
 
14
5
  import tempfile
@@ -92,12 +83,22 @@ def extract_audio(
92
83
  video_path: pxt.Video, stream_idx: int = 0, format: str = 'wav', codec: Optional[str] = None
93
84
  ) -> pxt.Audio:
94
85
  """
95
- Extract an audio stream from a video file, save it as a media file and return its path.
86
+ Extract an audio stream from a video.
96
87
 
97
88
  Args:
98
89
  stream_idx: Index of the audio stream to extract.
99
90
  format: The target audio format. (`'wav'`, `'mp3'`, `'flac'`).
100
91
  codec: The codec to use for the audio stream. If not provided, a default codec will be used.
92
+
93
+ Returns:
94
+ The extracted audio.
95
+
96
+ Examples:
97
+ Add a computed column to a table `tbl` that extracts audio from an existing column `video_col`:
98
+
99
+ >>> tbl.add_computed_column(
100
+ ... extracted_audio=tbl.video_col.extract_audio(format='flac')
101
+ ... )
101
102
  """
102
103
  if format not in _format_defaults:
103
104
  raise ValueError(f'extract_audio(): unsupported audio format: {format}')
@@ -124,6 +125,52 @@ def extract_audio(
124
125
  def get_metadata(video: pxt.Video) -> dict:
125
126
  """
126
127
  Gets various metadata associated with a video file and returns it as a dictionary.
128
+
129
+ Args:
130
+ video: The video to get metadata for.
131
+
132
+ Returns:
133
+ A `dict` such as the following:
134
+
135
+ ```python
136
+ {
137
+ 'bit_exact': False,
138
+ 'bit_rate': 967260,
139
+ 'size': 2234371,
140
+ 'metadata': {
141
+ 'encoder': 'Lavf60.16.100',
142
+ 'major_brand': 'isom',
143
+ 'minor_version': '512',
144
+ 'compatible_brands': 'isomiso2avc1mp41',
145
+ },
146
+ 'streams': [
147
+ {
148
+ 'type': 'video',
149
+ 'width': 640,
150
+ 'height': 360,
151
+ 'frames': 462,
152
+ 'time_base': 1.0 / 12800,
153
+ 'duration': 236544,
154
+ 'duration_seconds': 236544.0 / 12800,
155
+ 'average_rate': 25.0,
156
+ 'base_rate': 25.0,
157
+ 'guessed_rate': 25.0,
158
+ 'metadata': {
159
+ 'language': 'und',
160
+ 'handler_name': 'L-SMASH Video Handler',
161
+ 'vendor_id': '[0][0][0][0]',
162
+ 'encoder': 'Lavc60.31.102 libx264',
163
+ },
164
+ 'codec_context': {'name': 'h264', 'codec_tag': 'avc1', 'profile': 'High', 'pix_fmt': 'yuv420p'},
165
+ }
166
+ ],
167
+ }
168
+ ```
169
+
170
+ Examples:
171
+ Extract metadata for files in the `video_col` column of the table `tbl`:
172
+
173
+ >>> tbl.select(tbl.video_col.get_metadata()).collect()
127
174
  """
128
175
  return _get_metadata(video)
129
176
 
pixeltable/globals.py CHANGED
@@ -11,6 +11,7 @@ from pandas.io.formats.style import Styler
11
11
  from pixeltable import DataFrame, catalog, exceptions as excs, exprs, func, share
12
12
  from pixeltable.catalog import Catalog, TableVersionPath
13
13
  from pixeltable.catalog.insertable_table import OnErrorParameter
14
+ from pixeltable.config import Config
14
15
  from pixeltable.env import Env
15
16
  from pixeltable.iterators import ComponentIterator
16
17
 
@@ -34,8 +35,11 @@ if TYPE_CHECKING:
34
35
  _logger = logging.getLogger('pixeltable')
35
36
 
36
37
 
37
- def init() -> None:
38
+ def init(config_overrides: Optional[dict[str, Any]] = None) -> None:
38
39
  """Initializes the Pixeltable environment."""
40
+ if config_overrides is None:
41
+ config_overrides = {}
42
+ Config.init(config_overrides)
39
43
  _ = Catalog.get()
40
44
 
41
45
 
@@ -633,6 +637,62 @@ def drop_dir(path: str, force: bool = False, if_not_exists: Literal['error', 'ig
633
637
  Catalog.get().drop_dir(path_obj, if_not_exists=if_not_exists_, force=force)
634
638
 
635
639
 
640
+ def ls(path: str = '') -> pd.DataFrame:
641
+ """
642
+ List the contents of a Pixeltable directory.
643
+
644
+ This function returns a Pandas DataFrame representing a human-readable listing of the specified directory,
645
+ including various attributes such as version and base table, as appropriate.
646
+
647
+ To get a programmatic list of tables and/or directories, use [list_tables()][pixeltable.list_tables] and/or
648
+ [list_dirs()][pixeltable.list_dirs] instead.
649
+ """
650
+ from pixeltable.metadata import schema
651
+
652
+ cat = Catalog.get()
653
+ path_obj = catalog.Path(path, empty_is_valid=True)
654
+ dir_entries = cat.get_dir_contents(path_obj)
655
+ rows: list[list[str]] = []
656
+ with Catalog.get().begin_xact():
657
+ for name, entry in dir_entries.items():
658
+ if name.startswith('_'):
659
+ continue
660
+ if entry.dir is not None:
661
+ kind = 'dir'
662
+ version = ''
663
+ base = ''
664
+ else:
665
+ assert entry.table is not None
666
+ assert isinstance(entry.table, schema.Table)
667
+ tbl = cat.get_table_by_id(entry.table.id)
668
+ md = tbl.get_metadata()
669
+ base = md['base'] or ''
670
+ if base.startswith('_'):
671
+ base = '<anonymous base table>'
672
+ if md['is_snapshot']:
673
+ kind = 'snapshot'
674
+ elif md['is_view']:
675
+ kind = 'view'
676
+ else:
677
+ kind = 'table'
678
+ version = '' if kind == 'snapshot' else md['version']
679
+ if md['is_replica']:
680
+ kind = f'{kind}-replica'
681
+ rows.append([name, kind, version, base])
682
+
683
+ rows = sorted(rows, key=lambda x: x[0])
684
+ df = pd.DataFrame(
685
+ {
686
+ 'Name': [row[0] for row in rows],
687
+ 'Kind': [row[1] for row in rows],
688
+ 'Version': [row[2] for row in rows],
689
+ 'Base': [row[3] for row in rows],
690
+ },
691
+ index=([''] * len(rows)),
692
+ )
693
+ return df
694
+
695
+
636
696
  def _extract_paths(
637
697
  dir_entries: dict[str, Catalog.DirEntry],
638
698
  parent: catalog.Path,
pixeltable/io/__init__.py CHANGED
@@ -1,7 +1,7 @@
1
1
  # ruff: noqa: F401
2
2
 
3
3
  from .datarows import import_json, import_rows
4
- from .external_store import ExternalStore, SyncStatus
4
+ from .external_store import ExternalStore
5
5
  from .globals import create_label_studio_project, export_images_as_fo_dataset
6
6
  from .hf_datasets import import_huggingface_dataset
7
7
  from .pandas import import_csv, import_excel, import_pandas