pixeltable 0.4.2__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (60)
  1. pixeltable/__init__.py +1 -0
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/__init__.py +3 -11
  4. pixeltable/catalog/catalog.py +575 -220
  5. pixeltable/catalog/column.py +22 -23
  6. pixeltable/catalog/dir.py +1 -2
  7. pixeltable/catalog/globals.py +2 -148
  8. pixeltable/catalog/insertable_table.py +15 -13
  9. pixeltable/catalog/path.py +6 -0
  10. pixeltable/catalog/schema_object.py +9 -4
  11. pixeltable/catalog/table.py +96 -85
  12. pixeltable/catalog/table_version.py +257 -174
  13. pixeltable/catalog/table_version_path.py +1 -1
  14. pixeltable/catalog/tbl_ops.py +44 -0
  15. pixeltable/catalog/update_status.py +179 -0
  16. pixeltable/catalog/view.py +50 -56
  17. pixeltable/config.py +76 -12
  18. pixeltable/dataframe.py +19 -6
  19. pixeltable/env.py +50 -4
  20. pixeltable/exec/data_row_batch.py +3 -1
  21. pixeltable/exec/exec_node.py +7 -24
  22. pixeltable/exec/expr_eval/schedulers.py +134 -7
  23. pixeltable/exec/in_memory_data_node.py +6 -7
  24. pixeltable/exprs/column_property_ref.py +21 -9
  25. pixeltable/exprs/column_ref.py +7 -2
  26. pixeltable/exprs/function_call.py +2 -2
  27. pixeltable/exprs/row_builder.py +10 -9
  28. pixeltable/exprs/rowid_ref.py +0 -4
  29. pixeltable/func/function.py +3 -3
  30. pixeltable/functions/audio.py +36 -9
  31. pixeltable/functions/gemini.py +4 -4
  32. pixeltable/functions/openai.py +1 -2
  33. pixeltable/functions/video.py +59 -16
  34. pixeltable/globals.py +109 -24
  35. pixeltable/io/__init__.py +1 -1
  36. pixeltable/io/datarows.py +2 -1
  37. pixeltable/io/external_store.py +3 -55
  38. pixeltable/io/globals.py +4 -4
  39. pixeltable/io/hf_datasets.py +10 -2
  40. pixeltable/io/label_studio.py +16 -16
  41. pixeltable/io/pandas.py +1 -0
  42. pixeltable/io/table_data_conduit.py +12 -13
  43. pixeltable/iterators/audio.py +17 -8
  44. pixeltable/iterators/image.py +5 -2
  45. pixeltable/metadata/__init__.py +1 -1
  46. pixeltable/metadata/converters/convert_39.py +125 -0
  47. pixeltable/metadata/converters/util.py +3 -0
  48. pixeltable/metadata/notes.py +1 -0
  49. pixeltable/metadata/schema.py +50 -1
  50. pixeltable/plan.py +4 -0
  51. pixeltable/share/packager.py +20 -38
  52. pixeltable/store.py +40 -51
  53. pixeltable/type_system.py +2 -2
  54. pixeltable/utils/coroutine.py +6 -23
  55. pixeltable/utils/media_store.py +50 -0
  56. {pixeltable-0.4.2.dist-info → pixeltable-0.4.4.dist-info}/METADATA +1 -1
  57. {pixeltable-0.4.2.dist-info → pixeltable-0.4.4.dist-info}/RECORD +60 -57
  58. {pixeltable-0.4.2.dist-info → pixeltable-0.4.4.dist-info}/LICENSE +0 -0
  59. {pixeltable-0.4.2.dist-info → pixeltable-0.4.4.dist-info}/WHEEL +0 -0
  60. {pixeltable-0.4.2.dist-info → pixeltable-0.4.4.dist-info}/entry_points.txt +0 -0
@@ -76,7 +76,7 @@ class TableVersionPath:
76
76
  elif self._cached_tbl_version is not None:
77
77
  return
78
78
 
79
- with Catalog.get().begin_xact(for_write=False):
79
+ with Catalog.get().begin_xact(tbl_id=self.tbl_version.id, for_write=False):
80
80
  self._cached_tbl_version = self.tbl_version.get()
81
81
 
82
82
  def clear_cached_md(self) -> None:
@@ -0,0 +1,44 @@
1
+ # This file contains all dataclasses related to schema.PendingTableOp:
2
+ # - TableOp: the container for each log entry
3
+ # - <>Op: the actual operation, which is performed by TableVersion.exec_op(); each <>Op class contains
4
+ # enough information for exec_op() to perform the operation without having to reference data outside of
5
+ # TableVersion
6
+
7
+ import dataclasses
8
+ from typing import Any, Optional
9
+
10
+
11
@dataclasses.dataclass
class CreateStoreTableOp:
    """Operation payload with no fields of its own (presumably: create the table's store table)."""


@dataclasses.dataclass
class LoadViewOp:
    """Operation payload that carries the serialized path of the view to load."""

    # needed to create the view load plan
    view_path: dict[str, Any]


@dataclasses.dataclass
class DeleteTableMdOp:
    """Operation payload with no fields of its own (presumably: delete the table's metadata)."""


@dataclasses.dataclass
class DeleteTableMediaFilesOp:
    """Operation payload with no fields of its own (presumably: delete the table's media files)."""


@dataclasses.dataclass
class DropStoreTableOp:
    """Operation payload with no fields of its own (presumably: drop the table's store table)."""


@dataclasses.dataclass
class TableOp:
    """
    Container for a single log entry: identifies the table, positions the entry within its
    update operation, and holds the payload of the operation to perform.
    """

    # string form of the table's uuid.UUID
    tbl_id: str
    # sequence number within the update operation; [0, num_ops)
    op_sn: int
    # total number of ops forming the update operation
    num_ops: int
    # if True, op must be run as part of a transaction
    needs_xact: bool

    # operation payloads; each defaults to None when not supplied
    create_store_table_op: Optional[CreateStoreTableOp] = dataclasses.field(default=None)
    load_view_op: Optional[LoadViewOp] = dataclasses.field(default=None)
@@ -0,0 +1,179 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import TYPE_CHECKING
5
+
6
+ if TYPE_CHECKING:
7
+ from IPython.lib.pretty import RepresentationPrinter
8
+
9
+
10
@dataclass(frozen=True)
class RowCountStats:
    """
    Statistics about the counts of rows affected by a table operation.

    Instances are immutable. Two instances can be combined with `+`, which adds the
    counters field by field and returns a new instance.
    """

    ins_rows: int = 0  # rows inserted
    del_rows: int = 0  # rows deleted
    upd_rows: int = 0  # rows updated
    num_excs: int = 0  # total number of exceptions
    # TODO: disambiguate what this means: # of slots computed or # of columns computed?
    computed_values: int = 0  # number of computed values (e.g., computed columns) affected by the operation

    @property
    def num_rows(self) -> int:
        """Total number of rows counted: inserted + deleted + updated."""
        return self.ins_rows + self.del_rows + self.upd_rows

    def insert_to_update(self) -> 'RowCountStats':
        """
        Convert insert row count stats to update row count stats.
        This is used when an insert operation is treated as an update.

        Returns:
            A new instance in which the inserted rows have been moved into `upd_rows`
            (`ins_rows` becomes 0); all other counters are carried over unchanged.
        """
        return RowCountStats(
            ins_rows=0,
            del_rows=self.del_rows,
            upd_rows=self.upd_rows + self.ins_rows,
            num_excs=self.num_excs,
            computed_values=self.computed_values,
        )

    def __add__(self, other: 'RowCountStats') -> 'RowCountStats':
        """
        Add the stats from two RowCountStats objects together.

        Returns `NotImplemented` for any operand that is not a RowCountStats, so that
        Python raises its standard TypeError for unsupported operand types instead of
        an AttributeError from inside this method.
        """
        if not isinstance(other, RowCountStats):
            return NotImplemented
        return RowCountStats(
            ins_rows=self.ins_rows + other.ins_rows,
            del_rows=self.del_rows + other.del_rows,
            upd_rows=self.upd_rows + other.upd_rows,
            num_excs=self.num_excs + other.num_excs,
            computed_values=self.computed_values + other.computed_values,
        )
51
+
52
+
53
@dataclass(frozen=True)
class UpdateStatus:
    """
    Information about changes to table data or table schema.

    Row counts are kept in three separate RowCountStats: rows affected directly by the
    operation, rows affected through changes cascaded to other tables, and rows affected
    in an external store. Instances are immutable; combine them with `+`.
    """

    updated_cols: list[str] = field(default_factory=list)
    cols_with_excs: list[str] = field(default_factory=list)

    # stats for the rows affected by the operation
    row_count_stats: RowCountStats = field(default_factory=RowCountStats)

    # stats for changes cascaded to other tables
    cascade_row_count_stats: RowCountStats = field(default_factory=RowCountStats)

    # stats for the rows affected by the operation in an external store
    ext_row_count_stats: RowCountStats = field(default_factory=RowCountStats)

    @property
    def num_rows(self) -> int:
        """Number of rows affected directly plus via cascade (external-store rows are not included)."""
        return self.row_count_stats.num_rows + self.cascade_row_count_stats.num_rows

    @property
    def num_excs(self) -> int:
        """Number of exceptions recorded directly plus via cascade (external-store stats are not included)."""
        return self.row_count_stats.num_excs + self.cascade_row_count_stats.num_excs

    @property
    def num_computed_values(self) -> int:
        """Number of computed values recorded directly plus via cascade."""
        return self.row_count_stats.computed_values + self.cascade_row_count_stats.computed_values

    def insert_to_update(self) -> 'UpdateStatus':
        """
        Convert the update status from an insert operation to an update operation.
        This is used when an insert operation is treated as an update.

        The direct and cascade stats are converted; the external-store stats are carried
        over unchanged. The column lists are copied so that the new instance does not
        share mutable lists with this one.
        """
        return UpdateStatus(
            updated_cols=list(self.updated_cols),
            cols_with_excs=list(self.cols_with_excs),
            row_count_stats=self.row_count_stats.insert_to_update(),
            cascade_row_count_stats=self.cascade_row_count_stats.insert_to_update(),
            ext_row_count_stats=self.ext_row_count_stats,
        )

    def to_cascade(self) -> 'UpdateStatus':
        """
        Convert the update status to a cascade update status.
        This is used when an operation cascades changes to other tables.

        The direct row counts are folded into the cascade row counts and the direct counts
        are reset to zero. The column lists are copied so that the new instance does not
        share mutable lists with this one.
        """
        return UpdateStatus(
            updated_cols=list(self.updated_cols),
            cols_with_excs=list(self.cols_with_excs),
            row_count_stats=RowCountStats(),
            cascade_row_count_stats=self.cascade_row_count_stats + self.row_count_stats,
            ext_row_count_stats=self.ext_row_count_stats,
        )

    def __add__(self, other: 'UpdateStatus') -> 'UpdateStatus':
        """
        Add the update status from two UpdateStatus objects together.

        Column lists are concatenated with duplicates removed (first occurrence wins, order
        preserved); the three row count stats are added pairwise. Returns `NotImplemented`
        for any operand that is not an UpdateStatus, so that Python raises its standard
        TypeError for unsupported operand types.
        """
        if not isinstance(other, UpdateStatus):
            return NotImplemented
        return UpdateStatus(
            # dict.fromkeys() de-duplicates while keeping insertion order
            updated_cols=list(dict.fromkeys(self.updated_cols + other.updated_cols)),
            cols_with_excs=list(dict.fromkeys(self.cols_with_excs + other.cols_with_excs)),
            row_count_stats=self.row_count_stats + other.row_count_stats,
            cascade_row_count_stats=self.cascade_row_count_stats + other.cascade_row_count_stats,
            ext_row_count_stats=self.ext_row_count_stats + other.ext_row_count_stats,
        )

    @property
    def insert_msg(self) -> str:
        """Return a message describing the results of an insert operation."""
        # Only mention columns when there are both exceptions and recorded column names;
        # otherwise the message would read "across 0 columns ()".
        if self.num_excs == 0 or not self.cols_with_excs:
            cols_with_excs_str = ''
        else:
            num_cols = len(self.cols_with_excs)
            cols_with_excs_str = (
                f' across {num_cols} column{"" if num_cols == 1 else "s"}'
                f' ({", ".join(self.cols_with_excs)})'
            )
        return (
            f'Inserted {self.num_rows} row{"" if self.num_rows == 1 else "s"} '
            f'with {self.num_excs} error{"" if self.num_excs == 1 else "s"}{cols_with_excs_str}.'
        )

    @classmethod
    def __cnt_str(cls, cnt: int, item: str) -> str:
        """Format a positive count with its noun, e.g. '1 row' or '3 rows'."""
        assert cnt > 0
        return f'{cnt} {item}{"" if cnt == 1 else "s"}'

    def _repr_pretty_(self, p: 'RepresentationPrinter', cycle: bool) -> None:
        """
        Write a one-line, human-readable summary of this status to the pretty printer `p`.

        Only non-zero counters are mentioned; if every counter is zero the summary is
        'No rows affected.'. `cycle` is accepted as part of the pretty-printer signature
        and is not used.
        """
        messages = []
        # Combine row count stats and cascade row count stats
        stats = self.row_count_stats + self.cascade_row_count_stats
        if stats.ins_rows > 0:
            messages.append(f'{self.__cnt_str(stats.ins_rows, "row")} inserted')
        if stats.del_rows > 0:
            messages.append(f'{self.__cnt_str(stats.del_rows, "row")} deleted')
        if stats.upd_rows > 0:
            messages.append(f'{self.__cnt_str(stats.upd_rows, "row")} updated')
        if stats.computed_values > 0:
            messages.append(f'{self.__cnt_str(stats.computed_values, "value")} computed')
        if stats.num_excs > 0:
            messages.append(self.__cnt_str(stats.num_excs, 'exception'))
        if messages:
            p.text(', '.join(messages) + '.')
        else:
            p.text('No rows affected.')

    @property
    def pxt_rows_updated(self) -> int:
        """
        Returns the number of Pixeltable rows that were updated as a result of the operation.
        """
        return self.row_count_stats.upd_rows + self.cascade_row_count_stats.upd_rows

    @property
    def external_rows_updated(self) -> int:
        """Number of rows updated in the external store."""
        return self.ext_row_count_stats.upd_rows

    @property
    def external_rows_created(self) -> int:
        """Number of rows inserted in the external store."""
        return self.ext_row_count_stats.ins_rows

    @property
    def external_rows_deleted(self) -> int:
        """Number of rows deleted in the external store."""
        return self.ext_row_count_stats.del_rows

    @property
    def ext_num_rows(self) -> int:
        """Total number of rows (inserted + deleted + updated) counted for the external store."""
        return self.ext_row_count_stats.num_rows
@@ -9,7 +9,6 @@ import pixeltable.exceptions as excs
9
9
  import pixeltable.metadata.schema as md_schema
10
10
  import pixeltable.type_system as ts
11
11
  from pixeltable import catalog, exprs, func
12
- from pixeltable.env import Env
13
12
  from pixeltable.iterators import ComponentIterator
14
13
 
15
14
  if TYPE_CHECKING:
@@ -17,11 +16,13 @@ if TYPE_CHECKING:
17
16
 
18
17
 
19
18
  from .column import Column
20
- from .globals import _POS_COLUMN_NAME, MediaValidation, UpdateStatus
19
+ from .globals import _POS_COLUMN_NAME, MediaValidation
21
20
  from .table import Table
22
- from .table_version import TableVersion
21
+ from .table_version import TableVersion, TableVersionMd
23
22
  from .table_version_handle import TableVersionHandle
24
23
  from .table_version_path import TableVersionPath
24
+ from .tbl_ops import CreateStoreTableOp, LoadViewOp, TableOp
25
+ from .update_status import UpdateStatus
25
26
 
26
27
  if TYPE_CHECKING:
27
28
  from pixeltable.globals import TableDataSource
@@ -44,9 +45,18 @@ class View(Table):
44
45
  if not snapshot_only:
45
46
  self._tbl_version = tbl_version_path.tbl_version
46
47
 
47
- @classmethod
48
- def _display_name(cls) -> str:
49
- return 'view'
48
+ def _display_name(self) -> str:
49
+ name: str
50
+ if self._tbl_version_path.is_snapshot():
51
+ name = 'snapshot'
52
+ elif self._tbl_version_path.is_view():
53
+ name = 'view'
54
+ else:
55
+ assert self._tbl_version_path.is_replica()
56
+ name = 'table'
57
+ if self._tbl_version_path.is_replica():
58
+ name = f'{name}-replica'
59
+ return name
50
60
 
51
61
  @classmethod
52
62
  def select_list_to_additional_columns(cls, select_list: list[tuple[exprs.Expr, Optional[str]]]) -> dict[str, dict]:
@@ -79,7 +89,7 @@ class View(Table):
79
89
  media_validation: MediaValidation,
80
90
  iterator_cls: Optional[type[ComponentIterator]],
81
91
  iterator_args: Optional[dict],
82
- ) -> View:
92
+ ) -> tuple[TableVersionMd, Optional[list[TableOp]]]:
83
93
  from pixeltable.plan import SampleClause
84
94
 
85
95
  # Convert select_list to more additional_columns if present
@@ -166,11 +176,10 @@ class View(Table):
166
176
  for col in columns:
167
177
  if col.name in iterator_col_names:
168
178
  raise excs.Error(
169
- f'Duplicate name: column {col.name} is already present in the iterator output schema'
179
+ f'Duplicate name: column {col.name!r} is already present in the iterator output schema'
170
180
  )
171
181
  columns = iterator_cols + columns
172
182
 
173
- session = Env.get().session
174
183
  from pixeltable.exprs import InlineDict
175
184
 
176
185
  iterator_args_expr: exprs.Expr = InlineDict(iterator_args) if iterator_args is not None else None
@@ -199,51 +208,26 @@ class View(Table):
199
208
  iterator_args=iterator_args_expr.as_dict() if iterator_args_expr is not None else None,
200
209
  )
201
210
 
202
- id, tbl_version = TableVersion.create(
203
- dir_id,
204
- name,
205
- columns,
206
- num_retained_versions,
207
- comment,
208
- media_validation=media_validation,
209
- # base_path=base_version_path,
210
- view_md=view_md,
211
+ md = TableVersion.create_initial_md(
212
+ name, columns, num_retained_versions, comment, media_validation=media_validation, view_md=view_md
211
213
  )
212
- if tbl_version is None:
213
- # this is purely a snapshot: we use the base's tbl version path
214
- view = cls(id, dir_id, name, base_version_path, snapshot_only=True)
215
- _logger.info(f'created snapshot {name}')
214
+ if md.tbl_md.is_pure_snapshot:
215
+ # this is purely a snapshot: no store table to create or load
216
+ return md, None
216
217
  else:
217
- view = cls(
218
- id,
219
- dir_id,
220
- name,
221
- TableVersionPath(
222
- TableVersionHandle(tbl_version.id, tbl_version.effective_version), base=base_version_path
223
- ),
224
- snapshot_only=False,
225
- )
226
- _logger.info(f'Created view `{name}`, id={tbl_version.id}')
227
-
228
- from pixeltable.plan import Planner
229
-
230
- try:
231
- plan, _ = Planner.create_view_load_plan(view._tbl_version_path)
232
- _, status = tbl_version.store_tbl.insert_rows(plan, v_min=tbl_version.version)
233
- except:
234
- # we need to remove the orphaned TableVersion instance
235
- del catalog.Catalog.get()._tbl_versions[tbl_version.id, tbl_version.effective_version]
236
- base_tbl_version = base.tbl_version.get()
237
- if tbl_version.effective_version is None and not base_tbl_version.is_snapshot:
238
- # also remove tbl_version from the base
239
- base_tbl_version.mutable_views.remove(TableVersionHandle.create(tbl_version))
240
- raise
241
- Env.get().console_logger.info(
242
- f'Created view `{name}` with {status.num_rows} rows, {status.num_excs} exceptions.'
218
+ tbl_id = md.tbl_md.tbl_id
219
+ view_path = TableVersionPath(
220
+ TableVersionHandle(UUID(tbl_id), effective_version=0 if is_snapshot else None), base=base_version_path
243
221
  )
244
-
245
- session.commit()
246
- return view
222
+ ops = [
223
+ TableOp(
224
+ tbl_id=tbl_id, op_sn=0, num_ops=2, needs_xact=False, create_store_table_op=CreateStoreTableOp()
225
+ ),
226
+ TableOp(
227
+ tbl_id=tbl_id, op_sn=1, num_ops=2, needs_xact=True, load_view_op=LoadViewOp(view_path.as_dict())
228
+ ),
229
+ ]
230
+ return md, ops
247
231
 
248
232
  @classmethod
249
233
  def _verify_column(cls, col: Column) -> None:
@@ -275,6 +259,12 @@ class View(Table):
275
259
  md = super()._get_metadata()
276
260
  md['is_view'] = True
277
261
  md['is_snapshot'] = self._tbl_version_path.is_snapshot()
262
+ base_tbl = self._get_base_table()
263
+ if base_tbl is None:
264
+ md['base'] = None
265
+ else:
266
+ base_version = self._effective_base_versions[0]
267
+ md['base'] = base_tbl._path() if base_version is None else f'{base_tbl._path()}:{base_version}'
278
268
  return md
279
269
 
280
270
  def insert(
@@ -288,16 +278,21 @@ class View(Table):
288
278
  print_stats: bool = False,
289
279
  **kwargs: Any,
290
280
  ) -> UpdateStatus:
291
- raise excs.Error(f'{self._display_name()} {self._name!r}: cannot insert into view')
281
+ raise excs.Error(f'{self._display_str()}: Cannot insert into a {self._display_name()}.')
292
282
 
293
283
  def delete(self, where: Optional[exprs.Expr] = None) -> UpdateStatus:
294
- raise excs.Error(f'{self._display_name()} {self._name!r}: cannot delete from view')
284
+ raise excs.Error(f'{self._display_str()}: Cannot delete from a {self._display_name()}.')
295
285
 
296
286
  def _get_base_table(self) -> Optional['Table']:
287
+ if self._tbl_version_path.base is None and not self._snapshot_only:
288
+ return None # this can happen for a replica of a base table
297
289
  # if this is a pure snapshot, our tbl_version_path only reflects the base (there is no TableVersion instance
298
290
  # for the snapshot itself)
291
+ from pixeltable.catalog import Catalog
292
+
299
293
  base_id = self._tbl_version_path.tbl_id if self._snapshot_only else self._tbl_version_path.base.tbl_id
300
- return catalog.Catalog.get().get_table_by_id(base_id)
294
+ with Catalog.get().begin_xact(tbl_id=base_id, for_write=False):
295
+ return catalog.Catalog.get().get_table_by_id(base_id)
301
296
 
302
297
  @property
303
298
  def _effective_base_versions(self) -> list[Optional[int]]:
@@ -308,8 +303,7 @@ class View(Table):
308
303
  return effective_versions[1:]
309
304
 
310
305
  def _table_descriptor(self) -> str:
311
- display_name = 'Snapshot' if self._snapshot_only else 'View'
312
- result = [f'{display_name} {self._path()!r}']
306
+ result = [self._display_str()]
313
307
  bases_descrs: list[str] = []
314
308
  for base, effective_version in zip(self._get_base_tables(), self._effective_base_versions):
315
309
  if effective_version is None:
pixeltable/config.py CHANGED
@@ -25,19 +25,26 @@ class Config:
25
25
 
26
26
  __home: Path
27
27
  __config_file: Path
28
+ __config_overrides: dict[str, Any]
28
29
  __config_dict: dict[str, Any]
29
30
 
30
- def __init__(self) -> None:
31
+ def __init__(self, config_overrides: dict[str, Any]) -> None:
31
32
  assert self.__instance is None, 'Config is a singleton; use Config.get() to access the instance'
32
33
 
33
- self.__home = Path(os.environ.get('PIXELTABLE_HOME', str(Path.home() / '.pixeltable')))
34
+ for var in config_overrides:
35
+ if var not in KNOWN_CONFIG_OVERRIDES:
36
+ raise excs.Error(f'Unrecognized configuration variable: {var}')
37
+
38
+ self.__config_overrides = config_overrides
39
+
40
+ self.__home = Path(self.lookup_env('pixeltable', 'home', str(Path.home() / '.pixeltable')))
34
41
  if self.__home.exists() and not self.__home.is_dir():
35
- raise RuntimeError(f'{self.__home} is not a directory')
42
+ raise excs.Error(f'Not a directory: {self.__home}')
36
43
  if not self.__home.exists():
37
44
  print(f'Creating a Pixeltable instance at: {self.__home}')
38
45
  self.__home.mkdir()
39
46
 
40
- self.__config_file = Path(os.environ.get('PIXELTABLE_CONFIG', str(self.__home / 'config.toml')))
47
+ self.__config_file = Path(self.lookup_env('pixeltable', 'config', str(self.__home / 'config.toml')))
41
48
 
42
49
  self.__config_dict: dict[str, Any]
43
50
  if os.path.isfile(self.__config_file):
@@ -46,6 +53,12 @@ class Config:
46
53
  self.__config_dict = toml.load(stream)
47
54
  except Exception as exc:
48
55
  raise excs.Error(f'Could not read config file: {self.__config_file}') from exc
56
+ for section, section_dict in self.__config_dict.items():
57
+ if section not in KNOWN_CONFIG_OPTIONS:
58
+ raise excs.Error(f'Unrecognized section {section!r} in config file: {self.__config_file}')
59
+ for key in section_dict:
60
+ if key not in KNOWN_CONFIG_OPTIONS[section]:
61
+ raise excs.Error(f"Unrecognized option '{section}.{key}' in config file: {self.__config_file}")
49
62
  else:
50
63
  self.__config_dict = self.__create_default_config(self.__config_file)
51
64
  with open(self.__config_file, 'w', encoding='utf-8') as stream:
@@ -65,10 +78,18 @@ class Config:
65
78
 
66
79
  @classmethod
67
80
  def get(cls) -> Config:
68
- if cls.__instance is None:
69
- cls.__instance = cls()
81
+ cls.init({})
70
82
  return cls.__instance
71
83
 
84
+ @classmethod
85
+ def init(cls, config_overrides: dict[str, Any]) -> None:
86
+ if cls.__instance is None:
87
+ cls.__instance = cls(config_overrides)
88
+ elif len(config_overrides) > 0:
89
+ raise excs.Error(
90
+ 'Pixeltable has already been initialized; cannot specify new config values in the same session'
91
+ )
92
+
72
93
  @classmethod
73
94
  def __create_default_config(cls, config_path: Path) -> dict[str, Any]:
74
95
  free_disk_space_bytes = shutil.disk_usage(config_path.parent).free
@@ -76,14 +97,23 @@ class Config:
76
97
  file_cache_size_g = free_disk_space_bytes / 5 / (1 << 30)
77
98
  return {'pixeltable': {'file_cache_size_g': round(file_cache_size_g, 1), 'hide_warnings': False}}
78
99
 
79
- def get_value(self, key: str, expected_type: type[T], section: str = 'pixeltable') -> Optional[T]:
100
+ def lookup_env(self, section: str, key: str, default: Any = None) -> Any:
101
+ override_var = f'{section}.{key}'
80
102
  env_var = f'{section.upper()}_{key.upper()}'
103
+ if override_var in self.__config_overrides:
104
+ return self.__config_overrides[override_var]
81
105
  if env_var in os.environ:
82
- value = os.environ[env_var]
83
- elif section in self.__config_dict and key in self.__config_dict[section]:
106
+ return os.environ[env_var]
107
+ return default
108
+
109
+ def get_value(self, key: str, expected_type: type[T], section: str = 'pixeltable') -> Optional[T]:
110
+ value = self.lookup_env(section, key) # Try to get from environment first
111
+ # Next try the config file
112
+ if value is None and section in self.__config_dict and key in self.__config_dict[section]:
84
113
  value = self.__config_dict[section][key]
85
- else:
86
- return None
114
+
115
+ if value is None:
116
+ return None # Not specified
87
117
 
88
118
  try:
89
119
  if expected_type is bool and isinstance(value, str):
@@ -91,7 +121,7 @@ class Config:
91
121
  raise excs.Error(f'Invalid value for configuration parameter {section}.{key}: {value}')
92
122
  return value.lower() == 'true' # type: ignore[return-value]
93
123
  return expected_type(value) # type: ignore[call-arg]
94
- except ValueError as exc:
124
+ except (ValueError, TypeError) as exc:
95
125
  raise excs.Error(f'Invalid value for configuration parameter {section}.{key}: {value}') from exc
96
126
 
97
127
  def get_string_value(self, key: str, section: str = 'pixeltable') -> Optional[str]:
@@ -105,3 +135,37 @@ class Config:
105
135
 
106
136
  def get_bool_value(self, key: str, section: str = 'pixeltable') -> Optional[bool]:
107
137
  return self.get_value(key, bool, section)
138
+
139
+
140
+ KNOWN_CONFIG_OPTIONS = {
141
+ 'pixeltable': {
142
+ 'home': 'Path to the Pixeltable home directory',
143
+ 'config': 'Path to the Pixeltable config file',
144
+ 'pgdata': 'Path to the Pixeltable postgres data directory',
145
+ 'db': 'Postgres database name',
146
+ 'file_cache_size_g': 'Size of the file cache in GB',
147
+ 'time_zone': 'Default time zone for timestamps',
148
+ 'hide_warnings': 'Hide warnings from the console',
149
+ 'verbosity': 'Verbosity level for console output',
150
+ 'api_key': 'API key for Pixeltable cloud',
151
+ },
152
+ 'anthropic': {'api_key': 'Anthropic API key'},
153
+ 'bedrock': {'api_key': 'AWS Bedrock API key'},
154
+ 'deepseek': {'api_key': 'Deepseek API key'},
155
+ 'fireworks': {'api_key': 'Fireworks API key'},
156
+ 'gemini': {'api_key': 'Gemini API key'},
157
+ 'groq': {'api_key': 'Groq API key'},
158
+ 'label_studio': {'api_key': 'Label Studio API key', 'url': 'Label Studio server URL'},
159
+ 'mistral': {'api_key': 'Mistral API key'},
160
+ 'openai': {'api_key': 'OpenAI API key'},
161
+ 'replicate': {'api_token': 'Replicate API token'},
162
+ 'together': {'api_key': 'Together API key'},
163
+ 'pypi': {'api_key': 'PyPI API key (for internal use only)'},
164
+ }
165
+
166
+
167
+ KNOWN_CONFIG_OVERRIDES = {
168
+ f'{section}.{key}': info
169
+ for section, section_dict in KNOWN_CONFIG_OPTIONS.items()
170
+ for key, info in section_dict.items()
171
+ }
pixeltable/dataframe.py CHANGED
@@ -15,7 +15,7 @@ import sqlalchemy as sql
15
15
 
16
16
  from pixeltable import catalog, exceptions as excs, exec, exprs, plan, type_system as ts
17
17
  from pixeltable.catalog import Catalog, is_valid_identifier
18
- from pixeltable.catalog.globals import UpdateStatus
18
+ from pixeltable.catalog.update_status import UpdateStatus
19
19
  from pixeltable.env import Env
20
20
  from pixeltable.plan import Planner, SampleClause
21
21
  from pixeltable.type_system import ColumnType
@@ -1185,7 +1185,7 @@ class DataFrame:
1185
1185
  """
1186
1186
  self._validate_mutable('delete', False)
1187
1187
  if not self._first_tbl.is_insertable():
1188
- raise excs.Error('Cannot delete from view')
1188
+ raise excs.Error('Cannot use `delete` on a view.')
1189
1189
  with Catalog.get().begin_xact(tbl=self._first_tbl, for_write=True, lock_mutable_tree=True):
1190
1190
  return self._first_tbl.tbl_version.get().delete(where=self.where_clause)
1191
1191
 
@@ -1196,14 +1196,27 @@ class DataFrame:
1196
1196
  op_name: The name of the operation for which the test is being performed.
1197
1197
  allow_select: If True, allow a select() specification in the Dataframe.
1198
1198
  """
1199
+ self._validate_mutable_op_sequence(op_name, allow_select)
1200
+
1201
+ # TODO: Reconcile these with Table.__check_mutable()
1202
+ assert len(self._from_clause.tbls) == 1
1203
+ if self._first_tbl.is_snapshot():
1204
+ raise excs.Error(f'Cannot use `{op_name}` on a snapshot.')
1205
+ if self._first_tbl.is_replica():
1206
+ raise excs.Error(f'Cannot use `{op_name}` on a replica.')
1207
+
1208
+ def _validate_mutable_op_sequence(self, op_name: str, allow_select: bool) -> None:
1209
+ """Tests whether the sequence of operations on this DataFrame is valid for a mutation operation."""
1199
1210
  if self.group_by_clause is not None or self.grouping_tbl is not None:
1200
- raise excs.Error(f'Cannot use `{op_name}` after `group_by`')
1211
+ raise excs.Error(f'Cannot use `{op_name}` after `group_by`.')
1201
1212
  if self.order_by_clause is not None:
1202
- raise excs.Error(f'Cannot use `{op_name}` after `order_by`')
1213
+ raise excs.Error(f'Cannot use `{op_name}` after `order_by`.')
1203
1214
  if self.select_list is not None and not allow_select:
1204
- raise excs.Error(f'Cannot use `{op_name}` after `select`')
1215
+ raise excs.Error(f'Cannot use `{op_name}` after `select`.')
1205
1216
  if self.limit_val is not None:
1206
- raise excs.Error(f'Cannot use `{op_name}` after `limit`')
1217
+ raise excs.Error(f'Cannot use `{op_name}` after `limit`.')
1218
+ if self._has_joins():
1219
+ raise excs.Error(f'Cannot use `{op_name}` after `join`.')
1207
1220
 
1208
1221
  def as_dict(self) -> dict[str, Any]:
1209
1222
  """