pixeltable 0.4.2__py3-none-any.whl → 0.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +1 -0
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +3 -11
- pixeltable/catalog/catalog.py +575 -220
- pixeltable/catalog/column.py +22 -23
- pixeltable/catalog/dir.py +1 -2
- pixeltable/catalog/globals.py +2 -148
- pixeltable/catalog/insertable_table.py +15 -13
- pixeltable/catalog/path.py +6 -0
- pixeltable/catalog/schema_object.py +9 -4
- pixeltable/catalog/table.py +96 -85
- pixeltable/catalog/table_version.py +257 -174
- pixeltable/catalog/table_version_path.py +1 -1
- pixeltable/catalog/tbl_ops.py +44 -0
- pixeltable/catalog/update_status.py +179 -0
- pixeltable/catalog/view.py +50 -56
- pixeltable/config.py +76 -12
- pixeltable/dataframe.py +19 -6
- pixeltable/env.py +50 -4
- pixeltable/exec/data_row_batch.py +3 -1
- pixeltable/exec/exec_node.py +7 -24
- pixeltable/exec/expr_eval/schedulers.py +134 -7
- pixeltable/exec/in_memory_data_node.py +6 -7
- pixeltable/exprs/column_property_ref.py +21 -9
- pixeltable/exprs/column_ref.py +7 -2
- pixeltable/exprs/function_call.py +2 -2
- pixeltable/exprs/row_builder.py +10 -9
- pixeltable/exprs/rowid_ref.py +0 -4
- pixeltable/func/function.py +3 -3
- pixeltable/functions/audio.py +36 -9
- pixeltable/functions/gemini.py +4 -4
- pixeltable/functions/openai.py +1 -2
- pixeltable/functions/video.py +59 -16
- pixeltable/globals.py +109 -24
- pixeltable/io/__init__.py +1 -1
- pixeltable/io/datarows.py +2 -1
- pixeltable/io/external_store.py +3 -55
- pixeltable/io/globals.py +4 -4
- pixeltable/io/hf_datasets.py +10 -2
- pixeltable/io/label_studio.py +16 -16
- pixeltable/io/pandas.py +1 -0
- pixeltable/io/table_data_conduit.py +12 -13
- pixeltable/iterators/audio.py +17 -8
- pixeltable/iterators/image.py +5 -2
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_39.py +125 -0
- pixeltable/metadata/converters/util.py +3 -0
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +50 -1
- pixeltable/plan.py +4 -0
- pixeltable/share/packager.py +20 -38
- pixeltable/store.py +40 -51
- pixeltable/type_system.py +2 -2
- pixeltable/utils/coroutine.py +6 -23
- pixeltable/utils/media_store.py +50 -0
- {pixeltable-0.4.2.dist-info → pixeltable-0.4.4.dist-info}/METADATA +1 -1
- {pixeltable-0.4.2.dist-info → pixeltable-0.4.4.dist-info}/RECORD +60 -57
- {pixeltable-0.4.2.dist-info → pixeltable-0.4.4.dist-info}/LICENSE +0 -0
- {pixeltable-0.4.2.dist-info → pixeltable-0.4.4.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.2.dist-info → pixeltable-0.4.4.dist-info}/entry_points.txt +0 -0
|
@@ -76,7 +76,7 @@ class TableVersionPath:
|
|
|
76
76
|
elif self._cached_tbl_version is not None:
|
|
77
77
|
return
|
|
78
78
|
|
|
79
|
-
with Catalog.get().begin_xact(for_write=False):
|
|
79
|
+
with Catalog.get().begin_xact(tbl_id=self.tbl_version.id, for_write=False):
|
|
80
80
|
self._cached_tbl_version = self.tbl_version.get()
|
|
81
81
|
|
|
82
82
|
def clear_cached_md(self) -> None:
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# This file contains all dataclasses related to schema.PendingTableOp:
|
|
2
|
+
# - TableOp: the container for each log entry
|
|
3
|
+
# - <>Op: the actual operation, which is performed by TableVersion.exec_op(); each <>Op class contains
|
|
4
|
+
# enough information for exec_op() to perform the operation without having to reference data outside of
|
|
5
|
+
# TableVersion
|
|
6
|
+
|
|
7
|
+
import dataclasses
|
|
8
|
+
from typing import Any, Optional
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclasses.dataclass
|
|
12
|
+
class CreateStoreTableOp:
|
|
13
|
+
pass
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclasses.dataclass
|
|
17
|
+
class LoadViewOp:
|
|
18
|
+
view_path: dict[str, Any] # needed to create the view load plan
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclasses.dataclass
|
|
22
|
+
class DeleteTableMdOp:
|
|
23
|
+
pass
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclasses.dataclass
|
|
27
|
+
class DeleteTableMediaFilesOp:
|
|
28
|
+
pass
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclasses.dataclass
|
|
32
|
+
class DropStoreTableOp:
|
|
33
|
+
pass
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclasses.dataclass
|
|
37
|
+
class TableOp:
|
|
38
|
+
tbl_id: str # uuid.UUID
|
|
39
|
+
op_sn: int # sequence number within the update operation; [0, num_ops)
|
|
40
|
+
num_ops: int # total number of ops forming the update operation
|
|
41
|
+
needs_xact: bool # if True, op must be run as part of a transaction
|
|
42
|
+
|
|
43
|
+
create_store_table_op: Optional[CreateStoreTableOp] = None
|
|
44
|
+
load_view_op: Optional[LoadViewOp] = None
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from typing import TYPE_CHECKING
|
|
5
|
+
|
|
6
|
+
if TYPE_CHECKING:
|
|
7
|
+
from IPython.lib.pretty import RepresentationPrinter
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass(frozen=True)
|
|
11
|
+
class RowCountStats:
|
|
12
|
+
"""
|
|
13
|
+
Statistics about the counts of rows affected by a table operation.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
ins_rows: int = 0 # rows inserted
|
|
17
|
+
del_rows: int = 0 # rows deleted
|
|
18
|
+
upd_rows: int = 0 # rows updated
|
|
19
|
+
num_excs: int = 0 # total number of exceptions
|
|
20
|
+
# TODO: disambiguate what this means: # of slots computed or # of columns computed?
|
|
21
|
+
computed_values: int = 0 # number of computed values (e.g., computed columns) affected by the operation
|
|
22
|
+
|
|
23
|
+
@property
|
|
24
|
+
def num_rows(self) -> int:
|
|
25
|
+
return self.ins_rows + self.del_rows + self.upd_rows
|
|
26
|
+
|
|
27
|
+
def insert_to_update(self) -> 'RowCountStats':
|
|
28
|
+
"""
|
|
29
|
+
Convert insert row count stats to update row count stats.
|
|
30
|
+
This is used when an insert operation is treated as an update.
|
|
31
|
+
"""
|
|
32
|
+
return RowCountStats(
|
|
33
|
+
ins_rows=0,
|
|
34
|
+
del_rows=self.del_rows,
|
|
35
|
+
upd_rows=self.upd_rows + self.ins_rows,
|
|
36
|
+
num_excs=self.num_excs,
|
|
37
|
+
computed_values=self.computed_values,
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
def __add__(self, other: 'RowCountStats') -> 'RowCountStats':
|
|
41
|
+
"""
|
|
42
|
+
Add the stats from two RowCountStats objects together.
|
|
43
|
+
"""
|
|
44
|
+
return RowCountStats(
|
|
45
|
+
ins_rows=self.ins_rows + other.ins_rows,
|
|
46
|
+
del_rows=self.del_rows + other.del_rows,
|
|
47
|
+
upd_rows=self.upd_rows + other.upd_rows,
|
|
48
|
+
num_excs=self.num_excs + other.num_excs,
|
|
49
|
+
computed_values=self.computed_values + other.computed_values,
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@dataclass(frozen=True)
|
|
54
|
+
class UpdateStatus:
|
|
55
|
+
"""
|
|
56
|
+
Information about changes to table data or table schema
|
|
57
|
+
"""
|
|
58
|
+
|
|
59
|
+
updated_cols: list[str] = field(default_factory=list)
|
|
60
|
+
cols_with_excs: list[str] = field(default_factory=list)
|
|
61
|
+
|
|
62
|
+
# stats for the rows affected by the operation
|
|
63
|
+
row_count_stats: RowCountStats = field(default_factory=RowCountStats)
|
|
64
|
+
|
|
65
|
+
# stats for changes cascaded to other tables
|
|
66
|
+
cascade_row_count_stats: RowCountStats = field(default_factory=RowCountStats)
|
|
67
|
+
|
|
68
|
+
# stats for the rows affected by the operation in an external store
|
|
69
|
+
ext_row_count_stats: RowCountStats = field(default_factory=RowCountStats)
|
|
70
|
+
|
|
71
|
+
@property
|
|
72
|
+
def num_rows(self) -> int:
|
|
73
|
+
return self.row_count_stats.num_rows + self.cascade_row_count_stats.num_rows
|
|
74
|
+
|
|
75
|
+
@property
|
|
76
|
+
def num_excs(self) -> int:
|
|
77
|
+
return self.row_count_stats.num_excs + self.cascade_row_count_stats.num_excs
|
|
78
|
+
|
|
79
|
+
@property
|
|
80
|
+
def num_computed_values(self) -> int:
|
|
81
|
+
return self.row_count_stats.computed_values + self.cascade_row_count_stats.computed_values
|
|
82
|
+
|
|
83
|
+
def insert_to_update(self) -> 'UpdateStatus':
|
|
84
|
+
"""
|
|
85
|
+
Convert the update status from an insert operation to an update operation.
|
|
86
|
+
This is used when an insert operation is treated as an update.
|
|
87
|
+
"""
|
|
88
|
+
return UpdateStatus(
|
|
89
|
+
updated_cols=self.updated_cols,
|
|
90
|
+
cols_with_excs=self.cols_with_excs,
|
|
91
|
+
row_count_stats=self.row_count_stats.insert_to_update(),
|
|
92
|
+
cascade_row_count_stats=self.cascade_row_count_stats.insert_to_update(),
|
|
93
|
+
ext_row_count_stats=self.ext_row_count_stats,
|
|
94
|
+
)
|
|
95
|
+
|
|
96
|
+
def to_cascade(self) -> 'UpdateStatus':
|
|
97
|
+
"""
|
|
98
|
+
Convert the update status to a cascade update status.
|
|
99
|
+
This is used when an operation cascades changes to other tables.
|
|
100
|
+
"""
|
|
101
|
+
return UpdateStatus(
|
|
102
|
+
updated_cols=self.updated_cols,
|
|
103
|
+
cols_with_excs=self.cols_with_excs,
|
|
104
|
+
row_count_stats=RowCountStats(),
|
|
105
|
+
cascade_row_count_stats=self.cascade_row_count_stats + self.row_count_stats,
|
|
106
|
+
ext_row_count_stats=self.ext_row_count_stats,
|
|
107
|
+
)
|
|
108
|
+
|
|
109
|
+
def __add__(self, other: 'UpdateStatus') -> UpdateStatus:
|
|
110
|
+
"""
|
|
111
|
+
Add the update status from two UpdateStatus objects together.
|
|
112
|
+
"""
|
|
113
|
+
return UpdateStatus(
|
|
114
|
+
updated_cols=list(dict.fromkeys(self.updated_cols + other.updated_cols)),
|
|
115
|
+
cols_with_excs=list(dict.fromkeys(self.cols_with_excs + other.cols_with_excs)),
|
|
116
|
+
row_count_stats=self.row_count_stats + other.row_count_stats,
|
|
117
|
+
cascade_row_count_stats=self.cascade_row_count_stats + other.cascade_row_count_stats,
|
|
118
|
+
ext_row_count_stats=self.ext_row_count_stats + other.ext_row_count_stats,
|
|
119
|
+
)
|
|
120
|
+
|
|
121
|
+
@property
|
|
122
|
+
def insert_msg(self) -> str:
|
|
123
|
+
"""Return a message describing the results of an insert operation."""
|
|
124
|
+
if self.num_excs == 0:
|
|
125
|
+
cols_with_excs_str = ''
|
|
126
|
+
else:
|
|
127
|
+
cols_with_excs_str = (
|
|
128
|
+
f' across {len(self.cols_with_excs)} column{"" if len(self.cols_with_excs) == 1 else "s"}'
|
|
129
|
+
)
|
|
130
|
+
cols_with_excs_str += f' ({", ".join(self.cols_with_excs)})'
|
|
131
|
+
msg = (
|
|
132
|
+
f'Inserted {self.num_rows} row{"" if self.num_rows == 1 else "s"} '
|
|
133
|
+
f'with {self.num_excs} error{"" if self.num_excs == 1 else "s"}{cols_with_excs_str}.'
|
|
134
|
+
)
|
|
135
|
+
return msg
|
|
136
|
+
|
|
137
|
+
@classmethod
|
|
138
|
+
def __cnt_str(cls, cnt: int, item: str) -> str:
|
|
139
|
+
assert cnt > 0
|
|
140
|
+
return f'{cnt} {item}{"" if cnt == 1 else "s"}'
|
|
141
|
+
|
|
142
|
+
def _repr_pretty_(self, p: 'RepresentationPrinter', cycle: bool) -> None:
|
|
143
|
+
messages = []
|
|
144
|
+
# Combine row count stats and cascade row count stats
|
|
145
|
+
stats = self.row_count_stats + self.cascade_row_count_stats
|
|
146
|
+
if stats.ins_rows > 0:
|
|
147
|
+
messages.append(f'{self.__cnt_str(stats.ins_rows, "row")} inserted')
|
|
148
|
+
if stats.del_rows > 0:
|
|
149
|
+
messages.append(f'{self.__cnt_str(stats.del_rows, "row")} deleted')
|
|
150
|
+
if stats.upd_rows > 0:
|
|
151
|
+
messages.append(f'{self.__cnt_str(stats.upd_rows, "row")} updated')
|
|
152
|
+
if stats.computed_values > 0:
|
|
153
|
+
messages.append(f'{self.__cnt_str(stats.computed_values, "value")} computed')
|
|
154
|
+
if stats.num_excs > 0:
|
|
155
|
+
messages.append(self.__cnt_str(stats.num_excs, 'exception'))
|
|
156
|
+
p.text(', '.join(messages) + '.' if len(messages) > 0 else 'No rows affected.')
|
|
157
|
+
|
|
158
|
+
@property
|
|
159
|
+
def pxt_rows_updated(self) -> int:
|
|
160
|
+
"""
|
|
161
|
+
Returns the number of Pixeltable rows that were updated as a result of the operation.
|
|
162
|
+
"""
|
|
163
|
+
return (self.row_count_stats + self.cascade_row_count_stats).upd_rows
|
|
164
|
+
|
|
165
|
+
@property
|
|
166
|
+
def external_rows_updated(self) -> int:
|
|
167
|
+
return self.ext_row_count_stats.upd_rows
|
|
168
|
+
|
|
169
|
+
@property
|
|
170
|
+
def external_rows_created(self) -> int:
|
|
171
|
+
return self.ext_row_count_stats.ins_rows
|
|
172
|
+
|
|
173
|
+
@property
|
|
174
|
+
def external_rows_deleted(self) -> int:
|
|
175
|
+
return self.ext_row_count_stats.del_rows
|
|
176
|
+
|
|
177
|
+
@property
|
|
178
|
+
def ext_num_rows(self) -> int:
|
|
179
|
+
return self.ext_row_count_stats.num_rows
|
pixeltable/catalog/view.py
CHANGED
|
@@ -9,7 +9,6 @@ import pixeltable.exceptions as excs
|
|
|
9
9
|
import pixeltable.metadata.schema as md_schema
|
|
10
10
|
import pixeltable.type_system as ts
|
|
11
11
|
from pixeltable import catalog, exprs, func
|
|
12
|
-
from pixeltable.env import Env
|
|
13
12
|
from pixeltable.iterators import ComponentIterator
|
|
14
13
|
|
|
15
14
|
if TYPE_CHECKING:
|
|
@@ -17,11 +16,13 @@ if TYPE_CHECKING:
|
|
|
17
16
|
|
|
18
17
|
|
|
19
18
|
from .column import Column
|
|
20
|
-
from .globals import _POS_COLUMN_NAME, MediaValidation
|
|
19
|
+
from .globals import _POS_COLUMN_NAME, MediaValidation
|
|
21
20
|
from .table import Table
|
|
22
|
-
from .table_version import TableVersion
|
|
21
|
+
from .table_version import TableVersion, TableVersionMd
|
|
23
22
|
from .table_version_handle import TableVersionHandle
|
|
24
23
|
from .table_version_path import TableVersionPath
|
|
24
|
+
from .tbl_ops import CreateStoreTableOp, LoadViewOp, TableOp
|
|
25
|
+
from .update_status import UpdateStatus
|
|
25
26
|
|
|
26
27
|
if TYPE_CHECKING:
|
|
27
28
|
from pixeltable.globals import TableDataSource
|
|
@@ -44,9 +45,18 @@ class View(Table):
|
|
|
44
45
|
if not snapshot_only:
|
|
45
46
|
self._tbl_version = tbl_version_path.tbl_version
|
|
46
47
|
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
48
|
+
def _display_name(self) -> str:
|
|
49
|
+
name: str
|
|
50
|
+
if self._tbl_version_path.is_snapshot():
|
|
51
|
+
name = 'snapshot'
|
|
52
|
+
elif self._tbl_version_path.is_view():
|
|
53
|
+
name = 'view'
|
|
54
|
+
else:
|
|
55
|
+
assert self._tbl_version_path.is_replica()
|
|
56
|
+
name = 'table'
|
|
57
|
+
if self._tbl_version_path.is_replica():
|
|
58
|
+
name = f'{name}-replica'
|
|
59
|
+
return name
|
|
50
60
|
|
|
51
61
|
@classmethod
|
|
52
62
|
def select_list_to_additional_columns(cls, select_list: list[tuple[exprs.Expr, Optional[str]]]) -> dict[str, dict]:
|
|
@@ -79,7 +89,7 @@ class View(Table):
|
|
|
79
89
|
media_validation: MediaValidation,
|
|
80
90
|
iterator_cls: Optional[type[ComponentIterator]],
|
|
81
91
|
iterator_args: Optional[dict],
|
|
82
|
-
) ->
|
|
92
|
+
) -> tuple[TableVersionMd, Optional[list[TableOp]]]:
|
|
83
93
|
from pixeltable.plan import SampleClause
|
|
84
94
|
|
|
85
95
|
# Convert select_list to more additional_columns if present
|
|
@@ -166,11 +176,10 @@ class View(Table):
|
|
|
166
176
|
for col in columns:
|
|
167
177
|
if col.name in iterator_col_names:
|
|
168
178
|
raise excs.Error(
|
|
169
|
-
f'Duplicate name: column {col.name} is already present in the iterator output schema'
|
|
179
|
+
f'Duplicate name: column {col.name!r} is already present in the iterator output schema'
|
|
170
180
|
)
|
|
171
181
|
columns = iterator_cols + columns
|
|
172
182
|
|
|
173
|
-
session = Env.get().session
|
|
174
183
|
from pixeltable.exprs import InlineDict
|
|
175
184
|
|
|
176
185
|
iterator_args_expr: exprs.Expr = InlineDict(iterator_args) if iterator_args is not None else None
|
|
@@ -199,51 +208,26 @@ class View(Table):
|
|
|
199
208
|
iterator_args=iterator_args_expr.as_dict() if iterator_args_expr is not None else None,
|
|
200
209
|
)
|
|
201
210
|
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
name,
|
|
205
|
-
columns,
|
|
206
|
-
num_retained_versions,
|
|
207
|
-
comment,
|
|
208
|
-
media_validation=media_validation,
|
|
209
|
-
# base_path=base_version_path,
|
|
210
|
-
view_md=view_md,
|
|
211
|
+
md = TableVersion.create_initial_md(
|
|
212
|
+
name, columns, num_retained_versions, comment, media_validation=media_validation, view_md=view_md
|
|
211
213
|
)
|
|
212
|
-
if
|
|
213
|
-
# this is purely a snapshot:
|
|
214
|
-
|
|
215
|
-
_logger.info(f'created snapshot {name}')
|
|
214
|
+
if md.tbl_md.is_pure_snapshot:
|
|
215
|
+
# this is purely a snapshot: no store table to create or load
|
|
216
|
+
return md, None
|
|
216
217
|
else:
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
name,
|
|
221
|
-
TableVersionPath(
|
|
222
|
-
TableVersionHandle(tbl_version.id, tbl_version.effective_version), base=base_version_path
|
|
223
|
-
),
|
|
224
|
-
snapshot_only=False,
|
|
225
|
-
)
|
|
226
|
-
_logger.info(f'Created view `{name}`, id={tbl_version.id}')
|
|
227
|
-
|
|
228
|
-
from pixeltable.plan import Planner
|
|
229
|
-
|
|
230
|
-
try:
|
|
231
|
-
plan, _ = Planner.create_view_load_plan(view._tbl_version_path)
|
|
232
|
-
_, status = tbl_version.store_tbl.insert_rows(plan, v_min=tbl_version.version)
|
|
233
|
-
except:
|
|
234
|
-
# we need to remove the orphaned TableVersion instance
|
|
235
|
-
del catalog.Catalog.get()._tbl_versions[tbl_version.id, tbl_version.effective_version]
|
|
236
|
-
base_tbl_version = base.tbl_version.get()
|
|
237
|
-
if tbl_version.effective_version is None and not base_tbl_version.is_snapshot:
|
|
238
|
-
# also remove tbl_version from the base
|
|
239
|
-
base_tbl_version.mutable_views.remove(TableVersionHandle.create(tbl_version))
|
|
240
|
-
raise
|
|
241
|
-
Env.get().console_logger.info(
|
|
242
|
-
f'Created view `{name}` with {status.num_rows} rows, {status.num_excs} exceptions.'
|
|
218
|
+
tbl_id = md.tbl_md.tbl_id
|
|
219
|
+
view_path = TableVersionPath(
|
|
220
|
+
TableVersionHandle(UUID(tbl_id), effective_version=0 if is_snapshot else None), base=base_version_path
|
|
243
221
|
)
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
222
|
+
ops = [
|
|
223
|
+
TableOp(
|
|
224
|
+
tbl_id=tbl_id, op_sn=0, num_ops=2, needs_xact=False, create_store_table_op=CreateStoreTableOp()
|
|
225
|
+
),
|
|
226
|
+
TableOp(
|
|
227
|
+
tbl_id=tbl_id, op_sn=1, num_ops=2, needs_xact=True, load_view_op=LoadViewOp(view_path.as_dict())
|
|
228
|
+
),
|
|
229
|
+
]
|
|
230
|
+
return md, ops
|
|
247
231
|
|
|
248
232
|
@classmethod
|
|
249
233
|
def _verify_column(cls, col: Column) -> None:
|
|
@@ -275,6 +259,12 @@ class View(Table):
|
|
|
275
259
|
md = super()._get_metadata()
|
|
276
260
|
md['is_view'] = True
|
|
277
261
|
md['is_snapshot'] = self._tbl_version_path.is_snapshot()
|
|
262
|
+
base_tbl = self._get_base_table()
|
|
263
|
+
if base_tbl is None:
|
|
264
|
+
md['base'] = None
|
|
265
|
+
else:
|
|
266
|
+
base_version = self._effective_base_versions[0]
|
|
267
|
+
md['base'] = base_tbl._path() if base_version is None else f'{base_tbl._path()}:{base_version}'
|
|
278
268
|
return md
|
|
279
269
|
|
|
280
270
|
def insert(
|
|
@@ -288,16 +278,21 @@ class View(Table):
|
|
|
288
278
|
print_stats: bool = False,
|
|
289
279
|
**kwargs: Any,
|
|
290
280
|
) -> UpdateStatus:
|
|
291
|
-
raise excs.Error(f'{self.
|
|
281
|
+
raise excs.Error(f'{self._display_str()}: Cannot insert into a {self._display_name()}.')
|
|
292
282
|
|
|
293
283
|
def delete(self, where: Optional[exprs.Expr] = None) -> UpdateStatus:
|
|
294
|
-
raise excs.Error(f'{self.
|
|
284
|
+
raise excs.Error(f'{self._display_str()}: Cannot delete from a {self._display_name()}.')
|
|
295
285
|
|
|
296
286
|
def _get_base_table(self) -> Optional['Table']:
|
|
287
|
+
if self._tbl_version_path.base is None and not self._snapshot_only:
|
|
288
|
+
return None # this can happen for a replica of a base table
|
|
297
289
|
# if this is a pure snapshot, our tbl_version_path only reflects the base (there is no TableVersion instance
|
|
298
290
|
# for the snapshot itself)
|
|
291
|
+
from pixeltable.catalog import Catalog
|
|
292
|
+
|
|
299
293
|
base_id = self._tbl_version_path.tbl_id if self._snapshot_only else self._tbl_version_path.base.tbl_id
|
|
300
|
-
|
|
294
|
+
with Catalog.get().begin_xact(tbl_id=base_id, for_write=False):
|
|
295
|
+
return catalog.Catalog.get().get_table_by_id(base_id)
|
|
301
296
|
|
|
302
297
|
@property
|
|
303
298
|
def _effective_base_versions(self) -> list[Optional[int]]:
|
|
@@ -308,8 +303,7 @@ class View(Table):
|
|
|
308
303
|
return effective_versions[1:]
|
|
309
304
|
|
|
310
305
|
def _table_descriptor(self) -> str:
|
|
311
|
-
|
|
312
|
-
result = [f'{display_name} {self._path()!r}']
|
|
306
|
+
result = [self._display_str()]
|
|
313
307
|
bases_descrs: list[str] = []
|
|
314
308
|
for base, effective_version in zip(self._get_base_tables(), self._effective_base_versions):
|
|
315
309
|
if effective_version is None:
|
pixeltable/config.py
CHANGED
|
@@ -25,19 +25,26 @@ class Config:
|
|
|
25
25
|
|
|
26
26
|
__home: Path
|
|
27
27
|
__config_file: Path
|
|
28
|
+
__config_overrides: dict[str, Any]
|
|
28
29
|
__config_dict: dict[str, Any]
|
|
29
30
|
|
|
30
|
-
def __init__(self) -> None:
|
|
31
|
+
def __init__(self, config_overrides: dict[str, Any]) -> None:
|
|
31
32
|
assert self.__instance is None, 'Config is a singleton; use Config.get() to access the instance'
|
|
32
33
|
|
|
33
|
-
|
|
34
|
+
for var in config_overrides:
|
|
35
|
+
if var not in KNOWN_CONFIG_OVERRIDES:
|
|
36
|
+
raise excs.Error(f'Unrecognized configuration variable: {var}')
|
|
37
|
+
|
|
38
|
+
self.__config_overrides = config_overrides
|
|
39
|
+
|
|
40
|
+
self.__home = Path(self.lookup_env('pixeltable', 'home', str(Path.home() / '.pixeltable')))
|
|
34
41
|
if self.__home.exists() and not self.__home.is_dir():
|
|
35
|
-
raise
|
|
42
|
+
raise excs.Error(f'Not a directory: {self.__home}')
|
|
36
43
|
if not self.__home.exists():
|
|
37
44
|
print(f'Creating a Pixeltable instance at: {self.__home}')
|
|
38
45
|
self.__home.mkdir()
|
|
39
46
|
|
|
40
|
-
self.__config_file = Path(
|
|
47
|
+
self.__config_file = Path(self.lookup_env('pixeltable', 'config', str(self.__home / 'config.toml')))
|
|
41
48
|
|
|
42
49
|
self.__config_dict: dict[str, Any]
|
|
43
50
|
if os.path.isfile(self.__config_file):
|
|
@@ -46,6 +53,12 @@ class Config:
|
|
|
46
53
|
self.__config_dict = toml.load(stream)
|
|
47
54
|
except Exception as exc:
|
|
48
55
|
raise excs.Error(f'Could not read config file: {self.__config_file}') from exc
|
|
56
|
+
for section, section_dict in self.__config_dict.items():
|
|
57
|
+
if section not in KNOWN_CONFIG_OPTIONS:
|
|
58
|
+
raise excs.Error(f'Unrecognized section {section!r} in config file: {self.__config_file}')
|
|
59
|
+
for key in section_dict:
|
|
60
|
+
if key not in KNOWN_CONFIG_OPTIONS[section]:
|
|
61
|
+
raise excs.Error(f"Unrecognized option '{section}.{key}' in config file: {self.__config_file}")
|
|
49
62
|
else:
|
|
50
63
|
self.__config_dict = self.__create_default_config(self.__config_file)
|
|
51
64
|
with open(self.__config_file, 'w', encoding='utf-8') as stream:
|
|
@@ -65,10 +78,18 @@ class Config:
|
|
|
65
78
|
|
|
66
79
|
@classmethod
|
|
67
80
|
def get(cls) -> Config:
|
|
68
|
-
|
|
69
|
-
cls.__instance = cls()
|
|
81
|
+
cls.init({})
|
|
70
82
|
return cls.__instance
|
|
71
83
|
|
|
84
|
+
@classmethod
|
|
85
|
+
def init(cls, config_overrides: dict[str, Any]) -> None:
|
|
86
|
+
if cls.__instance is None:
|
|
87
|
+
cls.__instance = cls(config_overrides)
|
|
88
|
+
elif len(config_overrides) > 0:
|
|
89
|
+
raise excs.Error(
|
|
90
|
+
'Pixeltable has already been initialized; cannot specify new config values in the same session'
|
|
91
|
+
)
|
|
92
|
+
|
|
72
93
|
@classmethod
|
|
73
94
|
def __create_default_config(cls, config_path: Path) -> dict[str, Any]:
|
|
74
95
|
free_disk_space_bytes = shutil.disk_usage(config_path.parent).free
|
|
@@ -76,14 +97,23 @@ class Config:
|
|
|
76
97
|
file_cache_size_g = free_disk_space_bytes / 5 / (1 << 30)
|
|
77
98
|
return {'pixeltable': {'file_cache_size_g': round(file_cache_size_g, 1), 'hide_warnings': False}}
|
|
78
99
|
|
|
79
|
-
def
|
|
100
|
+
def lookup_env(self, section: str, key: str, default: Any = None) -> Any:
|
|
101
|
+
override_var = f'{section}.{key}'
|
|
80
102
|
env_var = f'{section.upper()}_{key.upper()}'
|
|
103
|
+
if override_var in self.__config_overrides:
|
|
104
|
+
return self.__config_overrides[override_var]
|
|
81
105
|
if env_var in os.environ:
|
|
82
|
-
|
|
83
|
-
|
|
106
|
+
return os.environ[env_var]
|
|
107
|
+
return default
|
|
108
|
+
|
|
109
|
+
def get_value(self, key: str, expected_type: type[T], section: str = 'pixeltable') -> Optional[T]:
|
|
110
|
+
value = self.lookup_env(section, key) # Try to get from environment first
|
|
111
|
+
# Next try the config file
|
|
112
|
+
if value is None and section in self.__config_dict and key in self.__config_dict[section]:
|
|
84
113
|
value = self.__config_dict[section][key]
|
|
85
|
-
|
|
86
|
-
|
|
114
|
+
|
|
115
|
+
if value is None:
|
|
116
|
+
return None # Not specified
|
|
87
117
|
|
|
88
118
|
try:
|
|
89
119
|
if expected_type is bool and isinstance(value, str):
|
|
@@ -91,7 +121,7 @@ class Config:
|
|
|
91
121
|
raise excs.Error(f'Invalid value for configuration parameter {section}.{key}: {value}')
|
|
92
122
|
return value.lower() == 'true' # type: ignore[return-value]
|
|
93
123
|
return expected_type(value) # type: ignore[call-arg]
|
|
94
|
-
except ValueError as exc:
|
|
124
|
+
except (ValueError, TypeError) as exc:
|
|
95
125
|
raise excs.Error(f'Invalid value for configuration parameter {section}.{key}: {value}') from exc
|
|
96
126
|
|
|
97
127
|
def get_string_value(self, key: str, section: str = 'pixeltable') -> Optional[str]:
|
|
@@ -105,3 +135,37 @@ class Config:
|
|
|
105
135
|
|
|
106
136
|
def get_bool_value(self, key: str, section: str = 'pixeltable') -> Optional[bool]:
|
|
107
137
|
return self.get_value(key, bool, section)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
KNOWN_CONFIG_OPTIONS = {
|
|
141
|
+
'pixeltable': {
|
|
142
|
+
'home': 'Path to the Pixeltable home directory',
|
|
143
|
+
'config': 'Path to the Pixeltable config file',
|
|
144
|
+
'pgdata': 'Path to the Pixeltable postgres data directory',
|
|
145
|
+
'db': 'Postgres database name',
|
|
146
|
+
'file_cache_size_g': 'Size of the file cache in GB',
|
|
147
|
+
'time_zone': 'Default time zone for timestamps',
|
|
148
|
+
'hide_warnings': 'Hide warnings from the console',
|
|
149
|
+
'verbosity': 'Verbosity level for console output',
|
|
150
|
+
'api_key': 'API key for Pixeltable cloud',
|
|
151
|
+
},
|
|
152
|
+
'anthropic': {'api_key': 'Anthropic API key'},
|
|
153
|
+
'bedrock': {'api_key': 'AWS Bedrock API key'},
|
|
154
|
+
'deepseek': {'api_key': 'Deepseek API key'},
|
|
155
|
+
'fireworks': {'api_key': 'Fireworks API key'},
|
|
156
|
+
'gemini': {'api_key': 'Gemini API key'},
|
|
157
|
+
'groq': {'api_key': 'Groq API key'},
|
|
158
|
+
'label_studio': {'api_key': 'Label Studio API key', 'url': 'Label Studio server URL'},
|
|
159
|
+
'mistral': {'api_key': 'Mistral API key'},
|
|
160
|
+
'openai': {'api_key': 'OpenAI API key'},
|
|
161
|
+
'replicate': {'api_token': 'Replicate API token'},
|
|
162
|
+
'together': {'api_key': 'Together API key'},
|
|
163
|
+
'pypi': {'api_key': 'PyPI API key (for internal use only)'},
|
|
164
|
+
}
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
KNOWN_CONFIG_OVERRIDES = {
|
|
168
|
+
f'{section}.{key}': info
|
|
169
|
+
for section, section_dict in KNOWN_CONFIG_OPTIONS.items()
|
|
170
|
+
for key, info in section_dict.items()
|
|
171
|
+
}
|
pixeltable/dataframe.py
CHANGED
|
@@ -15,7 +15,7 @@ import sqlalchemy as sql
|
|
|
15
15
|
|
|
16
16
|
from pixeltable import catalog, exceptions as excs, exec, exprs, plan, type_system as ts
|
|
17
17
|
from pixeltable.catalog import Catalog, is_valid_identifier
|
|
18
|
-
from pixeltable.catalog.
|
|
18
|
+
from pixeltable.catalog.update_status import UpdateStatus
|
|
19
19
|
from pixeltable.env import Env
|
|
20
20
|
from pixeltable.plan import Planner, SampleClause
|
|
21
21
|
from pixeltable.type_system import ColumnType
|
|
@@ -1185,7 +1185,7 @@ class DataFrame:
|
|
|
1185
1185
|
"""
|
|
1186
1186
|
self._validate_mutable('delete', False)
|
|
1187
1187
|
if not self._first_tbl.is_insertable():
|
|
1188
|
-
raise excs.Error('Cannot delete
|
|
1188
|
+
raise excs.Error('Cannot use `delete` on a view.')
|
|
1189
1189
|
with Catalog.get().begin_xact(tbl=self._first_tbl, for_write=True, lock_mutable_tree=True):
|
|
1190
1190
|
return self._first_tbl.tbl_version.get().delete(where=self.where_clause)
|
|
1191
1191
|
|
|
@@ -1196,14 +1196,27 @@ class DataFrame:
|
|
|
1196
1196
|
op_name: The name of the operation for which the test is being performed.
|
|
1197
1197
|
allow_select: If True, allow a select() specification in the Dataframe.
|
|
1198
1198
|
"""
|
|
1199
|
+
self._validate_mutable_op_sequence(op_name, allow_select)
|
|
1200
|
+
|
|
1201
|
+
# TODO: Reconcile these with Table.__check_mutable()
|
|
1202
|
+
assert len(self._from_clause.tbls) == 1
|
|
1203
|
+
if self._first_tbl.is_snapshot():
|
|
1204
|
+
raise excs.Error(f'Cannot use `{op_name}` on a snapshot.')
|
|
1205
|
+
if self._first_tbl.is_replica():
|
|
1206
|
+
raise excs.Error(f'Cannot use `{op_name}` on a replica.')
|
|
1207
|
+
|
|
1208
|
+
def _validate_mutable_op_sequence(self, op_name: str, allow_select: bool) -> None:
|
|
1209
|
+
"""Tests whether the sequence of operations on this DataFrame is valid for a mutation operation."""
|
|
1199
1210
|
if self.group_by_clause is not None or self.grouping_tbl is not None:
|
|
1200
|
-
raise excs.Error(f'Cannot use `{op_name}` after `group_by
|
|
1211
|
+
raise excs.Error(f'Cannot use `{op_name}` after `group_by`.')
|
|
1201
1212
|
if self.order_by_clause is not None:
|
|
1202
|
-
raise excs.Error(f'Cannot use `{op_name}` after `order_by
|
|
1213
|
+
raise excs.Error(f'Cannot use `{op_name}` after `order_by`.')
|
|
1203
1214
|
if self.select_list is not None and not allow_select:
|
|
1204
|
-
raise excs.Error(f'Cannot use `{op_name}` after `select
|
|
1215
|
+
raise excs.Error(f'Cannot use `{op_name}` after `select`.')
|
|
1205
1216
|
if self.limit_val is not None:
|
|
1206
|
-
raise excs.Error(f'Cannot use `{op_name}` after `limit
|
|
1217
|
+
raise excs.Error(f'Cannot use `{op_name}` after `limit`.')
|
|
1218
|
+
if self._has_joins():
|
|
1219
|
+
raise excs.Error(f'Cannot use `{op_name}` after `join`.')
|
|
1207
1220
|
|
|
1208
1221
|
def as_dict(self) -> dict[str, Any]:
|
|
1209
1222
|
"""
|