pixeltable 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic. Click here for more details.

Files changed (52)
  1. pixeltable/__version__.py +2 -2
  2. pixeltable/catalog/__init__.py +1 -1
  3. pixeltable/catalog/catalog.py +619 -255
  4. pixeltable/catalog/dir.py +1 -2
  5. pixeltable/catalog/insertable_table.py +9 -9
  6. pixeltable/catalog/path.py +59 -20
  7. pixeltable/catalog/schema_object.py +10 -4
  8. pixeltable/catalog/table.py +51 -53
  9. pixeltable/catalog/table_version.py +216 -156
  10. pixeltable/catalog/table_version_path.py +1 -1
  11. pixeltable/catalog/tbl_ops.py +44 -0
  12. pixeltable/catalog/view.py +63 -65
  13. pixeltable/config.py +12 -4
  14. pixeltable/dataframe.py +75 -6
  15. pixeltable/env.py +46 -17
  16. pixeltable/exec/aggregation_node.py +1 -1
  17. pixeltable/exec/cache_prefetch_node.py +2 -6
  18. pixeltable/exec/component_iteration_node.py +4 -3
  19. pixeltable/exec/data_row_batch.py +10 -51
  20. pixeltable/exec/expr_eval/expr_eval_node.py +2 -2
  21. pixeltable/exec/in_memory_data_node.py +17 -16
  22. pixeltable/exec/sql_node.py +6 -7
  23. pixeltable/exprs/column_ref.py +2 -1
  24. pixeltable/exprs/data_row.py +13 -13
  25. pixeltable/exprs/row_builder.py +16 -4
  26. pixeltable/exprs/string_op.py +1 -1
  27. pixeltable/func/expr_template_function.py +1 -4
  28. pixeltable/functions/date.py +1 -1
  29. pixeltable/functions/gemini.py +4 -4
  30. pixeltable/functions/math.py +1 -1
  31. pixeltable/functions/openai.py +9 -6
  32. pixeltable/functions/timestamp.py +6 -6
  33. pixeltable/functions/video.py +2 -6
  34. pixeltable/globals.py +62 -33
  35. pixeltable/io/datarows.py +2 -1
  36. pixeltable/io/pandas.py +1 -0
  37. pixeltable/io/table_data_conduit.py +12 -13
  38. pixeltable/iterators/audio.py +17 -8
  39. pixeltable/iterators/image.py +5 -2
  40. pixeltable/metadata/schema.py +39 -2
  41. pixeltable/plan.py +5 -14
  42. pixeltable/share/packager.py +13 -13
  43. pixeltable/store.py +31 -7
  44. pixeltable/type_system.py +2 -1
  45. pixeltable/utils/filecache.py +1 -1
  46. pixeltable/utils/http_server.py +2 -3
  47. pixeltable/utils/media_store.py +90 -34
  48. {pixeltable-0.4.3.dist-info → pixeltable-0.4.5.dist-info}/METADATA +1 -1
  49. {pixeltable-0.4.3.dist-info → pixeltable-0.4.5.dist-info}/RECORD +52 -51
  50. {pixeltable-0.4.3.dist-info → pixeltable-0.4.5.dist-info}/LICENSE +0 -0
  51. {pixeltable-0.4.3.dist-info → pixeltable-0.4.5.dist-info}/WHEEL +0 -0
  52. {pixeltable-0.4.3.dist-info → pixeltable-0.4.5.dist-info}/entry_points.txt +0 -0
@@ -76,7 +76,7 @@ class TableVersionPath:
76
76
  elif self._cached_tbl_version is not None:
77
77
  return
78
78
 
79
- with Catalog.get().begin_xact(for_write=False):
79
+ with Catalog.get().begin_xact(tbl_id=self.tbl_version.id, for_write=False):
80
80
  self._cached_tbl_version = self.tbl_version.get()
81
81
 
82
82
  def clear_cached_md(self) -> None:
@@ -0,0 +1,44 @@
1
+ # This file contains all dataclasses related to schema.PendingTableOp:
2
+ # - TableOp: the container for each log entry
3
+ # - <>Op: the actual operation, which is performed by TableVersion.exec_op(); each <>Op class contains
4
+ # enough information for exec_op() to perform the operation without having to reference data outside of
5
+ # TableVersion
6
+
7
+ import dataclasses
8
+ from typing import Any, Optional
9
+
10
+
11
+ @dataclasses.dataclass
12
+ class CreateStoreTableOp:
13
+ pass
14
+
15
+
16
+ @dataclasses.dataclass
17
+ class LoadViewOp:
18
+ view_path: dict[str, Any] # needed to create the view load plan
19
+
20
+
21
+ @dataclasses.dataclass
22
+ class DeleteTableMdOp:
23
+ pass
24
+
25
+
26
+ @dataclasses.dataclass
27
+ class DeleteTableMediaFilesOp:
28
+ pass
29
+
30
+
31
+ @dataclasses.dataclass
32
+ class DropStoreTableOp:
33
+ pass
34
+
35
+
36
+ @dataclasses.dataclass
37
+ class TableOp:
38
+ tbl_id: str # uuid.UUID
39
+ op_sn: int # sequence number within the update operation; [0, num_ops)
40
+ num_ops: int # total number of ops forming the update operation
41
+ needs_xact: bool # if True, op must be run as part of a transaction
42
+
43
+ create_store_table_op: Optional[CreateStoreTableOp] = None
44
+ load_view_op: Optional[LoadViewOp] = None
@@ -9,7 +9,6 @@ import pixeltable.exceptions as excs
9
9
  import pixeltable.metadata.schema as md_schema
10
10
  import pixeltable.type_system as ts
11
11
  from pixeltable import catalog, exprs, func
12
- from pixeltable.env import Env
13
12
  from pixeltable.iterators import ComponentIterator
14
13
 
15
14
  if TYPE_CHECKING:
@@ -19,9 +18,10 @@ if TYPE_CHECKING:
19
18
  from .column import Column
20
19
  from .globals import _POS_COLUMN_NAME, MediaValidation
21
20
  from .table import Table
22
- from .table_version import TableVersion
21
+ from .table_version import TableVersion, TableVersionMd
23
22
  from .table_version_handle import TableVersionHandle
24
23
  from .table_version_path import TableVersionPath
24
+ from .tbl_ops import CreateStoreTableOp, LoadViewOp, TableOp
25
25
  from .update_status import UpdateStatus
26
26
 
27
27
  if TYPE_CHECKING:
@@ -45,9 +45,18 @@ class View(Table):
45
45
  if not snapshot_only:
46
46
  self._tbl_version = tbl_version_path.tbl_version
47
47
 
48
- @classmethod
49
- def _display_name(cls) -> str:
50
- return 'view'
48
+ def _display_name(self) -> str:
49
+ name: str
50
+ if self._tbl_version_path.is_snapshot():
51
+ name = 'snapshot'
52
+ elif self._tbl_version_path.is_view():
53
+ name = 'view'
54
+ else:
55
+ assert self._tbl_version_path.is_replica()
56
+ name = 'table'
57
+ if self._tbl_version_path.is_replica():
58
+ name = f'{name}-replica'
59
+ return name
51
60
 
52
61
  @classmethod
53
62
  def select_list_to_additional_columns(cls, select_list: list[tuple[exprs.Expr, Optional[str]]]) -> dict[str, dict]:
@@ -80,7 +89,7 @@ class View(Table):
80
89
  media_validation: MediaValidation,
81
90
  iterator_cls: Optional[type[ComponentIterator]],
82
91
  iterator_args: Optional[dict],
83
- ) -> View:
92
+ ) -> tuple[TableVersionMd, Optional[list[TableOp]]]:
84
93
  from pixeltable.plan import SampleClause
85
94
 
86
95
  # Convert select_list to more additional_columns if present
@@ -167,11 +176,10 @@ class View(Table):
167
176
  for col in columns:
168
177
  if col.name in iterator_col_names:
169
178
  raise excs.Error(
170
- f'Duplicate name: column {col.name} is already present in the iterator output schema'
179
+ f'Duplicate name: column {col.name!r} is already present in the iterator output schema'
171
180
  )
172
181
  columns = iterator_cols + columns
173
182
 
174
- session = Env.get().session
175
183
  from pixeltable.exprs import InlineDict
176
184
 
177
185
  iterator_args_expr: exprs.Expr = InlineDict(iterator_args) if iterator_args is not None else None
@@ -200,54 +208,26 @@ class View(Table):
200
208
  iterator_args=iterator_args_expr.as_dict() if iterator_args_expr is not None else None,
201
209
  )
202
210
 
203
- id, tbl_version = TableVersion.create(
204
- dir_id,
205
- name,
206
- columns,
207
- num_retained_versions,
208
- comment,
209
- media_validation=media_validation,
210
- # base_path=base_version_path,
211
- view_md=view_md,
211
+ md = TableVersion.create_initial_md(
212
+ name, columns, num_retained_versions, comment, media_validation=media_validation, view_md=view_md
212
213
  )
213
- if tbl_version is None:
214
- # this is purely a snapshot: we use the base's tbl version path
215
- view = cls(id, dir_id, name, base_version_path, snapshot_only=True)
216
- _logger.info(f'created snapshot {name}')
214
+ if md.tbl_md.is_pure_snapshot:
215
+ # this is purely a snapshot: no store table to create or load
216
+ return md, None
217
217
  else:
218
- view = cls(
219
- id,
220
- dir_id,
221
- name,
222
- TableVersionPath(
223
- TableVersionHandle(tbl_version.id, tbl_version.effective_version), base=base_version_path
224
- ),
225
- snapshot_only=False,
226
- )
227
- _logger.info(f'Created view `{name}`, id={tbl_version.id}')
228
-
229
- from pixeltable.plan import Planner
230
-
231
- try:
232
- plan, _ = Planner.create_view_load_plan(view._tbl_version_path)
233
- _, row_counts = tbl_version.store_tbl.insert_rows(plan, v_min=tbl_version.version)
234
- status = UpdateStatus(row_count_stats=row_counts)
235
- tbl_version._write_md_update_status(0, update_status=status)
236
-
237
- except:
238
- # we need to remove the orphaned TableVersion instance
239
- del catalog.Catalog.get()._tbl_versions[tbl_version.id, tbl_version.effective_version]
240
- base_tbl_version = base.tbl_version.get()
241
- if tbl_version.effective_version is None and not base_tbl_version.is_snapshot:
242
- # also remove tbl_version from the base
243
- base_tbl_version.mutable_views.remove(TableVersionHandle.create(tbl_version))
244
- raise
245
- Env.get().console_logger.info(
246
- f'Created view `{name}` with {status.num_rows} rows, {status.num_excs} exceptions.'
218
+ tbl_id = md.tbl_md.tbl_id
219
+ view_path = TableVersionPath(
220
+ TableVersionHandle(UUID(tbl_id), effective_version=0 if is_snapshot else None), base=base_version_path
247
221
  )
248
-
249
- session.commit()
250
- return view
222
+ ops = [
223
+ TableOp(
224
+ tbl_id=tbl_id, op_sn=0, num_ops=2, needs_xact=False, create_store_table_op=CreateStoreTableOp()
225
+ ),
226
+ TableOp(
227
+ tbl_id=tbl_id, op_sn=1, num_ops=2, needs_xact=True, load_view_op=LoadViewOp(view_path.as_dict())
228
+ ),
229
+ ]
230
+ return md, ops
251
231
 
252
232
  @classmethod
253
233
  def _verify_column(cls, col: Column) -> None:
@@ -275,13 +255,26 @@ class View(Table):
275
255
  base=cls._get_snapshot_path(tbl_version_path.base) if tbl_version_path.base is not None else None,
276
256
  )
277
257
 
258
+ def _is_anonymous_snapshot(self) -> bool:
259
+ """
260
+ Returns True if this is an unnamed snapshot (i.e., a snapshot that is not a separate schema object).
261
+ """
262
+ return self._snapshot_only and self._id == self._tbl_version_path.tbl_id
263
+
278
264
  def _get_metadata(self) -> dict[str, Any]:
279
265
  md = super()._get_metadata()
280
266
  md['is_view'] = True
281
267
  md['is_snapshot'] = self._tbl_version_path.is_snapshot()
268
+ if self._is_anonymous_snapshot():
269
+ # Update name and path with version qualifiers.
270
+ md['name'] = f'{self._name}:{self._tbl_version_path.version()}'
271
+ md['path'] = f'{self._path()}:{self._tbl_version_path.version()}'
282
272
  base_tbl = self._get_base_table()
283
- base_version = self._effective_base_versions[0]
284
- md['base'] = base_tbl._path() if base_version is None else f'{base_tbl._path()}:{base_version}'
273
+ if base_tbl is None:
274
+ md['base'] = None
275
+ else:
276
+ base_version = self._effective_base_versions[0]
277
+ md['base'] = base_tbl._path() if base_version is None else f'{base_tbl._path()}:{base_version}'
285
278
  return md
286
279
 
287
280
  def insert(
@@ -295,28 +288,33 @@ class View(Table):
295
288
  print_stats: bool = False,
296
289
  **kwargs: Any,
297
290
  ) -> UpdateStatus:
298
- raise excs.Error(f'{self._display_name()} {self._name!r}: cannot insert into view')
291
+ raise excs.Error(f'{self._display_str()}: Cannot insert into a {self._display_name()}.')
299
292
 
300
293
  def delete(self, where: Optional[exprs.Expr] = None) -> UpdateStatus:
301
- raise excs.Error(f'{self._display_name()} {self._name!r}: cannot delete from view')
294
+ raise excs.Error(f'{self._display_str()}: Cannot delete from a {self._display_name()}.')
302
295
 
303
296
  def _get_base_table(self) -> Optional['Table']:
304
- # if this is a pure snapshot, our tbl_version_path only reflects the base (there is no TableVersion instance
305
- # for the snapshot itself)
306
- base_id = self._tbl_version_path.tbl_id if self._snapshot_only else self._tbl_version_path.base.tbl_id
307
- return catalog.Catalog.get().get_table_by_id(base_id)
297
+ if self._tbl_version_path.tbl_id != self._id:
298
+ # _tbl_version_path represents a different schema object from this one. This can only happen if this is a
299
+ # named pure snapshot.
300
+ base_id = self._tbl_version_path.tbl_id
301
+ elif self._tbl_version_path.base is None:
302
+ return None
303
+ else:
304
+ base_id = self._tbl_version_path.base.tbl_id
305
+ with catalog.Catalog.get().begin_xact(tbl_id=base_id, for_write=False):
306
+ return catalog.Catalog.get().get_table_by_id(base_id)
308
307
 
309
308
  @property
310
309
  def _effective_base_versions(self) -> list[Optional[int]]:
311
310
  effective_versions = [tv.effective_version for tv in self._tbl_version_path.get_tbl_versions()]
312
- if self._snapshot_only:
313
- return effective_versions
311
+ if self._snapshot_only and not self._is_anonymous_snapshot():
312
+ return effective_versions # Named pure snapshot
314
313
  else:
315
314
  return effective_versions[1:]
316
315
 
317
316
  def _table_descriptor(self) -> str:
318
- display_name = 'Snapshot' if self._snapshot_only else 'View'
319
- result = [f'{display_name} {self._path()!r}']
317
+ result = [self._display_str()]
320
318
  bases_descrs: list[str] = []
321
319
  for base, effective_version in zip(self._get_base_tables(), self._effective_base_versions):
322
320
  if effective_version is None:
pixeltable/config.py CHANGED
@@ -8,7 +8,7 @@ from typing import Any, ClassVar, Optional, TypeVar
8
8
 
9
9
  import toml
10
10
 
11
- from pixeltable import exceptions as excs
11
+ from pixeltable import env, exceptions as excs
12
12
 
13
13
  _logger = logging.getLogger('pixeltable')
14
14
 
@@ -82,7 +82,11 @@ class Config:
82
82
  return cls.__instance
83
83
 
84
84
  @classmethod
85
- def init(cls, config_overrides: dict[str, Any]) -> None:
85
+ def init(cls, config_overrides: dict[str, Any], reinit: bool = False) -> None:
86
+ if reinit:
87
+ cls.__instance = None
88
+ for cl in env._registered_clients.values():
89
+ cl.client_obj = None
86
90
  if cls.__instance is None:
87
91
  cls.__instance = cls(config_overrides)
88
92
  elif len(config_overrides) > 0:
@@ -102,7 +106,7 @@ class Config:
102
106
  env_var = f'{section.upper()}_{key.upper()}'
103
107
  if override_var in self.__config_overrides:
104
108
  return self.__config_overrides[override_var]
105
- if env_var in os.environ:
109
+ if env_var in os.environ and len(os.environ[env_var]) > 0:
106
110
  return os.environ[env_var]
107
111
  return default
108
112
 
@@ -157,7 +161,11 @@ KNOWN_CONFIG_OPTIONS = {
157
161
  'groq': {'api_key': 'Groq API key'},
158
162
  'label_studio': {'api_key': 'Label Studio API key', 'url': 'Label Studio server URL'},
159
163
  'mistral': {'api_key': 'Mistral API key'},
160
- 'openai': {'api_key': 'OpenAI API key'},
164
+ 'openai': {
165
+ 'api_key': 'OpenAI API key',
166
+ 'base_url': 'OpenAI API base URL',
167
+ 'api_version': 'API version if using Azure OpenAI',
168
+ },
161
169
  'replicate': {'api_token': 'Replicate API token'},
162
170
  'together': {'api_key': 'Together API key'},
163
171
  'pypi': {'api_key': 'PyPI API key (for internal use only)'},
pixeltable/dataframe.py CHANGED
@@ -8,9 +8,22 @@ import json
8
8
  import logging
9
9
  import traceback
10
10
  from pathlib import Path
11
- from typing import TYPE_CHECKING, Any, AsyncIterator, Callable, Hashable, Iterator, NoReturn, Optional, Sequence, Union
11
+ from typing import (
12
+ TYPE_CHECKING,
13
+ Any,
14
+ AsyncIterator,
15
+ Callable,
16
+ Hashable,
17
+ Iterator,
18
+ NoReturn,
19
+ Optional,
20
+ Sequence,
21
+ TypeVar,
22
+ Union,
23
+ )
12
24
 
13
25
  import pandas as pd
26
+ import pydantic
14
27
  import sqlalchemy as sql
15
28
 
16
29
  from pixeltable import catalog, exceptions as excs, exec, exprs, plan, type_system as ts
@@ -32,6 +45,11 @@ _logger = logging.getLogger('pixeltable')
32
45
 
33
46
 
34
47
  class DataFrameResultSet:
48
+ _rows: list[list[Any]]
49
+ _col_names: list[str]
50
+ __schema: dict[str, ColumnType]
51
+ __formatter: Formatter
52
+
35
53
  def __init__(self, rows: list[list[Any]], schema: dict[str, ColumnType]):
36
54
  self._rows = rows
37
55
  self._col_names = list(schema.keys())
@@ -66,6 +84,44 @@ class DataFrameResultSet:
66
84
  def to_pandas(self) -> pd.DataFrame:
67
85
  return pd.DataFrame.from_records(self._rows, columns=self._col_names)
68
86
 
87
+ BaseModelT = TypeVar('BaseModelT', bound=pydantic.BaseModel)
88
+
89
+ def to_pydantic(self, model: type[BaseModelT]) -> Iterator[BaseModelT]:
90
+ """
91
+ Convert the DataFrameResultSet to an iterator over Pydantic model instances.
92
+
93
+ Args:
94
+ model: A Pydantic model class.
95
+
96
+ Returns:
97
+ An iterator over Pydantic model instances, one for each row in the result set.
98
+
99
+ Raises:
100
+ Error: If the row data doesn't match the model schema.
101
+ """
102
+ model_fields = model.model_fields
103
+ model_config = getattr(model, 'model_config', {})
104
+ forbid_extra_fields = model_config.get('extra') == 'forbid'
105
+
106
+ # schema validation
107
+ required_fields = {name for name, field in model_fields.items() if field.is_required()}
108
+ col_names = set(self._col_names)
109
+ missing_fields = required_fields - col_names
110
+ if len(missing_fields) > 0:
111
+ raise excs.Error(
112
+ f'Required model fields {missing_fields} are missing from result set columns {self._col_names}'
113
+ )
114
+ if forbid_extra_fields:
115
+ extra_fields = col_names - set(model_fields.keys())
116
+ if len(extra_fields) > 0:
117
+ raise excs.Error(f"Extra fields {extra_fields} are not allowed in model with extra='forbid'")
118
+
119
+ for row in self:
120
+ try:
121
+ yield model(**row)
122
+ except pydantic.ValidationError as e:
123
+ raise excs.Error(str(e)) from e
124
+
69
125
  def _row_to_dict(self, row_idx: int) -> dict[str, Any]:
70
126
  return {self._col_names[i]: self._rows[row_idx][i] for i in range(len(self._col_names))}
71
127
 
@@ -1185,7 +1241,7 @@ class DataFrame:
1185
1241
  """
1186
1242
  self._validate_mutable('delete', False)
1187
1243
  if not self._first_tbl.is_insertable():
1188
- raise excs.Error('Cannot delete from view')
1244
+ raise excs.Error('Cannot use `delete` on a view.')
1189
1245
  with Catalog.get().begin_xact(tbl=self._first_tbl, for_write=True, lock_mutable_tree=True):
1190
1246
  return self._first_tbl.tbl_version.get().delete(where=self.where_clause)
1191
1247
 
@@ -1196,14 +1252,27 @@ class DataFrame:
1196
1252
  op_name: The name of the operation for which the test is being performed.
1197
1253
  allow_select: If True, allow a select() specification in the Dataframe.
1198
1254
  """
1255
+ self._validate_mutable_op_sequence(op_name, allow_select)
1256
+
1257
+ # TODO: Reconcile these with Table.__check_mutable()
1258
+ assert len(self._from_clause.tbls) == 1
1259
+ if self._first_tbl.is_snapshot():
1260
+ raise excs.Error(f'Cannot use `{op_name}` on a snapshot.')
1261
+ if self._first_tbl.is_replica():
1262
+ raise excs.Error(f'Cannot use `{op_name}` on a replica.')
1263
+
1264
+ def _validate_mutable_op_sequence(self, op_name: str, allow_select: bool) -> None:
1265
+ """Tests whether the sequence of operations on this DataFrame is valid for a mutation operation."""
1199
1266
  if self.group_by_clause is not None or self.grouping_tbl is not None:
1200
- raise excs.Error(f'Cannot use `{op_name}` after `group_by`')
1267
+ raise excs.Error(f'Cannot use `{op_name}` after `group_by`.')
1201
1268
  if self.order_by_clause is not None:
1202
- raise excs.Error(f'Cannot use `{op_name}` after `order_by`')
1269
+ raise excs.Error(f'Cannot use `{op_name}` after `order_by`.')
1203
1270
  if self.select_list is not None and not allow_select:
1204
- raise excs.Error(f'Cannot use `{op_name}` after `select`')
1271
+ raise excs.Error(f'Cannot use `{op_name}` after `select`.')
1205
1272
  if self.limit_val is not None:
1206
- raise excs.Error(f'Cannot use `{op_name}` after `limit`')
1273
+ raise excs.Error(f'Cannot use `{op_name}` after `limit`.')
1274
+ if self._has_joins():
1275
+ raise excs.Error(f'Cannot use `{op_name}` after `join`.')
1207
1276
 
1208
1277
  def as_dict(self) -> dict[str, Any]:
1209
1278
  """
pixeltable/env.py CHANGED
@@ -13,6 +13,8 @@ import platform
13
13
  import shutil
14
14
  import sys
15
15
  import threading
16
+ import types
17
+ import typing
16
18
  import uuid
17
19
  import warnings
18
20
  from abc import abstractmethod
@@ -20,7 +22,7 @@ from contextlib import contextmanager
20
22
  from dataclasses import dataclass, field
21
23
  from pathlib import Path
22
24
  from sys import stdout
23
- from typing import TYPE_CHECKING, Any, Callable, Iterator, Optional, TypeVar
25
+ from typing import TYPE_CHECKING, Any, Callable, Iterator, Literal, Optional, TypeVar
24
26
  from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
25
27
 
26
28
  import nest_asyncio # type: ignore[import-untyped]
@@ -86,6 +88,7 @@ class Env:
86
88
  _resource_pool_info: dict[str, Any]
87
89
  _current_conn: Optional[sql.Connection]
88
90
  _current_session: Optional[sql.orm.Session]
91
+ _current_isolation_level: Optional[Literal['REPEATABLE_READ', 'SERIALIZABLE']]
89
92
  _dbms: Optional[Dbms]
90
93
  _event_loop: Optional[asyncio.AbstractEventLoop] # event loop for ExecNode
91
94
 
@@ -99,6 +102,7 @@ class Env:
99
102
  def _init_env(cls, reinit_db: bool = False) -> None:
100
103
  assert not cls.__initializing, 'Circular env initialization detected.'
101
104
  cls.__initializing = True
105
+ cls._instance = None
102
106
  env = Env()
103
107
  env._set_up(reinit_db=reinit_db)
104
108
  env._upgrade_metadata()
@@ -142,6 +146,7 @@ class Env:
142
146
  self._resource_pool_info = {}
143
147
  self._current_conn = None
144
148
  self._current_session = None
149
+ self._current_isolation_level = None
145
150
  self._dbms = None
146
151
  self._event_loop = None
147
152
 
@@ -230,20 +235,34 @@ class Env:
230
235
  return self._db_server is not None
231
236
 
232
237
  @contextmanager
233
- def begin_xact(self) -> Iterator[sql.Connection]:
234
- """Call Catalog.begin_xact() instead, unless there is a specific reason to call this directly."""
238
+ def begin_xact(self, for_write: bool = False) -> Iterator[sql.Connection]:
239
+ """
240
+ Call Catalog.begin_xact() instead, unless there is a specific reason to call this directly.
241
+
242
+ for_write: if True, uses serializable isolation; if False, uses repeatable_read
243
+
244
+ TODO: repeatable read is not available in CockroachDB; instead, run queries against a snapshot TVP
245
+ that avoids tripping over any pending ops
246
+ """
235
247
  if self._current_conn is None:
236
248
  assert self._current_session is None
237
249
  try:
238
- with self.engine.begin() as conn, sql.orm.Session(conn) as session:
250
+ self._current_isolation_level = 'SERIALIZABLE' if for_write else 'REPEATABLE_READ'
251
+ with (
252
+ self.engine.connect().execution_options(isolation_level=self._current_isolation_level) as conn,
253
+ sql.orm.Session(conn) as session,
254
+ conn.begin(),
255
+ ):
239
256
  self._current_conn = conn
240
257
  self._current_session = session
241
258
  yield conn
242
259
  finally:
243
260
  self._current_session = None
244
261
  self._current_conn = None
262
+ self._current_isolation_level = None
245
263
  else:
246
264
  assert self._current_session is not None
265
+ assert for_write == (self._current_isolation_level == 'serializable')
247
266
  yield self._current_conn
248
267
 
249
268
  def configure_logging(
@@ -587,16 +606,26 @@ class Env:
587
606
 
588
607
  # Construct a client, retrieving each parameter from config.
589
608
 
590
- init_kwargs: dict[str, str] = {}
591
- for param in cl.param_names:
592
- arg = Config.get().get_string_value(param, section=name)
593
- if arg is not None and len(arg) > 0:
594
- init_kwargs[param] = arg
595
- else:
609
+ init_kwargs: dict[str, Any] = {}
610
+ for param in cl.params.values():
611
+ # Determine the type of the parameter for proper config parsing.
612
+ t = param.annotation
613
+ # Dereference Optional[T]
614
+ if typing.get_origin(t) in (typing.Union, types.UnionType):
615
+ args = typing.get_args(t)
616
+ if args[0] is type(None):
617
+ t = args[1]
618
+ elif args[1] is type(None):
619
+ t = args[0]
620
+ assert isinstance(t, type), t
621
+ arg: Any = Config.get().get_value(param.name, t, section=name)
622
+ if arg is not None:
623
+ init_kwargs[param.name] = arg
624
+ elif param.default is inspect.Parameter.empty:
596
625
  raise excs.Error(
597
- f'`{name}` client not initialized: parameter `{param}` is not configured.\n'
598
- f'To fix this, specify the `{name.upper()}_{param.upper()}` environment variable, '
599
- f'or put `{param.lower()}` in the `{name.lower()}` section of $PIXELTABLE_HOME/config.toml.'
626
+ f'`{name}` client not initialized: parameter `{param.name}` is not configured.\n'
627
+ f'To fix this, specify the `{name.upper()}_{param.name.upper()}` environment variable, '
628
+ f'or put `{param.name.lower()}` in the `{name.lower()}` section of $PIXELTABLE_HOME/config.toml.'
600
629
  )
601
630
 
602
631
  cl.client_obj = cl.init_fn(**init_kwargs)
@@ -607,7 +636,7 @@ class Env:
607
636
  """
608
637
  The http server root is the file system root.
609
638
  eg: /home/media/foo.mp4 is located at http://127.0.0.1:{port}/home/media/foo.mp4
610
- in windows, the server will translate paths like http://127.0.0.1:{port}/c:/media/foo.mp4
639
+ On Windows, the server will translate paths like http://127.0.0.1:{port}/c:/media/foo.mp4
611
640
  This arrangement enables serving media hosted within _home,
612
641
  as well as external media inserted into pixeltable or produced by pixeltable.
613
642
  The port is chosen dynamically to prevent conflicts.
@@ -815,8 +844,8 @@ def register_client(name: str) -> Callable:
815
844
 
816
845
  def decorator(fn: Callable) -> None:
817
846
  sig = inspect.signature(fn)
818
- param_names = list(sig.parameters.keys())
819
- _registered_clients[name] = ApiClient(init_fn=fn, param_names=param_names)
847
+ params = dict(sig.parameters)
848
+ _registered_clients[name] = ApiClient(init_fn=fn, params=params)
820
849
 
821
850
  return decorator
822
851
 
@@ -827,7 +856,7 @@ _registered_clients: dict[str, ApiClient] = {}
827
856
  @dataclass
828
857
  class ApiClient:
829
858
  init_fn: Callable
830
- param_names: list[str]
859
+ params: dict[str, inspect.Parameter]
831
860
  client_obj: Optional[Any] = None
832
861
 
833
862
 
@@ -45,7 +45,7 @@ class AggregationNode(ExecNode):
45
45
  # we need to make sure to refer to the same exprs that RowBuilder.eval() will use
46
46
  self.agg_fn_calls = [cast(exprs.FunctionCall, e) for e in self.agg_fn_eval_ctx.target_exprs]
47
47
  # create output_batch here, rather than in __iter__(), so we don't need to remember tbl and row_builder
48
- self.output_batch = DataRowBatch(tbl, row_builder, 0)
48
+ self.output_batch = DataRowBatch(row_builder)
49
49
  self.limit = None
50
50
 
51
51
  def set_limit(self, limit: int) -> None:
@@ -12,7 +12,7 @@ from pathlib import Path
12
12
  from typing import Any, AsyncIterator, Iterator, Optional
13
13
  from uuid import UUID
14
14
 
15
- from pixeltable import catalog, env, exceptions as excs, exprs
15
+ from pixeltable import env, exceptions as excs, exprs
16
16
  from pixeltable.utils.filecache import FileCache
17
17
 
18
18
  from .data_row_batch import DataRowBatch
@@ -37,7 +37,6 @@ class CachePrefetchNode(ExecNode):
37
37
  boto_client_lock: threading.Lock
38
38
 
39
39
  # execution state
40
- batch_tbl_version: Optional[catalog.TableVersionHandle] # needed to construct output batches
41
40
  num_returned_rows: int
42
41
 
43
42
  # ready_rows: rows that are ready to be returned, ordered by row idx;
@@ -68,7 +67,6 @@ class CachePrefetchNode(ExecNode):
68
67
  self.boto_client = None
69
68
  self.boto_client_lock = threading.Lock()
70
69
 
71
- self.batch_tbl_version = None
72
70
  self.num_returned_rows = 0
73
71
  self.ready_rows = deque()
74
72
  self.in_flight_rows = {}
@@ -95,7 +93,7 @@ class CachePrefetchNode(ExecNode):
95
93
 
96
94
  if len(self.ready_rows) > 0:
97
95
  # create DataRowBatch from the first BATCH_SIZE ready rows
98
- batch = DataRowBatch(self.batch_tbl_version, self.row_builder)
96
+ batch = DataRowBatch(self.row_builder)
99
97
  rows = [self.ready_rows.popleft() for _ in range(min(self.BATCH_SIZE, len(self.ready_rows)))]
100
98
  for row in rows:
101
99
  assert row is not None
@@ -173,8 +171,6 @@ class CachePrefetchNode(ExecNode):
173
171
  if input_batch is None:
174
172
  self.input_finished = True
175
173
  return
176
- if self.batch_tbl_version is None:
177
- self.batch_tbl_version = input_batch.tbl
178
174
 
179
175
  file_cache = FileCache.get()
180
176
 
@@ -40,7 +40,7 @@ class ComponentIterationNode(ExecNode):
40
40
  }
41
41
 
42
42
  async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
43
- output_batch = DataRowBatch(self.view, self.row_builder)
43
+ output_batch = DataRowBatch(self.row_builder)
44
44
  async for input_batch in self.input:
45
45
  for input_row in input_batch:
46
46
  self.row_builder.eval(input_row, self.iterator_args_ctx)
@@ -52,13 +52,14 @@ class ComponentIterationNode(ExecNode):
52
52
  if self.__non_nullable_args_specified(iterator_args):
53
53
  iterator = self.view.get().iterator_cls(**iterator_args)
54
54
  for pos, component_dict in enumerate(iterator):
55
- output_row = output_batch.add_row()
55
+ output_row = self.row_builder.make_row()
56
56
  input_row.copy(output_row)
57
57
  # we're expanding the input and need to add the iterator position to the pk
58
58
  self.__populate_output_row(output_row, pos, component_dict)
59
+ output_batch.add_row(output_row)
59
60
  if len(output_batch) == self.__OUTPUT_BATCH_SIZE:
60
61
  yield output_batch
61
- output_batch = DataRowBatch(self.view, self.row_builder)
62
+ output_batch = DataRowBatch(self.row_builder)
62
63
 
63
64
  if len(output_batch) > 0:
64
65
  yield output_batch