pixeltable 0.4.3__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of pixeltable has been flagged as potentially problematic.

@@ -76,7 +76,7 @@ class TableVersionPath:
         elif self._cached_tbl_version is not None:
             return
 
-        with Catalog.get().begin_xact(for_write=False):
+        with Catalog.get().begin_xact(tbl_id=self.tbl_version.id, for_write=False):
            self._cached_tbl_version = self.tbl_version.get()
 
     def clear_cached_md(self) -> None:
@@ -0,0 +1,44 @@
+# This file contains all dataclasses related to schema.PendingTableOp:
+# - TableOp: the container for each log entry
+# - <>Op: the actual operation, which is performed by TableVersion.exec_op(); each <>Op class contains
+#   enough information for exec_op() to perform the operation without having to reference data outside of
+#   TableVersion
+
+import dataclasses
+from typing import Any, Optional
+
+
+@dataclasses.dataclass
+class CreateStoreTableOp:
+    pass
+
+
+@dataclasses.dataclass
+class LoadViewOp:
+    view_path: dict[str, Any]  # needed to create the view load plan
+
+
+@dataclasses.dataclass
+class DeleteTableMdOp:
+    pass
+
+
+@dataclasses.dataclass
+class DeleteTableMediaFilesOp:
+    pass
+
+
+@dataclasses.dataclass
+class DropStoreTableOp:
+    pass
+
+
+@dataclasses.dataclass
+class TableOp:
+    tbl_id: str  # uuid.UUID
+    op_sn: int  # sequence number within the update operation; [0, num_ops)
+    num_ops: int  # total number of ops forming the update operation
+    needs_xact: bool  # if True, op must be run as part of a transaction
+
+    create_store_table_op: Optional[CreateStoreTableOp] = None
+    load_view_op: Optional[LoadViewOp] = None
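
This new module defines a small op log: each multi-step catalog update is recorded as a fixed-length sequence of TableOp entries, replayed in op_sn order. As a hedged illustration (the helper below is not part of the diff; the module path is inferred from the imports further down), the two-op sequence that view creation builds later in this release could be assembled like this:

    from pixeltable.catalog.tbl_ops import CreateStoreTableOp, LoadViewOp, TableOp

    def pending_view_ops(tbl_id: str, view_path_dict: dict) -> list[TableOp]:
        # step 0 runs outside a transaction (store-table DDL), step 1 inside one (view load)
        return [
            TableOp(tbl_id=tbl_id, op_sn=0, num_ops=2, needs_xact=False,
                    create_store_table_op=CreateStoreTableOp()),
            TableOp(tbl_id=tbl_id, op_sn=1, num_ops=2, needs_xact=True,
                    load_view_op=LoadViewOp(view_path_dict)),
        ]
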
@@ -9,7 +9,6 @@ import pixeltable.exceptions as excs
 import pixeltable.metadata.schema as md_schema
 import pixeltable.type_system as ts
 from pixeltable import catalog, exprs, func
-from pixeltable.env import Env
 from pixeltable.iterators import ComponentIterator
 
 if TYPE_CHECKING:
@@ -19,9 +18,10 @@ if TYPE_CHECKING:
 from .column import Column
 from .globals import _POS_COLUMN_NAME, MediaValidation
 from .table import Table
-from .table_version import TableVersion
+from .table_version import TableVersion, TableVersionMd
 from .table_version_handle import TableVersionHandle
 from .table_version_path import TableVersionPath
+from .tbl_ops import CreateStoreTableOp, LoadViewOp, TableOp
 from .update_status import UpdateStatus
 
 if TYPE_CHECKING:
@@ -45,9 +45,18 @@ class View(Table):
         if not snapshot_only:
             self._tbl_version = tbl_version_path.tbl_version
 
-    @classmethod
-    def _display_name(cls) -> str:
-        return 'view'
+    def _display_name(self) -> str:
+        name: str
+        if self._tbl_version_path.is_snapshot():
+            name = 'snapshot'
+        elif self._tbl_version_path.is_view():
+            name = 'view'
+        else:
+            assert self._tbl_version_path.is_replica()
+            name = 'table'
+        if self._tbl_version_path.is_replica():
+            name = f'{name}-replica'
+        return name
 
     @classmethod
     def select_list_to_additional_columns(cls, select_list: list[tuple[exprs.Expr, Optional[str]]]) -> dict[str, dict]:
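
The display name is now derived per-instance from the table-version path. A standalone restatement of the branch logic above (illustrative only, not code from the package):

    def display_name(is_snapshot: bool, is_view: bool, is_replica: bool) -> str:
        if is_snapshot:
            name = 'snapshot'
        elif is_view:
            name = 'view'
        else:
            assert is_replica  # a replica of a base table
            name = 'table'
        return f'{name}-replica' if is_replica else name

    assert display_name(True, False, False) == 'snapshot'
    assert display_name(False, True, True) == 'view-replica'
    assert display_name(False, False, True) == 'table-replica'
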
@@ -80,7 +89,7 @@ class View(Table):
         media_validation: MediaValidation,
         iterator_cls: Optional[type[ComponentIterator]],
         iterator_args: Optional[dict],
-    ) -> View:
+    ) -> tuple[TableVersionMd, Optional[list[TableOp]]]:
         from pixeltable.plan import SampleClause
 
         # Convert select_list to more additional_columns if present
@@ -167,11 +176,10 @@ class View(Table):
         for col in columns:
             if col.name in iterator_col_names:
                 raise excs.Error(
-                    f'Duplicate name: column {col.name} is already present in the iterator output schema'
+                    f'Duplicate name: column {col.name!r} is already present in the iterator output schema'
                 )
         columns = iterator_cols + columns
 
-        session = Env.get().session
         from pixeltable.exprs import InlineDict
 
         iterator_args_expr: exprs.Expr = InlineDict(iterator_args) if iterator_args is not None else None
@@ -200,54 +208,26 @@ class View(Table):
             iterator_args=iterator_args_expr.as_dict() if iterator_args_expr is not None else None,
         )
 
-        id, tbl_version = TableVersion.create(
-            dir_id,
-            name,
-            columns,
-            num_retained_versions,
-            comment,
-            media_validation=media_validation,
-            # base_path=base_version_path,
-            view_md=view_md,
+        md = TableVersion.create_initial_md(
+            name, columns, num_retained_versions, comment, media_validation=media_validation, view_md=view_md
         )
-        if tbl_version is None:
-            # this is purely a snapshot: we use the base's tbl version path
-            view = cls(id, dir_id, name, base_version_path, snapshot_only=True)
-            _logger.info(f'created snapshot {name}')
+        if md.tbl_md.is_pure_snapshot:
+            # this is purely a snapshot: no store table to create or load
+            return md, None
         else:
-            view = cls(
-                id,
-                dir_id,
-                name,
-                TableVersionPath(
-                    TableVersionHandle(tbl_version.id, tbl_version.effective_version), base=base_version_path
-                ),
-                snapshot_only=False,
-            )
-            _logger.info(f'Created view `{name}`, id={tbl_version.id}')
-
-            from pixeltable.plan import Planner
-
-            try:
-                plan, _ = Planner.create_view_load_plan(view._tbl_version_path)
-                _, row_counts = tbl_version.store_tbl.insert_rows(plan, v_min=tbl_version.version)
-                status = UpdateStatus(row_count_stats=row_counts)
-                tbl_version._write_md_update_status(0, update_status=status)
-
-            except:
-                # we need to remove the orphaned TableVersion instance
-                del catalog.Catalog.get()._tbl_versions[tbl_version.id, tbl_version.effective_version]
-                base_tbl_version = base.tbl_version.get()
-                if tbl_version.effective_version is None and not base_tbl_version.is_snapshot:
-                    # also remove tbl_version from the base
-                    base_tbl_version.mutable_views.remove(TableVersionHandle.create(tbl_version))
-                raise
-            Env.get().console_logger.info(
-                f'Created view `{name}` with {status.num_rows} rows, {status.num_excs} exceptions.'
+            tbl_id = md.tbl_md.tbl_id
+            view_path = TableVersionPath(
+                TableVersionHandle(UUID(tbl_id), effective_version=0 if is_snapshot else None), base=base_version_path
             )
-
-            session.commit()
-            return view
+            ops = [
+                TableOp(
+                    tbl_id=tbl_id, op_sn=0, num_ops=2, needs_xact=False, create_store_table_op=CreateStoreTableOp()
+                ),
+                TableOp(
+                    tbl_id=tbl_id, op_sn=1, num_ops=2, needs_xact=True, load_view_op=LoadViewOp(view_path.as_dict())
+                ),
+            ]
+            return md, ops
 
     @classmethod
     def _verify_column(cls, col: Column) -> None:
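
With this change, `_create` no longer materializes the view itself; it returns the initial metadata plus an optional op log for the caller to execute. A hedged sketch of a driver for that pair (`exec_op` and `begin_xact` stand in for catalog machinery that is not part of this diff):

    from typing import Callable, Optional

    def apply_table_ops(ops: Optional[list['TableOp']], exec_op: Callable, begin_xact: Callable) -> None:
        if ops is None:
            return  # pure snapshot: nothing to create or load
        for op in ops:  # entries arrive ordered by op_sn
            if op.needs_xact:
                with begin_xact(tbl_id=op.tbl_id, for_write=True):
                    exec_op(op)
            else:
                exec_op(op)
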
@@ -280,8 +260,11 @@ class View(Table):
         md['is_view'] = True
         md['is_snapshot'] = self._tbl_version_path.is_snapshot()
         base_tbl = self._get_base_table()
-        base_version = self._effective_base_versions[0]
-        md['base'] = base_tbl._path() if base_version is None else f'{base_tbl._path()}:{base_version}'
+        if base_tbl is None:
+            md['base'] = None
+        else:
+            base_version = self._effective_base_versions[0]
+            md['base'] = base_tbl._path() if base_version is None else f'{base_tbl._path()}:{base_version}'
         return md
 
     def insert(
@@ -295,16 +278,21 @@ class View(Table):
         print_stats: bool = False,
         **kwargs: Any,
     ) -> UpdateStatus:
-        raise excs.Error(f'{self._display_name()} {self._name!r}: cannot insert into view')
+        raise excs.Error(f'{self._display_str()}: Cannot insert into a {self._display_name()}.')
 
     def delete(self, where: Optional[exprs.Expr] = None) -> UpdateStatus:
-        raise excs.Error(f'{self._display_name()} {self._name!r}: cannot delete from view')
+        raise excs.Error(f'{self._display_str()}: Cannot delete from a {self._display_name()}.')
 
     def _get_base_table(self) -> Optional['Table']:
+        if self._tbl_version_path.base is None and not self._snapshot_only:
+            return None  # this can happen for a replica of a base table
         # if this is a pure snapshot, our tbl_version_path only reflects the base (there is no TableVersion instance
         # for the snapshot itself)
+        from pixeltable.catalog import Catalog
+
         base_id = self._tbl_version_path.tbl_id if self._snapshot_only else self._tbl_version_path.base.tbl_id
-        return catalog.Catalog.get().get_table_by_id(base_id)
+        with Catalog.get().begin_xact(tbl_id=base_id, for_write=False):
+            return catalog.Catalog.get().get_table_by_id(base_id)
 
     @property
     def _effective_base_versions(self) -> list[Optional[int]]:
@@ -315,8 +303,7 @@ class View(Table):
         return effective_versions[1:]
 
     def _table_descriptor(self) -> str:
-        display_name = 'Snapshot' if self._snapshot_only else 'View'
-        result = [f'{display_name} {self._path()!r}']
+        result = [self._display_str()]
         bases_descrs: list[str] = []
         for base, effective_version in zip(self._get_base_tables(), self._effective_base_versions):
             if effective_version is None:
pixeltable/dataframe.py CHANGED
@@ -1185,7 +1185,7 @@ class DataFrame:
         """
         self._validate_mutable('delete', False)
         if not self._first_tbl.is_insertable():
-            raise excs.Error('Cannot delete from view')
+            raise excs.Error('Cannot use `delete` on a view.')
         with Catalog.get().begin_xact(tbl=self._first_tbl, for_write=True, lock_mutable_tree=True):
             return self._first_tbl.tbl_version.get().delete(where=self.where_clause)
 
@@ -1196,14 +1196,27 @@
             op_name: The name of the operation for which the test is being performed.
             allow_select: If True, allow a select() specification in the Dataframe.
         """
+        self._validate_mutable_op_sequence(op_name, allow_select)
+
+        # TODO: Reconcile these with Table.__check_mutable()
+        assert len(self._from_clause.tbls) == 1
+        if self._first_tbl.is_snapshot():
+            raise excs.Error(f'Cannot use `{op_name}` on a snapshot.')
+        if self._first_tbl.is_replica():
+            raise excs.Error(f'Cannot use `{op_name}` on a replica.')
+
+    def _validate_mutable_op_sequence(self, op_name: str, allow_select: bool) -> None:
+        """Tests whether the sequence of operations on this DataFrame is valid for a mutation operation."""
         if self.group_by_clause is not None or self.grouping_tbl is not None:
-            raise excs.Error(f'Cannot use `{op_name}` after `group_by`')
+            raise excs.Error(f'Cannot use `{op_name}` after `group_by`.')
         if self.order_by_clause is not None:
-            raise excs.Error(f'Cannot use `{op_name}` after `order_by`')
+            raise excs.Error(f'Cannot use `{op_name}` after `order_by`.')
         if self.select_list is not None and not allow_select:
-            raise excs.Error(f'Cannot use `{op_name}` after `select`')
+            raise excs.Error(f'Cannot use `{op_name}` after `select`.')
         if self.limit_val is not None:
-            raise excs.Error(f'Cannot use `{op_name}` after `limit`')
+            raise excs.Error(f'Cannot use `{op_name}` after `limit`.')
+        if self._has_joins():
+            raise excs.Error(f'Cannot use `{op_name}` after `join`.')
 
     def as_dict(self) -> dict[str, Any]:
         """
pixeltable/env.py CHANGED
@@ -20,7 +20,7 @@ from contextlib import contextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
 from sys import stdout
-from typing import TYPE_CHECKING, Any, Callable, Iterator, Optional, TypeVar
+from typing import TYPE_CHECKING, Any, Callable, Iterator, Literal, Optional, TypeVar
 from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
 
 import nest_asyncio  # type: ignore[import-untyped]
@@ -86,6 +86,7 @@ class Env:
     _resource_pool_info: dict[str, Any]
     _current_conn: Optional[sql.Connection]
     _current_session: Optional[sql.orm.Session]
+    _current_isolation_level: Optional[Literal['REPEATABLE_READ', 'SERIALIZABLE']]
     _dbms: Optional[Dbms]
     _event_loop: Optional[asyncio.AbstractEventLoop]  # event loop for ExecNode
 
@@ -99,6 +100,7 @@
     def _init_env(cls, reinit_db: bool = False) -> None:
         assert not cls.__initializing, 'Circular env initialization detected.'
         cls.__initializing = True
+        cls._instance = None
         env = Env()
         env._set_up(reinit_db=reinit_db)
         env._upgrade_metadata()
@@ -142,6 +144,7 @@ class Env:
         self._resource_pool_info = {}
         self._current_conn = None
         self._current_session = None
+        self._current_isolation_level = None
         self._dbms = None
         self._event_loop = None
 
@@ -230,20 +233,34 @@
         return self._db_server is not None
 
     @contextmanager
-    def begin_xact(self) -> Iterator[sql.Connection]:
-        """Call Catalog.begin_xact() instead, unless there is a specific reason to call this directly."""
+    def begin_xact(self, for_write: bool = False) -> Iterator[sql.Connection]:
+        """
+        Call Catalog.begin_xact() instead, unless there is a specific reason to call this directly.
+
+        for_write: if True, uses serializable isolation; if False, uses repeatable_read
+
+        TODO: repeatable read is not available in Cockroachdb; instead, run queries against a snapshot TVP
+        that avoids tripping over any pending ops
+        """
         if self._current_conn is None:
             assert self._current_session is None
             try:
-                with self.engine.begin() as conn, sql.orm.Session(conn) as session:
+                self._current_isolation_level = 'SERIALIZABLE' if for_write else 'REPEATABLE_READ'
+                with (
+                    self.engine.connect().execution_options(isolation_level=self._current_isolation_level) as conn,
+                    sql.orm.Session(conn) as session,
+                    conn.begin(),
+                ):
                     self._current_conn = conn
                     self._current_session = session
                     yield conn
             finally:
                 self._current_session = None
                 self._current_conn = None
+                self._current_isolation_level = None
         else:
             assert self._current_session is not None
+            assert for_write == (self._current_isolation_level == 'serializable')
             yield self._current_conn
 
     def configure_logging(
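
The isolation level is now chosen per top-level transaction and recorded, so nested calls can reuse the open connection. A minimal sketch of the nesting contract, assuming a configured Env (the docstring recommends going through Catalog.begin_xact() in real code):

    from pixeltable.env import Env

    env = Env.get()
    with env.begin_xact(for_write=False) as conn:       # opens a REPEATABLE_READ transaction
        with env.begin_xact(for_write=False) as conn2:  # nested call: reuses the same connection
            assert conn2 is conn
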
@@ -90,7 +90,9 @@ class DataRowBatch:
         idx_range = slice(0, len(self.rows))
         for row in self.rows[idx_range]:
             for info in stored_img_info:
-                filepath = str(MediaStore.prepare_media_path(self.tbl.id, info.col.id, self.tbl.get().version))
+                col = info.col
+                assert col.tbl.id == self.tbl.id
+                filepath = str(MediaStore.prepare_media_path(col.tbl.id, col.id, col.tbl.version))
                 row.flush_img(info.slot_idx, filepath)
             for slot_idx in flushed_slot_idxs:
                 row.flush_img(slot_idx)
@@ -63,13 +63,12 @@ class InMemoryDataNode(ExecNode):
         for col_name, val in input_row.items():
             col_info = user_cols_by_name.get(col_name)
             assert col_info is not None
-
-            if col_info.col.col_type.is_image_type() and isinstance(val, bytes):
-                # this is a literal image, ie, a sequence of bytes; we save this as a media file and store the path
-                path = str(MediaStore.prepare_media_path(self.tbl.id, col_info.col.id, self.tbl.get().version))
-                with open(path, 'wb') as fp:
-                    fp.write(val)
-                self.output_rows[row_idx][col_info.slot_idx] = path
+            col = col_info.col
+            if col.col_type.is_image_type() and isinstance(val, bytes):
+                # this is a literal media file, ie, a sequence of bytes; save it as a binary file and store the path
+                assert col.tbl.id == self.tbl.id
+                path = MediaStore.save_media_file(val, col.tbl.id, col.id, col.tbl.version)
+                self.output_rows[row_idx][col_info.slot_idx] = str(path)
             else:
                 self.output_rows[row_idx][col_info.slot_idx] = val
 
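
The open()/write() sequence is replaced by a single MediaStore helper. Its real implementation is not part of this diff; a sketch of what such a helper might look like, with the destination-directory logic simplified:

    import uuid
    from pathlib import Path

    def save_media_file(data: bytes, media_dir: Path, ext: str = '.bin') -> Path:
        # name the file uniquely and write the bytes; the real helper derives the
        # directory from (tbl_id, col_id, version) instead of taking it directly
        path = media_dir / f'{uuid.uuid4().hex}{ext}'
        path.write_bytes(data)
        return path
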
@@ -325,7 +325,8 @@ class ColumnRef(Expr):
     @classmethod
     def get_column(cls, d: dict) -> catalog.Column:
         tbl_id, version, col_id = UUID(d['tbl_id']), d['tbl_version'], d['col_id']
-        tbl_version = catalog.Catalog.get().get_tbl_version(tbl_id, version)
+        # validate_initialized=False: this gets called as part of TableVersion.init()
+        tbl_version = catalog.Catalog.get().get_tbl_version(tbl_id, version, validate_initialized=False)
         # don't use tbl_version.cols_by_id here, this might be a snapshot reference to a column that was then dropped
         col = next(col for col in tbl_version.cols if col.id == col_id)
         return col
@@ -7,7 +7,6 @@ the [Working with Gemini](https://pixeltable.readme.io/docs/working-with-gemini)
 
 import asyncio
 import io
-import tempfile
 from pathlib import Path
 from typing import TYPE_CHECKING, Optional
 
@@ -215,9 +214,10 @@ async def generate_videos(
     video_bytes = await _genai_client().aio.files.download(file=video.video)  # type: ignore[arg-type]
     assert video_bytes is not None
 
-    _, output_filename = tempfile.mkstemp(suffix='.mp4', dir=str(env.Env.get().tmp_dir))
-    Path(output_filename).write_bytes(video_bytes)
-    return output_filename
+    # Create a temporary file to store the video bytes
+    output_path = env.Env.get().create_tmp_path('.mp4')
+    Path(output_path).write_bytes(video_bytes)
+    return str(output_path)
 
 
 @generate_videos.resource_pool
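
This is part of a recurring refactor in this release: scattered tempfile.mkstemp()/uuid temp-file naming is funneled through one Env helper. A sketch of what create_tmp_path plausibly does (an assumption; only its call sites appear in this diff):

    import uuid
    from pathlib import Path

    def create_tmp_path(tmp_dir: Path, extension: str = '') -> Path:
        # a unique path under the env's tmp dir; files there can be cleaned up centrally
        return tmp_dir / f'{uuid.uuid4().hex}{extension}'
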
@@ -13,7 +13,6 @@ import logging
 import math
 import pathlib
 import re
-import uuid
 from typing import TYPE_CHECKING, Any, Callable, Optional, Type
 
 import httpx
@@ -207,7 +206,7 @@ async def speech(input: str, *, model: str, voice: str, model_kwargs: Optional[d
 
     content = await _openai_client().audio.speech.create(input=input, model=model, voice=voice, **model_kwargs)
     ext = model_kwargs.get('response_format', 'mp3')
-    output_filename = str(env.Env.get().tmp_dir / f'{uuid.uuid4()}.{ext}')
+    output_filename = str(env.Env.get().create_tmp_path(f'.{ext}'))
     content.write_to_file(output_filename)
     return output_filename
 
@@ -2,9 +2,6 @@
 Pixeltable [UDFs](https://pixeltable.readme.io/docs/user-defined-functions-udfs) for `VideoType`.
 """
 
-import tempfile
-import uuid
-from pathlib import Path
 from typing import Any, Optional
 
 import av
@@ -59,8 +56,7 @@ class make_video(pxt.Aggregator):
         if frame is None:
             return
         if self.container is None:
-            (_, output_filename) = tempfile.mkstemp(suffix='.mp4', dir=str(env.Env.get().tmp_dir))
-            self.out_file = Path(output_filename)
+            self.out_file = env.Env.get().create_tmp_path('.mp4')
             self.container = av.open(str(self.out_file), mode='w')
             self.stream = self.container.add_stream('h264', rate=self.fps)
             self.stream.pix_fmt = 'yuv420p'
@@ -109,7 +105,7 @@ def extract_audio(
         return None
     audio_stream = container.streams.audio[stream_idx]
     # create this in our tmp directory, so it'll get cleaned up if it's being generated as part of a query
-    output_filename = str(env.Env.get().tmp_dir / f'{uuid.uuid4()}.{ext}')
+    output_filename = str(env.Env.get().create_tmp_path(f'.{ext}'))
 
     with av.open(output_filename, 'w', format=format) as output_container:
         output_stream = output_container.add_stream(codec or default_codec)
pixeltable/globals.py CHANGED
@@ -8,7 +8,7 @@ from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, Optional, Un
 import pandas as pd
 from pandas.io.formats.style import Styler
 
-from pixeltable import DataFrame, catalog, exceptions as excs, exprs, func, share
+from pixeltable import DataFrame, catalog, exceptions as excs, exprs, func, share, type_system as ts
 from pixeltable.catalog import Catalog, TableVersionPath
 from pixeltable.catalog.insertable_table import OnErrorParameter
 from pixeltable.config import Config
@@ -44,7 +44,7 @@ def init(config_overrides: Optional[dict[str, Any]] = None) -> None:
 
 
 def create_table(
-    path_str: str,
+    path: str,
     schema: Optional[dict[str, Any]] = None,
     *,
     source: Optional[TableDataSource] = None,
@@ -58,14 +58,24 @@ def create_table(
     if_exists: Literal['error', 'ignore', 'replace', 'replace_force'] = 'error',
     extra_args: Optional[dict[str, Any]] = None,  # Additional arguments to data source provider
 ) -> catalog.Table:
-    """Create a new base table.
+    """Create a new base table. Exactly one of `schema` or `source` must be provided.
+
+    If a `schema` is provided, then an empty table will be created with the specified schema.
+
+    If a `source` is provided, then Pixeltable will attempt to infer a data source format and table schema from the
+    contents of the specified data, and the data will be imported from the specified source into the new table. The
+    source format and/or schema can be specified directly via the `source_format` and `schema_overrides` parameters.
 
     Args:
-        path_str: Path to the table.
-        schema: A dictionary that maps column names to column types
-        source: A data source from which a table schema can be inferred and data imported
-        source_format: A hint to the format of the source data
-        schema_overrides: If specified, then columns in `schema_overrides` will be given the specified types
+        path: Pixeltable path (qualified name) of the table, such as `'my_table'` or `'my_dir.my_subdir.my_table'`.
+        schema: Schema for the new table, mapping column names to Pixeltable types.
+        source: A data source (file, URL, DataFrame, or list of rows) to import from.
+        source_format: Must be used in conjunction with a `source`.
+            If specified, then the given format will be used to read the source data. (Otherwise,
+            Pixeltable will attempt to infer the format from the source data.)
+        schema_overrides: Must be used in conjunction with a `source`.
+            If specified, then columns in `schema_overrides` will be given the specified types.
+            (Pixeltable will attempt to infer the types of any columns not specified.)
         on_error: Determines the behavior if an error occurs while evaluating a computed column or detecting an
             invalid media file (such as a corrupt image) for one of the inserted rows.
 
@@ -81,14 +91,15 @@ def create_table(
 
             - `'on_read'`: validate media files at query time
             - `'on_write'`: validate media files during insert/update operations
-        if_exists: Directive regarding how to handle if the path already exists.
-            Must be one of the following:
+        if_exists: Determines the behavior if a table already exists at the specified path location.
 
             - `'error'`: raise an error
             - `'ignore'`: do nothing and return the existing table handle
-            - `'replace'`: if the existing table has no views, drop and replace it with a new one
-            - `'replace_force'`: drop the existing table and all its views, and create a new one
-        extra_args: Additional arguments to pass to the source data provider
+            - `'replace'`: if the existing table has no views or snapshots, drop and replace it with a new one;
+              raise an error if the existing table has views or snapshots
+            - `'replace_force'`: drop the existing table and all its views and snapshots, and create a new one
+        extra_args: Must be used in conjunction with a `source`. If specified, then additional arguments will be
+            passed along to the source data provider.
 
     Returns:
         A handle to the newly created table, or to an already existing table at the path when `if_exists='ignore'`.
@@ -114,7 +125,7 @@ def create_table(
     >>> tbl1 = pxt.get_table('orig_table')
     ... tbl2 = pxt.create_table('new_table', tbl1.where(tbl1.col1 < 10).select(tbl1.col2))
 
-    Create a table if does not already exist, otherwise get the existing table:
+    Create a table if it does not already exist, otherwise get the existing table:
 
     >>> tbl = pxt.create_table('my_table', schema={'col1': pxt.Int, 'col2': pxt.String}, if_exists='ignore')
 
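
A usage example matching the revised docstring (the file name is made up):

    import pixeltable as pxt

    t = pxt.create_table(
        'films',
        source='films.csv',                      # format inferred from the extension
        schema_overrides={'rating': pxt.Float},  # pin this column's type; others are inferred
        if_exists='ignore',
    )
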
@@ -130,12 +141,12 @@ def create_table(
     from pixeltable.io.utils import normalize_primary_key_parameter
 
     if (schema is None) == (source is None):
-        raise excs.Error('Must provide either a `schema` or a `source`')
+        raise excs.Error('Either a `schema` or a `source` must be provided (but not both)')
 
     if schema is not None and (len(schema) == 0 or not isinstance(schema, dict)):
         raise excs.Error('`schema` must be a non-empty dictionary')
 
-    path_obj = catalog.Path(path_str)
+    path_obj = catalog.Path(path)
     if_exists_ = catalog.IfExistsParam.validated(if_exists, 'if_exists')
     media_validation_ = catalog.MediaValidation.validated(media_validation, 'media_validation')
     primary_key: Optional[list[str]] = normalize_primary_key_parameter(primary_key)
@@ -146,7 +157,14 @@ def create_table(
         tds = UnkTableDataConduit(source, source_format=source_format, extra_fields=extra_args)
         tds.check_source_format()
         data_source = tds.specialize()
-        data_source.src_schema_overrides = schema_overrides
+        src_schema_overrides: dict[str, ts.ColumnType] = {}
+        if schema_overrides is not None:
+            for col_name, py_type in schema_overrides.items():
+                col_type = ts.ColumnType.normalize_type(py_type, nullable_default=True, allow_builtin_types=False)
+                if col_type is None:
+                    raise excs.Error(f'Invalid type for column {col_name!r} in `schema_overrides`: {py_type}')
+                src_schema_overrides[col_name] = col_type
+        data_source.src_schema_overrides = src_schema_overrides
         data_source.src_pk = primary_key
         data_source.infer_schema()
         schema = data_source.pxt_schema
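
With this change, `schema_overrides` values are normalized to ts.ColumnType before schema inference runs, which is what the new asserts in pixeltable/io further down rely on. An illustrative check (assuming normalize_type returns None for types it rejects, as the error branch above implies):

    import pixeltable as pxt
    import pixeltable.type_system as ts

    col_type = ts.ColumnType.normalize_type(pxt.Float, nullable_default=True, allow_builtin_types=False)
    assert isinstance(col_type, ts.ColumnType)
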
@@ -255,9 +273,7 @@ def create_view(
         tbl_version_path = base._tbl_version_path
         sample_clause = None
     elif isinstance(base, DataFrame):
-        base._validate_mutable('create_view', allow_select=True)
-        if len(base._from_clause.tbls) > 1:
-            raise excs.Error('Cannot create a view of a join')
+        base._validate_mutable_op_sequence('create_view', allow_select=True)
         tbl_version_path = base._from_clause.tbls[0]
         where = base.where_clause
         sample_clause = base.sample_clause
@@ -537,9 +553,12 @@ def list_tables(dir_path: str = '', recursive: bool = True) -> list[str]:
 
     >>> pxt.list_tables('dir1')
     """
-    path_obj = catalog.Path(dir_path, empty_is_valid=True)  # validate format
-    cat = Catalog.get()
-    contents = cat.get_dir_contents(path_obj, recursive=recursive)
+    return _list_tables(dir_path, recursive=recursive, allow_system_paths=False)
+
+
+def _list_tables(dir_path: str = '', recursive: bool = True, allow_system_paths: bool = False) -> list[str]:
+    path_obj = catalog.Path(dir_path, empty_is_valid=True, allow_system_paths=allow_system_paths)
+    contents = Catalog.get().get_dir_contents(path_obj, recursive=recursive)
     return [str(p) for p in _extract_paths(contents, parent=path_obj, entry_type=catalog.Table)]
 
 
@@ -647,13 +666,16 @@ def ls(path: str = '') -> pd.DataFrame:
     To get a programmatic list of tables and/or directories, use [list_tables()][pixeltable.list_tables] and/or
     [list_dirs()][pixeltable.list_dirs] instead.
     """
+    from pixeltable.catalog import retry_loop
     from pixeltable.metadata import schema
 
     cat = Catalog.get()
     path_obj = catalog.Path(path, empty_is_valid=True)
     dir_entries = cat.get_dir_contents(path_obj)
-    rows: list[list[str]] = []
-    with Catalog.get().begin_xact():
+
+    @retry_loop(for_write=False)
+    def op() -> list[list[str]]:
+        rows: list[list[str]] = []
         for name, entry in dir_entries.items():
             if name.startswith('_'):
                 continue
@@ -679,6 +701,9 @@ def ls(path: str = '') -> pd.DataFrame:
             if md['is_replica']:
                 kind = f'{kind}-replica'
             rows.append([name, kind, version, base])
+        return rows
+
+    rows = op()
 
     rows = sorted(rows, key=lambda x: x[0])
     df = pd.DataFrame(
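
ls() now wraps its catalog reads in a retry loop instead of a bare transaction. A hedged sketch of the pattern (assuming retry_loop re-invokes the decorated callable on serialization conflicts, so the body must be idempotent; path_obj as in ls() above):

    from pixeltable.catalog import Catalog, retry_loop

    @retry_loop(for_write=False)
    def read_op() -> int:
        # executed inside a read transaction; may run more than once
        return len(Catalog.get().get_dir_contents(path_obj))

    result = read_op()
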
pixeltable/io/datarows.py CHANGED
@@ -8,7 +8,7 @@ from pixeltable import exceptions as excs
 
 
 def _infer_schema_from_rows(
-    rows: Iterable[dict[str, Any]], schema_overrides: dict[str, Any], primary_key: list[str]
+    rows: Iterable[dict[str, Any]], schema_overrides: dict[str, ts.ColumnType], primary_key: list[str]
 ) -> dict[str, ts.ColumnType]:
     schema: dict[str, ts.ColumnType] = {}
     cols_with_nones: set[str] = set()
@@ -20,6 +20,7 @@ def _infer_schema_from_rows(
         # in which the column names are encountered in the input data, even if `schema_overrides`
         # is specified.
         if col_name not in schema:
+            assert isinstance(schema_overrides[col_name], ts.ColumnType)
             schema[col_name] = schema_overrides[col_name]
         elif value is not None:
             # If `key` is not in `schema_overrides`, then we infer its type from the data.
pixeltable/io/pandas.py CHANGED
@@ -132,6 +132,7 @@ def df_infer_schema(
     pd_schema: dict[str, ts.ColumnType] = {}
     for pd_name, pd_dtype in zip(df.columns, df.dtypes):
         if pd_name in schema_overrides:
+            assert isinstance(schema_overrides[pd_name], ts.ColumnType)
            pxt_type = schema_overrides[pd_name]
        else:
            pxt_type = __pd_coltype_to_pxt_type(pd_dtype, df[pd_name], pd_name not in primary_key)